# Manipulate Data

In [1]:
import pandas as pd

In [2]:
car_sales = pd.read_csv("_data/car-sales.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [3]:
car_sales["Make"] = car_sales["Make"].str.lower()
car_sales["Make"]

0    toyota
1     honda
2    toyota
3       bmw
4    nissan
5    toyota
6     honda
7     honda
8    toyota
9    nissan
Name: Make, dtype: object

In [4]:
car_sales_missing = pd.read_csv("_data/car-sales-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [6]:
car_sales_missing_odometer = car_sales_missing[car_sales_missing["Odometer (KM)"].isnull() == True]
car_sales_missing_odometer.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
7,Honda,White,,4.0,20306.0
37,Nissan,White,,4.0,15131.0
72,Honda,White,,4.0,5648.0
86,Nissan,Blue,,4.0,3300.0
129,Toyota,White,,4.0,35358.0


Sometimes data is missing in our data frame and we have to either fill it or remove those rows to make our research consistent. We can do this with `fillna` and `dropna`

In [7]:
car_sales_missing["Odometer (KM)"].mean()

131253.23789473684

In [8]:
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)
car_sales_missing.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,131253.237895,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [9]:
car_sales_missing.dropna()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
994,BMW,Blue,163322.0,3.0,31666.0
995,Toyota,Black,35820.0,4.0,32042.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [10]:
car_sales_missing.drop("Doors", axis=1)

Unnamed: 0,Make,Colour,Odometer (KM),Price
0,Honda,White,35431.0,15323.0
1,BMW,Blue,192714.0,19943.0
2,Honda,White,84714.0,28343.0
3,Toyota,White,154365.0,13434.0
4,Nissan,Blue,181577.0,14043.0
...,...,...,...,...
995,Toyota,Black,35820.0,32042.0
996,,White,155144.0,5716.0
997,Nissan,Blue,66604.0,31570.0
998,Honda,White,215883.0,4001.0


In [11]:
car_sales_missing["Total Fuel Used (L)"] = pd.Series([40, 50, 35, 25])
car_sales_missing["Total Fuel Used (L)"].fillna(car_sales_missing["Total Fuel Used (L)"].mean(), inplace=True)
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Total Fuel Used (L)
0,Honda,White,35431.0,4.0,15323.0,40.0
1,BMW,Blue,192714.0,5.0,19943.0,50.0
2,Honda,White,84714.0,4.0,28343.0,35.0
3,Toyota,White,154365.0,4.0,13434.0,25.0
4,Nissan,Blue,181577.0,3.0,14043.0,37.5
...,...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0,37.5
996,,White,155144.0,3.0,5716.0,37.5
997,Nissan,Blue,66604.0,4.0,31570.0,37.5
998,Honda,White,215883.0,4.0,4001.0,37.5


When working with ordered it can be necessary or useful to change the order of our data to avoid patterns in our research or in our machine learning models. We can shuffle the rows of our data frame with the sample method.


The `.sample` method takes a `frac` parameter which specifies which fraction of the data to shuffle (1 = 100%). This is especially useful when working with really large data sets.

In [15]:
car_sales_shuffled = car_sales_missing.sample(frac=1)
car_sales_shuffled

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Total Fuel Used (L)
365,Nissan,White,24420.0,4.0,14198.0,37.5
902,BMW,White,84327.0,5.0,38233.0,37.5
470,Toyota,Green,119394.0,4.0,11773.0,37.5
426,Toyota,White,146307.0,4.0,13508.0,37.5
670,,White,138294.0,4.0,11626.0,37.5
...,...,...,...,...,...,...
13,,White,134181.0,4.0,11121.0,37.5
604,Toyota,White,26655.0,4.0,11749.0,37.5
826,Honda,White,30080.0,4.0,18429.0,37.5
512,Nissan,Blue,38233.0,4.0,23483.0,37.5


In [17]:
car_sales_shuffled.reset_index(drop=True)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Total Fuel Used (L)
0,Nissan,White,24420.0,4.0,14198.0,37.5
1,BMW,White,84327.0,5.0,38233.0,37.5
2,Toyota,Green,119394.0,4.0,11773.0,37.5
3,Toyota,White,146307.0,4.0,13508.0,37.5
4,,White,138294.0,4.0,11626.0,37.5
...,...,...,...,...,...,...
995,,White,134181.0,4.0,11121.0,37.5
996,Toyota,White,26655.0,4.0,11749.0,37.5
997,Honda,White,30080.0,4.0,18429.0,37.5
998,Nissan,Blue,38233.0,4.0,23483.0,37.5


We can manipulate columns by using lambdas or other functions with the `.apply` method. In this case we are chaning the Odometer from KM to miles.

In [30]:
car_sales_missing["Odometer (KM)"].apply(lambda distance: distance / 1.6)
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Total Fuel Used (L)
0,Honda,White,35431.0,4.0,15323.0,40.0
1,BMW,Blue,192714.0,5.0,19943.0,50.0
2,Honda,White,84714.0,4.0,28343.0,35.0
3,Toyota,White,154365.0,4.0,13434.0,25.0
4,Nissan,Blue,181577.0,3.0,14043.0,37.5
