## Sample, Reset Index and Apply

In [1]:
import pandas as pd

car_sales = pd.read_csv('../car-sales.csv')

## Sample
We use sample to randomize our dataframe (or part of it). It prevents our ML Model finding patterns

In [2]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [3]:
# Using sample in 100% of data
car_sales_totally_shuffled = car_sales.sample(frac=1)
car_sales_totally_shuffled

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
8,Toyota,White,60000,4,"$6,250.00"
1,Honda,Red,87899,4,"$5,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
9,Nissan,White,31600,4,"$9,700.00"
4,Nissan,White,213095,4,"$3,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
3,BMW,Black,11179,5,"$22,000.00"
7,Honda,Blue,54738,4,"$7,000.00"
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"


In [4]:
# Using sample in 20% of data
# OBS: We choose to use functions in a small fraction of our data because if our computer is not that powerful,
# running functions for each row can be really slow when applying our functions in 100% of our dataframe.

car_sales_partially_shuffled = car_sales.sample(frac=0.2)
car_sales_partially_shuffled

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
9,Nissan,White,31600,4,"$9,700.00"
2,Toyota,Blue,32549,3,"$7,000.00"


## Reset Index
If we want to reset our index to the original state

In [5]:
car_sales_totally_shuffled

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
8,Toyota,White,60000,4,"$6,250.00"
1,Honda,Red,87899,4,"$5,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
9,Nissan,White,31600,4,"$9,700.00"
4,Nissan,White,213095,4,"$3,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
3,BMW,Black,11179,5,"$22,000.00"
7,Honda,Blue,54738,4,"$7,000.00"
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"


In [6]:
car_sales_totally_shuffled.reset_index()  # We can use inplace=True parameter to update our variable

Unnamed: 0,index,Make,Colour,Odometer (KM),Doors,Price
0,8,Toyota,White,60000,4,"$6,250.00"
1,1,Honda,Red,87899,4,"$5,000.00"
2,5,Toyota,Green,99213,4,"$4,500.00"
3,9,Nissan,White,31600,4,"$9,700.00"
4,4,Nissan,White,213095,4,"$3,500.00"
5,6,Honda,Blue,45698,4,"$7,500.00"
6,3,BMW,Black,11179,5,"$22,000.00"
7,7,Honda,Blue,54738,4,"$7,000.00"
8,0,Toyota,White,150043,4,"$4,000.00"
9,2,Toyota,Blue,32549,3,"$7,000.00"


In [7]:
car_sales_partially_shuffled.reset_index(inplace=True)
car_sales_partially_shuffled

Unnamed: 0,index,Make,Colour,Odometer (KM),Doors,Price
0,9,Nissan,White,31600,4,"$9,700.00"
1,2,Toyota,Blue,32549,3,"$7,000.00"


## Apply
We can apply custom functions to our dataframe

In [8]:
# Let's say that we want to convert our Odometer KM to miles
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [9]:
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].apply(lambda x: x / 1.6)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,93776.875,4,"$4,000.00"
1,Honda,Red,54936.875,4,"$5,000.00"
2,Toyota,Blue,20343.125,3,"$7,000.00"
3,BMW,Black,6986.875,5,"$22,000.00"
4,Nissan,White,133184.375,4,"$3,500.00"
5,Toyota,Green,62008.125,4,"$4,500.00"
6,Honda,Blue,28561.25,4,"$7,500.00"
7,Honda,Blue,34211.25,4,"$7,000.00"
8,Toyota,White,37500.0,4,"$6,250.00"
9,Nissan,White,19750.0,4,"$9,700.00"


In [10]:
# Now, let's say that we want to have a big profit from our car sales, and
# for that we will double the price

# First, let's convert the Price column from string to integer
car_sales["Price"] = car_sales["Price"].str.replace('[\$\,\.]', '', regex=True).astype(int)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,93776.875,4,400000
1,Honda,Red,54936.875,4,500000
2,Toyota,Blue,20343.125,3,700000
3,BMW,Black,6986.875,5,2200000
4,Nissan,White,133184.375,4,350000
5,Toyota,Green,62008.125,4,450000
6,Honda,Blue,28561.25,4,750000
7,Honda,Blue,34211.25,4,700000
8,Toyota,White,37500.0,4,625000
9,Nissan,White,19750.0,4,970000


In [None]:
# Doubling the Price
car_sales["Price"]