First, you will download a copy of the used cars data set. 

You can download this from here:
[UsedCars.csv](https://databricksdemostore.blob.core.windows.net/data/02.02/UsedCars.csv)

![](https://media.geeksforgeeks.org/wp-content/uploads/finallpandas.png)

![](https://data-flair.training/blogs/wp-content/uploads/sites/2/2018/07/Python-Pandas-Tutorial-01.jpg)

![](https://data-flair.training/blogs/wp-content/uploads/sites/2/2019/04/Python-Pandas-Applications-1200x720.jpg)

In [1]:
import pandas as pd

In [0]:
%fs ls

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/user/,user/,0


In [0]:
%fs ls dbfs:/FileStore/tables/

path,name,size
dbfs:/FileStore/tables/UsedCars.csv,UsedCars.csv,69603
dbfs:/FileStore/tables/hotel_bookings.xls,hotel_bookings.xls,16855599


In [0]:
# Load the uploaded used-cars CSV from DBFS into a Spark DataFrame,
# treating the first row as the header and letting Spark infer column types.
filelocation = "dbfs:/FileStore/tables/UsedCars.csv"

sparkDF = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .load(filelocation)
)

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame for the pandas examples below.
p_df = sparkDF.toPandas()

In [3]:
# Load the used-cars data with pandas directly (non-Spark route).
# NOTE: this rebinds `p_df`, replacing any frame created from Spark earlier.
# The original local copy is tried first; if it is not present on this
# machine, fall back to the download URL given at the top of the notebook.
try:
    p_df = pd.read_csv(r"D:\Training\UsedCars.csv")
except FileNotFoundError:
    p_df = pd.read_csv(
        "https://databricksdemostore.blob.core.windows.net/data/02.02/UsedCars.csv"
    )

# Basic Data Sanity Checks

In [0]:
p_df.head(5)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,7450.0,65.0,82000.0,Petrol,86,1,0,1300,3,1015
1,7250.0,74.0,130025.0,Petrol,110,1,0,1600,3,1050
2,8950.0,80.0,64000.0,Petrol,110,0,0,1600,3,1055
3,11450.0,54.0,62987.0,Petrol,110,0,0,1600,5,1080
4,,42.0,38932.0,Petrol,110,1,0,1600,3,1040


In [0]:
p_df.tail(5)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
1441,8750.0,68.0,101889.0,petrol,110,1,0,1600,5,1075
1442,9950.0,62.0,109547.0,petrol,110,0,0,1600,5,1075
1443,9250.0,80.0,44444.0,Petrol,110,1,0,1600,3,1050
1444,12450.0,54.0,46230.0,Petrol,110,1,0,1600,3,1055
1445,8500.0,77.0,62285.0,Petrol,110,1,0,1600,5,1075


In [0]:
p_df.shape

In [0]:
p_df.info()

In [0]:
p_df.dtypes

# Missing Value Investigation

In [0]:
p_df.isnull()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1441,False,False,False,False,False,False,False,False,False,False
1442,False,False,False,False,False,False,False,False,False,False
1443,False,False,False,False,False,False,False,False,False,False
1444,False,False,False,False,False,False,False,False,False,False


In [0]:
p_df.isnull().sum()

In [0]:
p_df.loc[p_df['Price'].isnull()]

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
4,,42.0,38932.0,Petrol,110,1,0,1600,3,1040
143,,60.0,,petrol,110,1,0,1600,5,1070
507,,64.0,,Diesel,72,1,0,2000,5,1135
514,,,71000.0,petrol,110,0,0,1600,5,1070
572,,,60239.0,Petrol,110,1,0,1600,5,1075
582,,,,Petrol,86,1,0,1300,3,1015
974,,72.0,105856.0,Petrol,86,0,0,1300,3,1015


# How to select a few columns from the DataFrame

In [0]:
p_df[['Price', 'Age']].head(10)

Unnamed: 0,Price,Age
0,7450.0,65.0
1,7250.0,74.0
2,8950.0,80.0
3,11450.0,54.0
4,,42.0
5,6950.0,80.0
6,8250.0,70.0
7,12950.0,44.0
8,9950.0,65.0
9,7900.0,75.0


In [0]:
p_df.iloc[:, 0:2]

Unnamed: 0,Price,Age
0,7450.0,65.0
1,7250.0,74.0
2,8950.0,80.0
3,11450.0,54.0
4,,42.0
...,...,...
1441,8750.0,68.0
1442,9950.0,62.0
1443,9250.0,80.0
1444,12450.0,54.0


In [0]:
# `.iloc` is strictly position-based and rejects column labels;
# label-based selection of columns goes through `.loc`.
p_df.loc[:, ['Price', 'Age']]

# How to select specific rows and a few columns

# This shows just a subset of the DataFrame

In [0]:
p_df.iloc[0:5, 0:2]

Unnamed: 0,Price,Age
0,7450.0,65.0
1,7250.0,74.0
2,8950.0,80.0
3,11450.0,54.0
4,,42.0


# Column Customizations

### Rename columns

In [0]:
# Make the unit explicit in the column name: Age -> Age_in_Months.
p_df = p_df.rename({'Age': 'Age_in_Months'}, axis='columns')

In [0]:
p_df.head()

Unnamed: 0,Price,Age_in_Months,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,7450.0,65.0,82000.0,Petrol,86,1,0,1300,3,1015
1,7250.0,74.0,130025.0,Petrol,110,1,0,1600,3,1050
2,8950.0,80.0,64000.0,Petrol,110,0,0,1600,3,1055
3,11450.0,54.0,62987.0,Petrol,110,0,0,1600,5,1080
4,,42.0,38932.0,Petrol,110,1,0,1600,3,1040


### Dropping a column

In [0]:
# Drop the CC column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps the cell idempotent: re-running it after CC is
# already gone no longer raises a KeyError.
p_df = p_df.drop(columns=['CC'], errors='ignore')

In [0]:
p_df.head(5)

Unnamed: 0,Price,Age_in_Months,KM,FuelType,HP,MetColor,Automatic,Doors,Weight
0,7450.0,65.0,82000.0,Petrol,86,1,0,3,1015
1,7250.0,74.0,130025.0,Petrol,110,1,0,3,1050
2,8950.0,80.0,64000.0,Petrol,110,0,0,3,1055
3,11450.0,54.0,62987.0,Petrol,110,0,0,5,1080
4,,42.0,38932.0,Petrol,110,1,0,3,1040


### Creating a new column from existing columns

In [0]:
# Demonstration: FuelType holds strings while Automatic holds integers, so
# adding the two columns directly fails. The error is caught and displayed
# so the notebook still runs top to bottom; the working version follows below.
try:
    p_df['Fueltype_Automatic'] = p_df['FuelType'] + p_df['Automatic']
except TypeError as err:
    print(f"TypeError: {err}")

In [0]:
p_df.dtypes

In [0]:
p_df['Automatic'].value_counts()

In [0]:
# Cast the integer Automatic flag to text first, then join it to FuelType
# with an underscore (e.g. "Petrol_0").
automatic_as_text = p_df['Automatic'].astype('str')
p_df['Fueltype_Automatic'] = p_df['FuelType'] + '_' + automatic_as_text

In [0]:
p_df.head(10)

Unnamed: 0,Price,Age_in_Months,KM,FuelType,HP,MetColor,Automatic,Doors,Weight,Fueltype_Automatic
0,7450.0,65.0,82000.0,Petrol,86,1,0,3,1015,Petrol_0
1,7250.0,74.0,130025.0,Petrol,110,1,0,3,1050,Petrol_0
2,8950.0,80.0,64000.0,Petrol,110,0,0,3,1055,Petrol_0
3,11450.0,54.0,62987.0,Petrol,110,0,0,5,1080,Petrol_0
4,,42.0,38932.0,Petrol,110,1,0,3,1040,Petrol_0
5,6950.0,80.0,62581.0,Petrol,110,0,0,5,1075,Petrol_0
6,8250.0,70.0,59017.0,petrol,107,1,1,3,1080,petrol_1
7,12950.0,44.0,41499.0,CNG,110,1,0,5,1103,CNG_0
8,9950.0,65.0,65513.0,Petrol,110,1,1,4,1070,Petrol_1
9,7900.0,75.0,125400.0,Petrol,110,0,0,3,1050,Petrol_0


# Eyeballing the Data

![](https://image.shutterstock.com/image-photo/business-analysis-image-magnifying-glass-260nw-484361059.jpg)

In [0]:
p_df['FuelType'].nunique()

In [0]:
# Print every distinct FuelType value on its own line.
for fuel_type in p_df['FuelType'].unique():
    print(fuel_type)

In [0]:
p_df['FuelType'].value_counts()

In [0]:
# Running total of the per-value counts, expressed as a share of all rows.
fuel_type_counts = p_df['FuelType'].value_counts()
fuel_type_counts.cumsum() / len(p_df['FuelType'])

### Replace values in a column

In [0]:
# Fold the lower-case spellings into their capitalised equivalents.
fuel_type_fixes = {'petrol': 'Petrol', 'diesel': 'Diesel'}
p_df['FuelType'] = p_df['FuelType'].replace(fuel_type_fixes)

In [0]:
# Same cumulative-share view as before, recomputed after the replacement.
fuel_col = p_df['FuelType']
fuel_col.value_counts().cumsum() / len(fuel_col)

In [0]:
p_df['Automatic'].value_counts()

### Which FuelType / Automatic combination is most common in the used-cars shop?

In [0]:
# Rebuild the combined column so it picks up the cleaned FuelType values.
p_df = p_df.assign(
    Fueltype_Automatic=p_df['FuelType'] + '_' + p_df['Automatic'].astype('str')
)

In [0]:
# Cumulative share of rows per FuelType/Automatic combination.
combo_counts = p_df['Fueltype_Automatic'].value_counts()
combo_counts.cumsum().div(len(p_df))

# Basic Statistics

### The code below computes summary statistics for the numerical columns

In [0]:
p_df.describe()

Unnamed: 0,Price,Age_in_Months,KM,HP,MetColor,Automatic,Doors,Weight
count,1439.0,1441.0,1440.0,1446.0,1446.0,1446.0,1446.0,1446.0
mean,10728.397498,55.957668,68534.575,101.466113,0.674274,0.055325,4.031812,1072.390733
std,3623.832644,18.57766,37476.024007,14.989312,0.468808,0.228693,0.953063,52.562095
min,4350.0,1.0,1.0,69.0,0.0,0.0,2.0,1000.0
25%,8450.0,44.0,43000.0,87.0,0.0,0.0,3.0,1040.0
50%,9900.0,61.0,63389.5,110.0,1.0,0.0,4.0,1070.0
75%,11950.0,70.0,87020.75,110.0,1.0,0.0,5.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,5.0,1615.0


In [0]:
p_df.describe(include = 'all')

Unnamed: 0,Price,Age_in_Months,KM,FuelType,HP,MetColor,Automatic,Doors,Weight,Fueltype_Automatic
count,1439.0,1441.0,1440.0,1446,1446.0,1446.0,1446.0,1446.0,1446.0,1446
unique,,,,5,,,,,,7
top,,,,Petrol,,,,,,Petrol_0
freq,,,,1272,,,,,,1193
mean,10728.397498,55.957668,68534.575,,101.466113,0.674274,0.055325,4.031812,1072.390733,
std,3623.832644,18.57766,37476.024007,,14.989312,0.468808,0.228693,0.953063,52.562095,
min,4350.0,1.0,1.0,,69.0,0.0,0.0,2.0,1000.0,
25%,8450.0,44.0,43000.0,,87.0,0.0,0.0,3.0,1040.0,
50%,9900.0,61.0,63389.5,,110.0,1.0,0.0,4.0,1070.0,
75%,11950.0,70.0,87020.75,,110.0,1.0,0.0,5.0,1085.0,


In [0]:
# Kilometre statistics for each fuel type.
km_stats = ['min', 'max', 'mean', 'median']
p_df.groupby('FuelType').agg({'KM': km_stats})

Unnamed: 0_level_0,KM,KM,KM,KM
Unnamed: 0_level_1,min,max,mean,median
FuelType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
CNG,41499.0,178858.0,120350.0,115191.0
CompressedNaturalGas,71793.0,207114.0,127173.0,114892.5
Diesel,1.0,243000.0,111977.580645,117000.0
Petrol,1.0,194545.0,62562.735016,60716.0
methane,43000.0,144000.0,102968.25,112436.5


In [0]:
# Min / max / mean of age and price for each fuel type.
range_stats = ['min', 'max', 'mean']
p_df.groupby('FuelType').agg(
    {column: range_stats for column in ['Age_in_Months', 'Price']}
)

Unnamed: 0_level_0,Age_in_Months,Age_in_Months,Age_in_Months,Price,Price,Price
Unnamed: 0_level_1,min,max,mean,min,max,mean
FuelType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
CNG,39.0,80.0,56.666667,7460.0,12950.0,9378.888889
CompressedNaturalGas,44.0,79.0,60.25,5250.0,14950.0,9537.5
Diesel,4.0,80.0,50.847134,4350.0,32500.0,11275.038462
Petrol,1.0,80.0,56.585635,5250.0,24500.0,10678.592417
methane,37.0,64.0,51.75,7950.0,11950.0,9400.0


# Conditional Filters

In [0]:
p_df.loc[p_df['Age_in_Months'] > 60, 'Age_in_Months']

In [0]:
p_df.loc[p_df['Age_in_Months'] > 60]

Unnamed: 0,Price,Age_in_Months,KM,FuelType,HP,MetColor,Automatic,Doors,Weight,Fueltype_Automatic
0,7450.0,65.0,82000.0,Petrol,86,1,0,3,1015,Petrol_0
1,7250.0,74.0,130025.0,Petrol,110,1,0,3,1050,Petrol_0
2,8950.0,80.0,64000.0,Petrol,110,0,0,3,1055,Petrol_0
5,6950.0,80.0,62581.0,Petrol,110,0,0,5,1075,Petrol_0
6,8250.0,70.0,59017.0,Petrol,107,1,1,3,1080,Petrol_1
...,...,...,...,...,...,...,...,...,...,...
1440,7950.0,78.0,90011.0,Petrol,86,0,1,3,1045,Petrol_1
1441,8750.0,68.0,101889.0,Petrol,110,1,0,5,1075,Petrol_0
1442,9950.0,62.0,109547.0,Petrol,110,0,0,5,1075,Petrol_0
1443,9250.0,80.0,44444.0,Petrol,110,1,0,3,1050,Petrol_0


In [0]:
p_df.loc[p_df['Age_in_Months'] > 60, ['Age_in_Months', 'FuelType']]

Unnamed: 0,Age_in_Months,FuelType
0,65.0,Petrol
1,74.0,Petrol
2,80.0,Petrol
5,80.0,Petrol
6,70.0,Petrol
...,...,...
1440,78.0,Petrol
1441,68.0,Petrol
1442,62.0,Petrol
1443,80.0,Petrol


In [0]:
# Fraction of all rows whose age exceeds 60 months.
older_than_60 = p_df['Age_in_Months'] > 60
len(p_df.loc[older_than_60]) / len(p_df)

# Dataframes Concatenation

![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_concat_basic.png)

In [0]:
# Three small frames with identical columns and consecutive, non-overlapping
# index ranges, used to demonstrate row-wise concatenation.
# (The IPython "...:" continuation prompts pasted with the original example
# have been removed so the cell is plain, valid Python.)
df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)

df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    },
    index=[4, 5, 6, 7],
)

df3 = pd.DataFrame(
    {
        "A": ["A8", "A9", "A10", "A11"],
        "B": ["B8", "B9", "B10", "B11"],
        "C": ["C8", "C9", "C10", "C11"],
        "D": ["D8", "D9", "D10", "D11"],
    },
    index=[8, 9, 10, 11],
)

In [0]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [0]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [0]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [0]:
# Stack the three frames row-wise; the pasted "In [n]:" prompts are removed
# so the cell is plain Python.
frames = [df1, df2, df3]

result = pd.concat(frames)

In [0]:
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


# Concatenating using append

In [0]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to append df2 and df3 below df1 and
# produces the same result.
result = pd.concat([df1, df2, df3])

In [0]:
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


# Joins in Pandas

![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key.png)

In [0]:
 left = pd.DataFrame(
   ....:     {
   ....:         "key": ["K0", "K1", "K2", "K3"],
   ....:         "A": ["A0", "A1", "A2", "A3"],
   ....:         "B": ["B0", "B1", "B2", "B3"],
   ....:     }
   ....: )
   ....: 

 right = pd.DataFrame(
   ....:     {
   ....:         "key": ["K0", "K1", "K2", "K3"],
   ....:         "C": ["C0", "C1", "C2", "C3"],
   ....:         "D": ["D0", "D1", "D2", "D3"],
   ....:     }
   ....: )
   ....: 

result = pd.merge(left, right, on="key")

In [0]:
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


# Left Outer Join

![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_left.png)

In [0]:
# At this point `left` and `right` are still the single-"key" frames defined
# above (the key1/key2 frames are only created in the next cell), so the
# left join must use the column that exists; joining on ["key1", "key2"]
# here raises a KeyError on a fresh run.
result = pd.merge(left, right, how="left", on="key")

In [0]:
# Redefine `left` and `right` with a two-column key (key1, key2) whose
# combinations only partly overlap, so the different join types give
# different results.
# (IPython "....:" continuation prompts from the pasted example are removed.)
left = pd.DataFrame(
    {
        "key1": ["K0", "K0", "K1", "K2"],
        "key2": ["K0", "K1", "K0", "K1"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)

right = pd.DataFrame(
    {
        "key1": ["K0", "K1", "K1", "K2"],
        "key2": ["K0", "K0", "K0", "K0"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

In [0]:
# Left join: keep every row of `left`, matching on both key columns.
result = left.merge(right, on=["key1", "key2"], how="left")

In [0]:
left

Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3


In [0]:
right

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [0]:
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


# Right Outer Join
![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_right.png)

In [0]:
# Right join: keep every row of `right`, matching on both key columns.
result = left.merge(right, on=["key1", "key2"], how="right")

In [0]:
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


# Outer Join
![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_outer.png)

In [0]:
# Outer join: keep the rows of both frames, matching on both key columns.
result = left.merge(right, on=["key1", "key2"], how="outer")


In [0]:
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,
5,K2,K0,,,C3,D3


# Inner Join
![](https://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_inner.png)

In [0]:
# Inner join: keep only key combinations present in both frames.
result = left.merge(right, on=["key1", "key2"], how="inner")


In [0]:
result

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


![](https://images.squarespace-cdn.com/content/v1/55b6a6dce4b089e11621d3ed/1566488076547-NG1GQ1X350POR0AD0IQ9/Practice+on+your+Own_Large.png?format=300w)