In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
# Read the raw CSV, taking the file's first column as the row index.
car_data = pd.read_csv("toyota.csv", index_col=0)

### `get_dtype_counts()` is no longer available (deprecated in pandas 0.25 and removed in pandas 1.0). `dtypes.value_counts()` returns the number of columns of each data type instead

In [3]:
# Count how many columns have each dtype (stand-in for the old get_dtype_counts()).
car_data.dtypes.value_counts()

int64      4
object     4
float64    2
dtype: int64

### Select columns based on their data types. Several dtypes can be listed inside the square brackets

In [4]:
# Keep every column whose dtype is not object; `include` stays at its default (None).
car_data.select_dtypes(exclude=[object])

Unnamed: 0,Price,Age,MetColor,Automatic,CC,Weight
0,13500,23.0,1.0,0,2000,1165
1,13750,23.0,1.0,0,2000,1165
2,13950,24.0,,0,2000,1165
3,14950,26.0,0.0,0,2000,1165
4,13750,30.0,0.0,0,2000,1170
...,...,...,...,...,...,...
1431,7500,,1.0,0,1300,1025
1432,10845,72.0,0.0,0,1300,1015
1433,8500,,0.0,0,1300,1015
1434,7250,70.0,1.0,0,1300,1015


### `info()` gives a detailed summary of the DataFrame

In [5]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1336 non-null   float64
 2   KM         1436 non-null   object 
 3   FuelType   1336 non-null   object 
 4   HP         1436 non-null   object 
 5   MetColor   1286 non-null   float64
 6   Automatic  1436 non-null   int64  
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   object 
 9   Weight     1436 non-null   int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 123.4+ KB


### unique() elements of column

In [6]:
# List the distinct raw values of HP to see why the column was parsed as object.
print(np.unique(car_data["HP"]))

['107' '110' '116' '192' '69' '71' '72' '73' '86' '90' '97' '98' '????']


### Check each column to see why its values were automatically read as a different data type, e.g. print(np.unique(car_data["HP"]))

### The data contains double (`??`) and quadruple (`????`) question marks as placeholders for missing values. That is the reason such columns are read as object instead of int64

In [7]:
# Re-read the file, telling pandas to treat the "??" and "????" placeholders as NaN.
car_data = pd.read_csv(
    "toyota.csv",
    index_col=0,
    na_values=["??", "????"],
)

In [8]:
# HP is numeric now; in the recorded output each missing entry appears as its own nan.
print(np.unique(car_data["HP"]))

[ 69.  71.  72.  73.  86.  90.  97.  98. 107. 110. 116. 192.  nan  nan
  nan  nan  nan  nan]


In [9]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1336 non-null   float64
 2   KM         1421 non-null   float64
 3   FuelType   1336 non-null   object 
 4   HP         1430 non-null   float64
 5   MetColor   1286 non-null   float64
 6   Automatic  1436 non-null   int64  
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   object 
 9   Weight     1436 non-null   int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 123.4+ KB


### Converting data type using astype()

In [10]:
# Cast the 0/1 indicator columns to object so they are treated as categorical
# labels rather than numeric measurements.
for flag_column in ('MetColor', 'Automatic'):
    car_data[flag_column] = car_data[flag_column].astype('object')

In [11]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1336 non-null   float64
 2   KM         1421 non-null   float64
 3   FuelType   1336 non-null   object 
 4   HP         1430 non-null   float64
 5   MetColor   1286 non-null   object 
 6   Automatic  1436 non-null   object 
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   object 
 9   Weight     1436 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 123.4+ KB


### Changing a column to the category data type reduces its size

In [12]:
# Bytes used by FuelType while it is stored as object dtype.
car_data['FuelType'].nbytes

11488

In [13]:
# Same column converted to category (the result is not assigned back) for a size comparison.
car_data['FuelType'].astype('category').nbytes

1460

### Replace the spelled-out numbers (e.g. the word five) with the corresponding number (5)

In [14]:
# Doors mixes digit strings with spelled-out numbers, which keeps the column as object.
print(np.unique(car_data['Doors']))

['2' '3' '4' '5' 'five' 'four' 'three']


In [15]:
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: with pandas Copy-on-Write the in-place form only changes a
# temporary Series, so car_data itself would keep the word 'three'.
car_data['Doors'] = car_data['Doors'].replace('three', 3)

In [16]:
# Assign the result back rather than using inplace=True on the selected column;
# under pandas Copy-on-Write the chained in-place call would leave car_data unchanged.
car_data['Doors'] = car_data['Doors'].replace('five', 5)

In [17]:
# Assign the result back rather than using inplace=True on the selected column;
# under pandas Copy-on-Write the chained in-place call would leave car_data unchanged.
car_data['Doors'] = car_data['Doors'].replace('four', 4)

In [18]:
# Once the spelled-out values are replaced, every entry can be cast to a 64-bit integer.
car_data['Doors'] = car_data['Doors'].astype(np.int64)

In [19]:
# Bytes used by Doors now that it is stored as int64.
car_data['Doors'].nbytes

11488

## Get count of missing values in each column

In [20]:
# Per-column tally of missing entries (isna is the same check as isnull).
car_data.isna().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

## Remove the nan rows completely. axis=0 removes the row with nan value. inplace=True makes changes in same dataframe

In [21]:
# Drop every row that has a NaN in any column. car_data itself is modified, so the
# dropped rows can only be recovered by re-running the load and cleaning cells.
car_data.dropna(axis=0,inplace=True)

# Frequency Table, Correlations

In [22]:
# One-way frequency table of FuelType (a single column named 'count').
pd.crosstab(index=car_data['FuelType'],columns='count',dropna=True) # dropna=True omits columns whose entries are all NaN

col_0,count
FuelType,Unnamed: 1_level_1
CNG,12
Diesel,116
Petrol,968


### Relation between two categorical variables. `dropna=True` (the default) leaves out columns whose entries are all NaN

In [23]:
# Two-way frequency table: Automatic (rows) against FuelType (columns).
pd.crosstab(
    index=car_data['Automatic'],
    columns=car_data['FuelType'],
    dropna=True,
)

FuelType,CNG,Diesel,Petrol
Automatic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,12,116,910
1,0,0,58


## `read_table` can also be used to read csv files (pass `sep=","`)

In [24]:
# read_table parses delimited text; with a comma separator it reads the CSV too.
# No index_col or na_values are passed here, so the file contents are shown as stored.
datanew = pd.read_table("toyota.csv", sep=",")

In [25]:
# Display the frame (the recorded output elides the middle rows).
datanew

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1431,1431,7500,,20544,Petrol,86,1.0,0,1300,3,1025
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015
1433,1433,8500,,17016,Petrol,86,0.0,0,1300,3,1015
1434,1434,7250,70.0,??,,86,1.0,0,1300,3,1015


## Joint Probability using normalize=True

In [26]:
# Joint probabilities: every cell count is divided by the grand total.
pd.crosstab(
    index=car_data['Automatic'],
    columns=car_data['FuelType'],
    normalize=True,
    dropna=True,
)

FuelType,CNG,Diesel,Petrol
Automatic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.010949,0.105839,0.830292
1,0.0,0.0,0.05292


## Marginal probabilities using margins=True. Adds the row and column totals ("All")

In [27]:
# Joint probabilities plus an 'All' row and column holding the marginal totals.
pd.crosstab(
    index=car_data['Automatic'],
    columns=car_data['FuelType'],
    normalize=True,
    dropna=True,
    margins=True,
)

FuelType,CNG,Diesel,Petrol,All
Automatic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.010949,0.105839,0.830292,0.94708
1,0.0,0.0,0.05292,0.05292
All,0.010949,0.105839,0.883212,1.0


## Conditional Probability

In [28]:
# Normalise within each column so every FuelType column sums to 1,
# i.e. the share of each Automatic value given the fuel type.
pd.crosstab(
    index=car_data['Automatic'],
    columns=car_data['FuelType'],
    normalize='columns',
    dropna=True,
    margins=True,
)

FuelType,CNG,Diesel,Petrol,All
Automatic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.0,1.0,0.940083,0.94708
1,0.0,0.0,0.059917,0.05292


## Correlations between variables

In [29]:
# Keep only the non-object columns; MetColor and Automatic were cast to object
# earlier, so they are left out along with FuelType.
numerical_data = car_data.select_dtypes(exclude=[object])

In [30]:
# Pairwise correlation between the numeric columns (Pearson is the pandas default).
corr_matrix = numerical_data.corr()

In [31]:
# Show the correlation matrix.
corr_matrix

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
Price,1.0,-0.877706,-0.601944,0.334261,0.09988,0.201034,0.532614
Age,-0.877706,1.0,0.525695,-0.162063,-0.084851,-0.170178,-0.442295
KM,-0.601944,0.525695,1.0,-0.368629,0.319733,-0.082495,-0.029703
HP,0.334261,-0.162063,-0.368629,1.0,0.037291,0.059517,0.084527
CC,0.09988,-0.084851,0.319733,0.037291,1.0,0.117831,0.623643
Doors,0.201034,-0.170178,-0.082495,0.059517,0.117831,1.0,0.304455
Weight,0.532614,-0.442295,-0.029703,0.084527,0.623643,0.304455,1.0
