In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')


In [3]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


## Beginning of HW

In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [5]:
features = ['make', 'model', 'year', 'engine_hp','engine_cylinders', 'transmission_type', 'vehicle_style','highway_mpg', 'city_mpg']

In [6]:
string = list(df.dtypes[df.dtypes == 'object'].index)


In [7]:
for col in string:
    df[col]= df[col].str.lower().str.replace(" ", "_")


All data are a little tuned

In [8]:
df[features].dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [67]:
df[features].isnull().sum()
df[features] = df[features].fillna(0)

In [69]:
df[features].isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [70]:
df.rename(columns={'msrp': 'price'}, inplace=True)


In [71]:
select_df = df[features]

## Question 1 
What is the most frequent observation (mode) for the column transmission_type?



In [72]:
select_df['transmission_type'].mode()

0    automatic
Name: transmission_type, dtype: object

### Answer 1: automatic

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [73]:
select_df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [74]:
select_df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [75]:
numeric_part = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [76]:
select_df[numeric_part].corrwith(df['price'])

year                0.227590
engine_hp           0.650095
engine_cylinders    0.526274
highway_mpg        -0.160043
city_mpg           -0.157676
dtype: float64

## Answers 2 : engine_hp and engine_cylinders

In [77]:
df['price'].mean()

40594.737032063116

In [78]:
above_average1 = df['price'] > df['price'].mean()

In [79]:
above_average1

0         True
1         True
2        False
3        False
4        False
         ...  
11909     True
11910     True
11911     True
11912     True
11913    False
Name: price, Length: 11914, dtype: bool

In [80]:
select_df['above_average'] = above_average1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  select_df['above_average'] = above_average1


data_dict = select_df.to_dict(orient='records')

## Question 3


In [82]:
from sklearn.model_selection import train_test_split

In [83]:
df_train, df_test = train_test_split(select_df, test_size= 0.4, random_state=42)

In [84]:
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [85]:
len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

In [86]:
y_train = df_train['above_average']
y_val = df_val['above_average']
y_test = df_test['above_average']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [87]:
categoric_part = ['make', 'model', 'transmission_type', 'vehicle_style']

In [89]:
from sklearn.metrics import mutual_info_score

In [98]:
def calculate_mi (series):
    return np.round(mutual_info_score(series,y_train),2)


In [99]:
df_mi = df_train[categoric_part].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name= 'MI')
df_mi

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


## Answer 3 = Transmission 0.02