In [1]:
import pandas as pd
import numpy as np
import seaborn as sns # data visualization
from matplotlib import pyplot as plt
%matplotlib inline

### Get Dataset:

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [3]:
Features = ['Make','Model','Year','Engine HP', 'Engine Cylinders', 'Transmission Type',
           'Vehicle Style', 'highway MPG', 'city mpg','MSRP']

In [4]:
df = df[Features]

In [5]:
df.columns

Index(['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
       'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg',
       'MSRP'],
      dtype='object')

### Data preparation

In [6]:
df.columns = df.columns.str.replace(' ','_').str.lower()

In [7]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [8]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [9]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
msrp                   int64
dtype: object

In [10]:
df['engine_hp'] = df['engine_hp'].fillna(0)
df['engine_cylinders'] = df['engine_hp'].fillna(0)


In [11]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [12]:
df.rename(columns={'msrp':'price'}, inplace =True)
df.columns


Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

## Question 1:
What is the most frequent observation (mode) for the column transmission_type?

In [13]:
df.describe()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
count,11914.0,11914.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,247.941749,247.941749,26.637485,19.733255,40594.74
std,7.57974,110.507669,110.507669,8.863001,8.987798,60109.1
min,1990.0,0.0,0.0,12.0,7.0,2000.0
25%,2007.0,170.0,170.0,22.0,16.0,21000.0
50%,2015.0,225.0,225.0,26.0,18.0,29995.0
75%,2016.0,300.0,300.0,30.0,22.0,42231.25
max,2017.0,1001.0,1001.0,354.0,137.0,2065902.0


In [14]:
df['transmission_type'].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [15]:
from statistics import mode

In [16]:
mode(df['transmission_type'])

'AUTOMATIC'

In [17]:
set(df['transmission_type'])

{'AUTOMATED_MANUAL', 'AUTOMATIC', 'DIRECT_DRIVE', 'MANUAL', 'UNKNOWN'}

## Question 2: 

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [18]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [19]:
df[numerical].describe()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
count,11914.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,247.941749,247.941749,26.637485,19.733255
std,7.57974,110.507669,110.507669,8.863001,8.987798
min,1990.0,0.0,0.0,12.0,7.0
25%,2007.0,170.0,170.0,22.0,16.0
50%,2015.0,225.0,225.0,26.0,18.0
75%,2016.0,300.0,300.0,30.0,22.0
max,2017.0,1001.0,1001.0,354.0,137.0


In [20]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,0.338714,0.25824,0.198171
engine_hp,0.338714,1.0,1.0,-0.415707,-0.424918
engine_cylinders,0.338714,1.0,1.0,-0.415707,-0.424918
highway_mpg,0.25824,-0.415707,-0.415707,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.424918,0.886829,1.0


In [21]:
df[numerical].corr().unstack().sort_values(ascending=False)

year              year                1.000000
engine_hp         engine_hp           1.000000
highway_mpg       highway_mpg         1.000000
engine_cylinders  engine_hp           1.000000
engine_hp         engine_cylinders    1.000000
engine_cylinders  engine_cylinders    1.000000
city_mpg          city_mpg            1.000000
                  highway_mpg         0.886829
highway_mpg       city_mpg            0.886829
year              engine_hp           0.338714
                  engine_cylinders    0.338714
engine_hp         year                0.338714
engine_cylinders  year                0.338714
year              highway_mpg         0.258240
highway_mpg       year                0.258240
year              city_mpg            0.198171
city_mpg          year                0.198171
engine_cylinders  highway_mpg        -0.415707
highway_mpg       engine_hp          -0.415707
                  engine_cylinders   -0.415707
engine_hp         highway_mpg        -0.415707
engine_cylind

highway_mpg       city_mpg

#### Make price binary:
Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [22]:
df['above_average'] = (df['price']>= df['price'].mean()).astype(int)
df['above_average']

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int32

In [23]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price', 'above_average'],
      dtype='object')

In [24]:
df = df.drop(['price'], axis=1)

In [25]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'above_average'],
      dtype='object')

In [20]:
from sklearn.model_selection import train_test_split

In [28]:
df_full_train, df_test = train_test_split(df,test_size=0.2, random_state=42 ) 

In [29]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [30]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [31]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [32]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

### Question 3: 



In [33]:
from sklearn.metrics import mutual_info_score

In [34]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'above_average'],
      dtype='object')

In [35]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [36]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
above_average          int32
dtype: object

In [37]:
def mi_calculation(series):
    return mutual_info_score(series, y_train)

df_mi = df_train[categorical].apply(mi_calculation).round(2)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


## Question 4:
* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

In [17]:
from sklearn.feature_extraction import DictVectorizer

In [76]:
# 1) convert dataframe into a dictionary

train_dict = df_train.to_dict(orient='records')

In [77]:
train_dict[0]

{'make': 'Mitsubishi',
 'model': 'Endeavor',
 'year': 2011,
 'engine_hp': 225.0,
 'engine_cylinders': 225.0,
 'transmission_type': 'AUTOMATIC',
 'vehicle_style': '4dr SUV',
 'highway_mpg': 19,
 'city_mpg': 15}

In [78]:
dv = DictVectorizer(sparse=False)

In [79]:
dv.fit(train_dict)

In [80]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [81]:
X_train = dv.transform(train_dict)

In [82]:
X_train.shape

(7148, 943)

In [83]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)


In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [85]:
LGmodel = LogisticRegression(solver ='liblinear', C=10, max_iter=1000, random_state=42)

In [86]:
LGmodel.fit(X_train,y_train)

In [87]:
y_pred = LGmodel.predict(X_val)

In [88]:
y_pred

array([0, 1, 0, ..., 0, 1, 1])

In [89]:
accuracy = np.round(accuracy_score(y_val, y_pred),2)
accuracy

0.95

## Question 5

* lLet's find the least useful feature using the feature elimination techniqu
* 
Train a model with all these features (using the same parameters asn question 4)
* .
Now exclude each feature from this set and train a model without  t. Record the accuracy for each mo.
* for each ach feature, calculate the difference between the original accuracy and the accuracy without the feat
* re.
Which of following feature has the smallest difference?

In [90]:
features = df_train.columns.to_list()
features

['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [91]:
original_accuracy = accuracy

In [92]:
Scores = pd.DataFrame(columns=['No_featured','Accuracy','Difference'])

In [93]:
for feature in features:
    features_subset = features.copy()
    features_subset.remove(feature)

    train_dict = df_train[features_subset].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    val_dict = df_val[features_subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver ='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    Feature_accuracy = accuracy_score(y_val,y_pred)

    Scores.loc[len(Scores)] = [feature, Feature_accuracy, original_accuracy-Feature_accuracy]
    

In [94]:
Scores

Unnamed: 0,No_featured,Accuracy,Difference
0,make,0.948384,0.001616
1,model,0.923626,0.026374
2,year,0.946706,0.003294
3,engine_hp,0.947125,0.002875
4,engine_cylinders,0.947125,0.002875
5,transmission_type,0.946286,0.003714
6,vehicle_style,0.932438,0.017562
7,highway_mpg,0.945867,0.004133
8,city_mpg,0.947125,0.002875


In [99]:
min_diff = Scores.Difference.min()
Scores[Scores.Difference == min_diff]

Unnamed: 0,No_featured,Accuracy,Difference
0,make,0.948384,0.001616


## Question 6* 
For this question, we'll see how to use a linear regression model from Scikit-Learn
* 
We'll need to use the original column price. Apply the logarithmic transformation to this colum
* .
Fit the Ridge regression model on the training da
  * a:
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these paramet
  * rs:
model = Ridge(alpha=a, solver="sag", random_stat
* =42)
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 
* , 10]
Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [47]:
df.head().T

Unnamed: 0,0,1,2,3,4
make,BMW,BMW,BMW,BMW,BMW
model,1 Series M,1 Series,1 Series,1 Series,1 Series
year,2011,2011,2011,2011,2011
engine_hp,335.0,300.0,300.0,230.0,230.0
engine_cylinders,335.0,300.0,300.0,230.0,230.0
transmission_type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
vehicle_style,Coupe,Convertible,Coupe,Coupe,Convertible
highway_mpg,26,28,28,28,28
city_mpg,19,19,20,18,18
price,46135,40650,36350,29450,34500


In [37]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [48]:
df['price'] = np.log1p(df['price'])

In [49]:
df_full_train, df_test = train_test_split(df,test_size=0.2, random_state=42 ) 

In [50]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [51]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [52]:
df_full_train.head().T

Unnamed: 0,3181,5357,4874,8102,10400
make,Cadillac,Mercedes-Benz,Kia,Dodge,Hyundai
model,CT6,GLS-Class,Forte,RAM 250,Tiburon
year,2016,2017,2016,1993,2008
engine_hp,265.0,449.0,173.0,180.0,172.0
engine_cylinders,265.0,449.0,173.0,180.0,172.0
transmission_type,AUTOMATIC,AUTOMATIC,AUTOMATIC,MANUAL,AUTOMATIC
vehicle_style,Sedan,4dr SUV,Coupe,Regular Cab Pickup,2dr Hatchback
highway_mpg,31,18,34,16,24
city_mpg,22,14,25,11,17
price,10.887362,11.449464,9.898023,7.601402,9.9651


In [53]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [54]:
del df_train['price']
del df_val['price']
del df_test['price']

In [57]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

scores = {}
for a in [0, 0.01, 0.1, 1, 10] :
    model = Ridge(alpha=a, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = mean_squared_error(y_val, y_pred, squared=False)
    scores[a] = round(score, 3)
    print(f'alpha = {a}:\t RMSE = {score}') 



alpha = 0:	 RMSE = 0.5072798695374928




alpha = 0.01:	 RMSE = 0.507279949042125




alpha = 0.1:	 RMSE = 0.507280664577195




alpha = 1:	 RMSE = 0.5072878185316793
alpha = 10:	 RMSE = 0.507359221459397




In [58]:
scores

{0: 0.507, 0.01: 0.507, 0.1: 0.507, 1: 0.507, 10: 0.507}

In [59]:
min(scores)

0