In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')


In [3]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


## Beginning of HW

In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [5]:
features = ['make', 'model', 'year', 'engine_hp','engine_cylinders', 'transmission_type', 'vehicle_style','highway_mpg', 'city_mpg']

In [6]:
string = list(df.dtypes[df.dtypes == 'object'].index)


In [7]:
for col in string:
    df[col]= df[col].str.lower().str.replace(" ", "_")


All data are a little tuned

In [8]:
df[features].dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [9]:
df[features].isnull().sum()
df[features] = df[features].fillna(0)

In [10]:
df[features].isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [11]:
df.rename(columns={'msrp': 'price'}, inplace=True)


In [12]:
select_df = df[features]

## Question 1 
What is the most frequent observation (mode) for the column transmission_type?



In [13]:
select_df['transmission_type'].mode()

0    automatic
Name: transmission_type, dtype: object

### Answer 1: automatic

## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [14]:
select_df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [15]:
select_df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [16]:
numeric_part = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [17]:
select_df[numeric_part].corrwith(df['price'])

year                0.227590
engine_hp           0.650095
engine_cylinders    0.526274
highway_mpg        -0.160043
city_mpg           -0.157676
dtype: float64

## Answers 2 : engine_hp and engine_cylinders

In [18]:
df['price'].mean()

40594.737032063116

In [19]:
above_average1 = df['price'] > df['price'].mean()

In [20]:
above_average1

0         True
1         True
2        False
3        False
4        False
         ...  
11909     True
11910     True
11911     True
11912     True
11913    False
Name: price, Length: 11914, dtype: bool

In [21]:
select_df['above_average'] = above_average1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  select_df['above_average'] = above_average1


## Question 3

Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the lowest mutual information score?


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
df_train, df_test = train_test_split(select_df, test_size= 0.4, random_state=42)

In [24]:
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [25]:
len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

In [26]:
y_train = df_train['above_average']
y_val = df_val['above_average']
y_test = df_test['above_average']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [27]:
categoric_part = ['make', 'model', 'transmission_type', 'vehicle_style']

In [28]:
from sklearn.metrics import mutual_info_score

In [29]:
def calculate_mi (series):
    return np.round(mutual_info_score(series,y_train),2)


In [30]:
df_mi = df_train[categoric_part].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name= 'MI')
df_mi

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


## Answer 3 = Transmission

## Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?

In [31]:
train_dict = df_train.to_dict(orient='records')

In [32]:
from sklearn.feature_extraction import DictVectorizer

In [33]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [34]:
X_train = dv.transform(train_dict)

In [35]:
X_train[0]

array([1.500e+01, 6.000e+00, 2.610e+02, 2.100e+01, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 

In [36]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=acura', 'make=alfa_romeo', 'make=aston_martin', 'make=audi',
       'make=bentley', 'make=bmw', 'make=bugatti', 'make=buick',
       'make=cadillac', 'make=chevrolet', 'make=chrysler', 'make=dodge',
       'make=ferrari', 'make=fiat', 'make=ford', 'make=genesis',
       'make=gmc', 'make=honda', 'make=hummer', 'make=hyundai',
       'make=infiniti', 'make=kia', 'make=lamborghini', 'make=land_rover',
       'make=lexus', 'make=lincoln', 'make=lotus', 'make=maserati',
       'make=maybach', 'make=mazda', 'make=mclaren', 'make=mercedes-benz',
       'make=mitsubishi', 'make=nissan', 'make=oldsmobile',
       'make=plymouth', 'make=pontiac', 'make=porsche',
       'make=rolls-royce', 'make=saab', 'make=scion', 'make=spyker',
       'make=subaru', 'make=suzuki', 'make=tesla', 'make=toyota',
       'make=volkswagen', 'make=volvo', 'model=100', 'model=124_spider',
       'model=190-class', 'model=1_series', 'model

In [37]:
import math

In [38]:
def sigmoid (score):
    return 1/ (1 + math.exp(-score))

In [41]:
def lineer_regression(xi):
    result = bias
    for j in range(n):
        result = result + xi[j] * w[j]
    return result

    

In [42]:
def logistic_regression(xi):
    score = bias
    for j in range (n):
        score = score + xi[j] * wi[j]
    prob = sigmoid(score)
    return prob


In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
model = LogisticRegression(solver='liblinear',C= 10,max_iter=1000, random_state=42)

In [46]:
X_train.shape

(7148, 959)

In [47]:
y_train.shape

(7148,)

In [48]:
model.fit(X_train,y_train)

In [49]:
df_val.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [50]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [52]:
y_pred = model.predict_proba(X_val)

In [54]:
y_pred = model.predict_proba(X_val)[:,1]

In [62]:
output1 = y_pred > 0.5

In [64]:
np.round((y_val == output1).mean(),2)

0.94

## Answer 4 = 0.94 ~0.95

## Question 5 

Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

In [65]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'city_mpg': -0.005,
 'engine_cylinders': -0.113,
 'engine_hp': 0.036,
 'highway_mpg': 0.108,
 'make=acura': 0.981,
 'make=alfa_romeo': 0.576,
 'make=aston_martin': 0.248,
 'make=audi': 2.348,
 'make=bentley': 0.133,
 'make=bmw': 1.831,
 'make=bugatti': 0.0,
 'make=buick': -0.417,
 'make=cadillac': 2.034,
 'make=chevrolet': -1.299,
 'make=chrysler': -1.073,
 'make=dodge': -2.874,
 'make=ferrari': 0.303,
 'make=fiat': -0.226,
 'make=ford': -1.243,
 'make=genesis': 0.009,
 'make=gmc': -0.606,
 'make=honda': -1.453,
 'make=hummer': 0.229,
 'make=hyundai': -2.35,
 'make=infiniti': 0.03,
 'make=kia': -1.416,
 'make=lamborghini': 0.004,
 'make=land_rover': 1.921,
 'make=lexus': 1.259,
 'make=lincoln': 1.136,
 'make=lotus': 2.834,
 'make=maserati': 0.399,
 'make=maybach': 0.002,
 'make=mazda': -1.234,
 'make=mclaren': 0.0,
 'make=mercedes-benz': 0.864,
 'make=mitsubishi': -1.086,
 'make=nissan': -0.87,
 'make=oldsmobile': -0.642,
 'make=plymouth': 0.054,
 'make=pontiac': -1.991,
 'make=porsch

In [68]:
dv.get_feature_names_out(), model.coef_[0].round(3).min()

(array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
        'make=acura', 'make=alfa_romeo', 'make=aston_martin', 'make=audi',
        'make=bentley', 'make=bmw', 'make=bugatti', 'make=buick',
        'make=cadillac', 'make=chevrolet', 'make=chrysler', 'make=dodge',
        'make=ferrari', 'make=fiat', 'make=ford', 'make=genesis',
        'make=gmc', 'make=honda', 'make=hummer', 'make=hyundai',
        'make=infiniti', 'make=kia', 'make=lamborghini', 'make=land_rover',
        'make=lexus', 'make=lincoln', 'make=lotus', 'make=maserati',
        'make=maybach', 'make=mazda', 'make=mclaren', 'make=mercedes-benz',
        'make=mitsubishi', 'make=nissan', 'make=oldsmobile',
        'make=plymouth', 'make=pontiac', 'make=porsche',
        'make=rolls-royce', 'make=saab', 'make=scion', 'make=spyker',
        'make=subaru', 'make=suzuki', 'make=tesla', 'make=toyota',
        'make=volkswagen', 'make=volvo', 'model=100', 'model=124_spider',
        'model=190-class', 'model=1_