 #### Data Preparation

In [1]:
# Import necessary libraries
## Exploratory Data Analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes = True)

##Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer

from math import sqrt

In [2]:
#Import data
car_data = pd.read_csv("car-price_data.csv")

#Check data size
car_data.shape

(11914, 16)

In [3]:
car_data.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [4]:
car_data.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

#### Features
For the rest of the homework, columns required:
- Make,
- Model,
- Year,
- Engine HP,
- Engine Cylinders,
- Transmission Type,
- Vehicle Style,
- highway MPG,
- city mpg

Target variable = MSRP

In [5]:
features = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 
            'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']

# Create a new DataFrame with required features
data = car_data[features]


In [6]:
data.columns = data.columns.str.replace(' ', '_').str.lower()

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11845 non-null  float64
 4   engine_cylinders   11884 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [8]:
data.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

Fill in the missing values of the selected features with 0.

In [9]:
data = data.copy()

In [10]:
# Fill missing values in 'engine_hp' and 'engine_cylinders' columns with 0
data['engine_hp'].fillna(0, inplace=True)

In [11]:
data['engine_cylinders'].fillna(0, inplace=True)

In [12]:
data.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

Rename MSRP variable to price.

In [13]:
data.rename(columns={'msrp': 'price'}, inplace=True)

In [14]:
data.head().T

Unnamed: 0,0,1,2,3,4
make,BMW,BMW,BMW,BMW,BMW
model,1 Series M,1 Series,1 Series,1 Series,1 Series
year,2011,2011,2011,2011,2011
engine_hp,335.0,300.0,300.0,230.0,230.0
engine_cylinders,6.0,6.0,6.0,6.0,6.0
transmission_type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
vehicle_style,Coupe,Convertible,Coupe,Coupe,Convertible
highway_mpg,26,28,28,28,28
city_mpg,19,19,20,18,18
price,46135,40650,36350,29450,34500


# Question 1
What is the most frequent observation (mode) for the column transmission_type?

In [15]:
data.transmission_type.mode()

0    AUTOMATIC
dtype: object

In [16]:
data.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

# Question 2
- Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

- What are the two features that have the biggest correlation in this dataset?

In [17]:
correlation_matrix = data.corr()

In [18]:
correlation_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [19]:
max_correlation = correlation_matrix.unstack().sort_values(ascending=False).drop_duplicates()
max_correlation.head(2)


year      year           1.000000
city_mpg  highway_mpg    0.886829
dtype: float64

#### Make price binary
- Now we need to turn the price variable from numeric into a binary format.
- Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [20]:
mean_price = data.price.mean()
mean_price

40594.737032063116

In [21]:
data['above_average'] = (data['price'] > mean_price).astype(int)

In [22]:
data['above_average']

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int32

#### Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (price) is not in your dataframe.

In [23]:
X = data.drop(columns=['price', 'above_average'])
y = data['above_average']

In [24]:
X.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18


In [25]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: above_average, dtype: int32

In [26]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Question 3
- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).
- Which of these variables has the lowest mutual information score?

In [27]:
categorical_variables = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(categorical_variables)

['make', 'model', 'transmission_type', 'vehicle_style']


In [28]:
def calculate_mi(series):
    return round(mutual_info_score(series, y_train), 2)

df_mi = X_train[categorical_variables].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

display(df_mi.head())


Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


# Question 4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [29]:
categorical_variables

['make', 'model', 'transmission_type', 'vehicle_style']

In [30]:
numerical_variables = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
print(numerical_variables)

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']


In [31]:
train_dict = X_train[categorical_variables + numerical_variables].to_dict(orient='records')

In [32]:
train_dict[0]

{'make': 'Nissan',
 'model': 'Frontier',
 'transmission_type': 'AUTOMATIC',
 'vehicle_style': 'Crew Cab Pickup',
 'year': 2015,
 'engine_hp': 261.0,
 'engine_cylinders': 6.0,
 'highway_mpg': 21,
 'city_mpg': 15}

In [33]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [34]:
X_train = dv.transform(train_dict)

In [35]:
X_train.shape

(7148, 959)

#### Logistic Regression

In [36]:
# Initialize and fit the logistic regression model
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [37]:
val_dict = X_val[categorical_variables + numerical_variables].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [38]:
model.predict_proba(X_val)

array([[9.99259252e-01, 7.40747779e-04],
       [6.55526172e-06, 9.99993445e-01],
       [9.99275227e-01, 7.24772696e-04],
       ...,
       [9.97118649e-01, 2.88135105e-03],
       [4.01665983e-02, 9.59833402e-01],
       [9.79247377e-08, 9.99999902e-01]])

In [39]:
# Make predictions on the validation dataset
y_val_pred = model.predict(X_val)
y_val_pred

array([0, 1, 0, ..., 0, 1, 1])

In [40]:
# Calculate the accuracy on the validation dataset and round it to 2 decimal places
accuracy = accuracy_score(y_val, y_val_pred)
rounded_accuracy = round(accuracy, 2)

# Print the rounded accuracy
print("Validation Accuracy:", rounded_accuracy)

Validation Accuracy: 0.95


# Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?

In [41]:
# Train the model with all features
model.fit(X_train, y_train)

In [42]:
# Step 2: Calculate the accuracy with all features
y_pred_all = model.predict(X_val)
accuracy_all_features = accuracy_score(y_val, y_pred_all)
print(f"Accuracy with all features: {accuracy_all_features:.2f}")

Accuracy with all features: 0.95


In [43]:
feature_names = dv.get_feature_names_out()

In [44]:
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

X = data.drop('above_average', axis=1)
y = data['above_average']

# Apply one-hot encoding to categorical columns and concatenate with numeric features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_processed = pd.concat([
    X.drop(categorical_columns, axis=1),
    pd.DataFrame(encoder.fit_transform(X[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))
], axis=1)

# Initialize and fit the Logistic Regression model with RFE
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
rfe = RFE(model, n_features_to_select=1).fit(X_processed, y)

# Get the ranking of each feature
feature_ranking = rfe.ranking_

# Create a dictionary to store feature names and their rankings
feature_rank_dict = dict(zip(X_processed.columns, feature_ranking))

# Find the feature with the smallest ranking (1 indicates most important)
smallest_ranked_feature = min(feature_rank_dict, key=feature_rank_dict.get)
print("The feature with the smallest ranking is:", smallest_ranked_feature)


The feature with the smallest ranking is: year


# Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

In [45]:
data_df = data.copy()

In [46]:
data_df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


In [47]:
data_df.drop('above_average', axis=1, inplace=True)

In [48]:
data_df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [49]:
# Apply a logarithmic transformation to the 'price' column
data_df['price'] = np.log1p(data_df['price'])

X = data_df.drop('price', axis=1)
y = data_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
# define the categorical columns
cat = data_df.select_dtypes(include=['object']).columns

# Create transformers for preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat)
    ])

# Create an empty DataFrame to store RMSE scores
rmse_scores = []

# Initialize the alpha values
alphas = [0, 0.01, 0.1, 1, 10]

# Fit Ridge regression models with different alpha values
for alpha in alphas:
    ridge = Ridge(alpha=alpha, solver='sag', random_state=42)
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', ridge)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append({'Alpha': alpha, 'RMSE': round(rmse, 3)})

# Convert the list of dictionaries to a DataFrame
rmse_scores_df = pd.DataFrame(rmse_scores)
print(rmse_scores_df)

   Alpha   RMSE
0   0.00  0.237
1   0.01  0.234
2   0.10  0.227
3   1.00  0.274
4  10.00  0.529
