## Download and prepare dataset

In [18]:
import pandas as pd
import numpy as np

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-02 18:27:56--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv’


2023-10-02 18:27:57 (21.3 MB/s) - ‘data.csv’ saved [1475504/1475504]



In [3]:
# Reading the downloaded CSV file
data = pd.read_csv('data.csv')

In [4]:
# Selecting only the necessary columns
selected_columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP'
]

data = data[selected_columns]

# Renaming the columns
data.columns = data.columns.str.replace(' ', '_').str.lower()

# Filling missing values with 0
data = data.fillna(0)

# Renaming 'msrp' to 'price'
data = data.rename(columns={'msrp': 'price'})

In [11]:
# Display the first few rows of the data
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


### Question 1

What is the most frequent observation (mode) for the column transmission_type?

In [6]:
# Find the most frequent observation (mode) for the column 'transmission_type'
mode_transmission_type = data['transmission_type'].mode()[0]
print(f"The most frequent observation for the column 'transmission_type' is: {mode_transmission_type}")


The most frequent observation for the column 'transmission_type' is: AUTOMATIC


## Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [9]:
# Calculate the correlation matrix for numerical features
correlation_matrix = data.corr(numeric_only=True)

# Find the two features with the biggest correlation
correlation_matrix_values = correlation_matrix.abs().unstack()
correlation_matrix_values = correlation_matrix_values.sort_values(ascending=False)
correlation_matrix_values = correlation_matrix_values[correlation_matrix_values < 1.0]

# Get the two features with the biggest correlation
highest_corr_features = correlation_matrix_values.idxmax()
highest_corr_features

('highway_mpg', 'city_mpg')

## Make price binary

In [10]:
# Calculate the mean price
mean_price = data['price'].mean()

# Create a binary variable 'above_average'
data['above_average'] = (data['price'] > mean_price).astype(int)


## Split the data

In [12]:
from sklearn.model_selection import train_test_split

# Features and target variable
features = data.drop(['price', 'above_average'], axis=1)
target = data['above_average']

# First, split the data into train and temp (to further split into validation and test sets)
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.4, random_state=42)

# Then split the temp data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


## Question 3
Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.

Round the scores to 2 decimals using round(score, 2).
Which of these variables has the lowest mutual information score?

In [16]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Initialize a dictionary to hold variable names and their respective mutual information scores
mi_scores = {}

# List of categorical variables
categorical_vars = ['make', 'model', 'transmission_type', 'vehicle_style']

# Calculate mutual information scores
for var in categorical_vars:
    le = LabelEncoder()
    le.fit(X_train[var])
    X_train_enc = le.transform(X_train[var])
    mi = mutual_info_classif(X_train_enc.reshape(-1, 1), y_train)
    mi_scores[var] = round(mi[0], 2)

print("Mutual Information Scores:", mi_scores)



Mutual Information Scores: {'make': 0.02, 'model': 0.01, 'transmission_type': 0.02, 'vehicle_style': 0.04}


## Question 4

In [20]:
numerical_vars = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding for categorical variables
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(X_train[categorical_vars])

X_train_ohe = ohe.transform(X_train[categorical_vars])
X_val_ohe = ohe.transform(X_val[categorical_vars])

# Combine numerical and one-hot encoded categorical features
X_train_combined = np.hstack([X_train[numerical_vars], X_train_ohe])
X_val_combined = np.hstack([X_val[numerical_vars], X_val_ohe])

# Create and train the model
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train_combined, y_train)

# Predict and evaluate
y_val_pred = model.predict(X_val_combined)
accuracy = round(accuracy_score(y_val, y_val_pred), 2)

accuracy




0.93

## Question 5

In [22]:
from sklearn.metrics import accuracy_score

# Original model with all features
model_all_features = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_all_features.fit(X_train_combined, y_train)
y_pred_all_features = model_all_features.predict(X_val_combined)
original_accuracy = accuracy_score(y_val, y_pred_all_features)

# Dictionary to store the differences in accuracies
accuracy_diff = {}

# Loop to remove each feature, retrain the model, and calculate the accuracy
for i, feature in enumerate(numerical_vars + categorical_vars):
    # Create new training and validation sets without the current feature
    X_train_reduced = np.delete(X_train_combined, np.s_[i:i+1], axis=1)
    X_val_reduced = np.delete(X_val_combined, np.s_[i:i+1], axis=1)
    
    # Train the model
    model_reduced = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)
    
    # Make predictions and calculate accuracy
    y_pred_reduced = model_reduced.predict(X_val_reduced)
    reduced_accuracy = accuracy_score(y_val, y_pred_reduced)
    
    # Calculate the difference in accuracy
    accuracy_diff[feature] = original_accuracy - reduced_accuracy

# Sort the features by the difference in accuracy
sorted_accuracy_diff = sorted(accuracy_diff.items(), key=lambda x: x[1])

sorted_accuracy_diff, original_accuracy


([('year', -0.018464120856063748),
  ('highway_mpg', -0.018044481745698726),
  ('transmission_type', -0.014687368862777994),
  ('vehicle_style', -0.014687368862777994),
  ('city_mpg', -0.013428451531682706),
  ('make', -0.01258917331095255),
  ('engine_cylinders', -0.010071338648762085),
  ('model', 0.0),
  ('engine_hp', 0.004196391103650887)],
 0.9278220730172052)

In [27]:
# Filter out the feature with zero difference
non_zero_diff = [item for item in sorted_accuracy_diff if item[1] != 0]

# Find the feature with the smallest absolute difference
smallest_diff_feature = min(non_zero_diff, key=lambda x: abs(x[1]))[0]
smallest_diff_feature

'engine_hp'

## Question 6

In [31]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Apply logarithmic transformation to the 'price' variable
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Initialize variables to store RMSE scores
rmse_scores = {}

# List of alpha values to try
alphas = [0, 0.01, 0.1, 1, 10]

# Loop through each alpha value
for alpha in alphas:
    # Train the Ridge regression model
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_combined, y_train_log)
    
    # Make predictions
    y_pred_log = ridge_model.predict(X_val_combined)
    
    # Calculate RMSE
    rmse_score = np.sqrt(mean_squared_error(y_val_log, y_pred_log))
    
    # Store RMSE score
    rmse_scores[alpha] = round(rmse_score, 3)

# Print RMSE scores
print(rmse_scores)


{0: 1373919371228.276, 0.01: 0.15, 0.1: 0.15, 1: 0.151, 10: 0.165}


In [32]:
## 0.01