# 📝 Problem Statement
Develop a machine learning model that can **predict the brand of a car** based on its characteristics (features such as fuel economy, number of cylinders, engine displacement, horsepower, weight, acceleration time, and model year).

The model should assist dealerships, resellers, or recommendation systems in **automatically classifying vehicles** into one of several known car brands.



In [None]:
# Connect to Google Drive
# Mounts the Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Build a Predictive Model

In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Load data
# The CSV is read from the mounted Google Drive.
df = pd.read_csv('/content/drive/MyDrive/edurekaai/_data/samples/car_brand_prediction.csv')
# Class balance of the target column. A notebook cell renders only its last
# expression, so a bare `df.sample(10)` placed before this line was computed
# and thrown away; it has been dropped.
df['brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
US.,162
Japan.,51
Europe.,48


## Preprocessing the Data

In [None]:
# Summarise column names, dtypes and non-null counts before choosing preprocessing steps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mpg          261 non-null    float64
 1   cylinders    261 non-null    int64  
 2   cubicinches  261 non-null    int64  
 3   hp           261 non-null    int64  
 4   weightlbs    261 non-null    int64  
 5   time-to-60   261 non-null    int64  
 6   year         261 non-null    int64  
 7   brand        261 non-null    object 
dtypes: float64(1), int64(6), object(1)
memory usage: 16.4+ KB


In [None]:
# Count missing values per column
df.isnull().sum()
## Every column reports 0 missing values, so no imputation is needed

Unnamed: 0,0
mpg,0
cylinders,0
cubicinches,0
hp,0
weightlbs,0
time-to-60,0
year,0
brand,0


In [None]:
# OUTLIERS
## No outlier check is run in this notebook; the data is assumed to be free of outliers, so no action is taken

In [None]:
# ENCODING
## The target column `brand` holds text labels; replace each label with an integer code.
## `assign` returns a new frame, so the raw `df` is left untouched.
le = LabelEncoder()
df_encoded = df.assign(brand=le.fit_transform(df['brand']))
df_encoded.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,2
1,31.9,4,89,71,1925,14,1980,0
2,17.0,8,302,140,3449,11,1971,2
3,15.0,8,400,150,3761,10,1971,2
4,30.5,4,98,63,2051,17,1978,2


In [None]:
# SCALING
## Standardise every numeric feature column; the encoded target `brand` is not scaled.
sc = StandardScaler()
feature_cols = ['mpg', 'cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60', 'year']
df_scaled = df_encoded.copy()
df_scaled[feature_cols] = sc.fit_transform(df_scaled[feature_cols])
df_scaled.sample(5)

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
177,-1.171127,1.393053,0.947764,1.079598,0.781612,-1.221287,-0.776684,2
140,0.468098,0.236974,-0.258003,0.21374,-0.355912,-0.877058,0.87588,2
34,0.877904,-0.919105,-0.957533,-0.973724,-1.192671,0.155628,0.325025,1
115,-0.65887,0.236974,0.220622,-0.281037,0.913855,1.188315,-0.225829,2
231,0.621775,-0.919105,-0.745833,-0.676858,-0.443684,1.188315,1.702163,2


In [None]:
# SEGREGATE INDEPENDENT AND DEPENDENT VARIABLES
## Build the feature matrix from the standardised frame so the SCALING step
## actually feeds the model; taking X from df_encoded left df_scaled unused.
## `brand` is the same in both frames because only the feature columns were rescaled.
X = df_scaled.drop('brand', axis=1)
y = df_scaled['brand']

In [None]:
# SPLIT TRAIN AND TEST DATA
## Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
## Report each partition's size as a percentage of all rows.
total_rows = df_scaled.shape[0]
print("Train Shape: ", len(X_train) * 100 / total_rows)
print("Test Shape", len(X_test) * 100 / total_rows)

Train Shape:  79.69348659003832
Test Shape 20.306513409961685


## Training Model

In [None]:
# TRAIN the MODEL
from sklearn.ensemble import RandomForestClassifier
## Baseline forest with default hyper-parameters (100 trees).
## random_state is fixed so the forest's internal randomness, and therefore
## the metrics reported below, are the same on every re-run.
rf_model_0 = RandomForestClassifier(random_state=0)
rf_model_0.fit(X_train, y_train)


In [None]:
# PREDICT on TRAIN DATA, then on TEST DATA
## Train predictions show how well the forest fits the rows it has seen;
## test predictions show how it does on held-out rows.
y_pred_train, y_pred_test = (
    rf_model_0.predict(features) for features in (X_train, X_test)
)

In [None]:
# CLASSIFICATION REPORT ON TRAIN DATA
## Per-class precision / recall / F1 of the baseline forest on the rows it was fitted on.
## Labels 0/1/2 are the LabelEncoder codes of `brand`
## (presumably 0=Europe, 1=Japan, 2=US -- confirm with le.classes_).
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        39
           2       1.00      1.00      1.00       129

    accuracy                           1.00       208
   macro avg       1.00      1.00      1.00       208
weighted avg       1.00      1.00      1.00       208



In [None]:
# CLASSIFICATION REPORT ON TEST DATA
## Same metrics for the baseline forest on the held-out rows; compare with the train report to gauge overfitting.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.64      0.88      0.74         8
           1       0.80      0.67      0.73        12
           2       0.97      0.94      0.95        33

    accuracy                           0.87        53
   macro avg       0.80      0.83      0.81        53
weighted avg       0.88      0.87      0.87        53



**Observation**
The accuracy on the unseen test data is 0.87, while the accuracy on the training data is 1.00. This gap means the model is overfitting, so we will use GridSearchCV to tune the hyper-parameters.

In [None]:
# HYPER-PARAMETER TUNING IN RANDOM FOREST
## Hyper-parameters chosen for the grid search:
## criterion, n_estimators, max_depth, min_samples_leaf
## Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.model_selection import GridSearchCV

## Candidate values; every combination is evaluated.
param_grid = dict(
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
    max_depth=[3, 4, 5],
    min_samples_leaf=[5, 10, 15],
)

## Exhaustive search with 5-fold cross-validation, fitted on the training split only.
gridcv = GridSearchCV(estimator=rf_model_0, param_grid=param_grid, cv=5)
gridcv.fit(X_train, y_train)

## Show the winning configuration.
print(gridcv.best_estimator_)

RandomForestClassifier(max_depth=5, min_samples_leaf=5)


In [None]:
# Train the Model with best estimated hyper-parameters
## Take the winning settings straight from the grid search instead of copying
## them by hand from its printed output, so this cell cannot go stale when the
## search is re-run and selects a different combination.
rf_model_tunned = RandomForestClassifier(**gridcv.best_params_)
rf_model_tunned.fit(X_train, y_train)

In [None]:
# Predict on train data, then on test data, with the tuned forest
y_pred_train2, y_pred_test2 = (
    rf_model_tunned.predict(features) for features in (X_train, X_test)
)

In [None]:
# Classification Report on Train Data
## Per-class precision / recall / F1 of the tuned forest on the rows it was fitted on.
print(classification_report(y_train, y_pred_train2))

              precision    recall  f1-score   support

           0       0.77      0.85      0.81        40
           1       0.80      0.72      0.76        39
           2       0.94      0.94      0.94       129

    accuracy                           0.88       208
   macro avg       0.84      0.84      0.83       208
weighted avg       0.88      0.88      0.88       208



In [None]:
# Classification Report on Test Data
## Same metrics for the tuned forest on the held-out rows; compare with the baseline test report.
print(classification_report(y_test, y_pred_test2))

              precision    recall  f1-score   support

           0       0.45      0.62      0.53         8
           1       0.64      0.58      0.61        12
           2       0.97      0.91      0.94        33

    accuracy                           0.79        53
   macro avg       0.69      0.71      0.69        53
weighted avg       0.82      0.79      0.80        53

