In [1]:
'''
Random Forest Classification Implementation:
'''
# Holiday Package Prediction

'''
Problem statement:
Trips and travel.com company wants to enable and establish a viable business model to expand
the customer base. One of the ways to expand the customer base is to introdcue a new offering of package tours.
Currently, there are 5 types of package tours available:
1. Basic
2. Standard
3. Deluxe
4. Super Deluxe
5. King

Looking at the data of the last year, we observed that 18% of the customers purchased the package.
However, the marketing cost was quite high because customers were contacted at random without
looking at the available information.
The company is now planning to launch a new product i.e. Wellness Package Tour.
Welness Package Tour is defined as Travel that allows the traveler to maintain,
enhance or kickstart a healthy lifesyle, 
and support or increase one's sense of well-being.

Howwever, this time company wants to harness the available data of existing and potential
customers to make the marketing expenditure more efficient.

'''



"\nProblem statement:\nTrips and travel.com company wants to enable and establish a viable business model to expand\nthe customer base. One of the ways to expand the customer base is to introdcue a new offering of package tours.\nCurrently, there are 5 types of package tours available:\n1. Basic\n2. Standard\n3. Deluxe\n4. Super Deluxe\n5. King\n\nLooking at the data of the last year, we observed that 18% of the customers purchased the package.\nHowever, the marketing cost was quite high because customers were contacted at random without\nlooking at the available information.\nThe company is now planning to launch a new product i.e. Wellness Package Tour.\nWelness Package Tour is defined as Travel that allows the traveler to maintain,\nenhance or kickstart a healthy lifesyle, \nand support or increase one's sense of well-being.\n\nHowwever, this time company wants to harness the available data of existing and potential\ncustomers to make the marketing expenditure more efficient.\n\n"

In [2]:
# Data Collection:
#https://www.kaggle.com/datasets/susant4learning/holiday-package-purchase-prediction?resource=download&select=Travel.csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
df = pd.read_csv(r'Travel.xls')
df.head()


Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


## Data Cleaning:
- Handle Missing Values
- Handling Duplicate Data
- Check Data Types
- Understand the Data

In [4]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [5]:
## Check all the categorical columns
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [6]:
df['Gender'] = df['Gender'].apply(lambda x:'Female' if x=='Fe Male' else x)
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [7]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [8]:
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single':'Unmarried'})
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [9]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [10]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [11]:
## Checking missing values

feature_with_na = [features for features in df.columns if df[features].isnull().sum() > 0]
for feature in feature_with_na:
    #print(f"{feature} has {df[feature].isnull().sum()} missing values.")
    print(feature, np.round(df[feature].isnull().mean()*100, 5), '% missing values')

Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values


In [12]:
# statistical summary of numerical columns
df[feature_with_na].select_dtypes(exclude='object').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


## Imputing Null Values:
- Impute median value for age column
- impute mode value for type of contract column
- impute median for duration of putch 
- impute mode for number of followups as it is a discrete variable
- impute mode for preferredPropertyStar
- impute median for number of trips
- impute mode for nmber of children visiting
- impute median for monthly income

In [13]:
df.Age.fillna(df.Age.median(), inplace=True)
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Age.fillna(df.Age.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [14]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [15]:
df.drop(columns=['CustomerID'], inplace=True)

In [16]:
df.head(1)

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0


## Feature Engineering:
### Feature Extraction:

In [17]:
# Create new columns for feature extraction
df['TotalVisiting'] = df.NumberOfChildrenVisiting + df.NumberOfPersonVisiting

In [18]:
df.drop(columns=['NumberOfChildrenVisiting', 'NumberOfPersonVisiting'], inplace=True)

In [19]:
# get all numerical columns
numerical_cols = [feature for feature in df.columns if df[feature].dtype!= 'O']
print(len(numerical_cols))

12


In [20]:
# get all numerical columns
categorical_cols = [feature for feature in df.columns if df[feature].dtype == 'O']
print(len(categorical_cols))

6


In [21]:
# discrete features - also knowns as categorical features
# are those features which have a limited number of unique values
discrete_features = [feature for feature in numerical_cols if len(df[feature].unique()) < 25]
print(len(discrete_features))

9


In [22]:
# continuous features - are those features which have a large number of unique values
continuous_features = [feature for feature in numerical_cols if feature not in discrete_features]
print(len(continuous_features))

3


## Train and Test Split, and Model Training:

In [23]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['ProdTaken'], axis=1)
y = df['ProdTaken']

In [24]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [25]:
X.head()

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [26]:
# separate dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3910, 17), (978, 17), (3910,), (978,))

### Fix the categorical features:

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ProdTaken               4888 non-null   int64  
 1   Age                     4888 non-null   float64
 2   TypeofContact           4888 non-null   object 
 3   CityTier                4888 non-null   int64  
 4   DurationOfPitch         4888 non-null   float64
 5   Occupation              4888 non-null   object 
 6   Gender                  4888 non-null   object 
 7   NumberOfFollowups       4888 non-null   float64
 8   ProductPitched          4888 non-null   object 
 9   PreferredPropertyStar   4888 non-null   float64
 10  MaritalStatus           4888 non-null   object 
 11  NumberOfTrips           4888 non-null   float64
 12  Passport                4888 non-null   int64  
 13  PitchSatisfactionScore  4888 non-null   int64  
 14  OwnCar                  4888 non-null   

In [28]:
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns
print("Categorical Features:", cat_features)
print("Numerical Features:", num_features)

Categorical Features: Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')
Numerical Features: Index(['Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups',
       'PreferredPropertyStar', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalVisiting'],
      dtype='object')


In [29]:
# One Hot Encoding for Categorical Features and Standardization for Numerical Features
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [30]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

In [31]:
preprocessor = ColumnTransformer(
    [
    ("OneHotEncoder", categorical_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [32]:
preprocessor

In [33]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [34]:
# Machine Learning Model - Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve,precision_score,recall_score,f1_score

In [35]:
from sklearn.metrics import roc_auc_score



models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
          }
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # train set metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_rocauc = roc_auc_score(y_train, y_train_pred)

    # test set metrics
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_rocauc = roc_auc_score(y_test, y_test_pred)

    print(f"Model: {list(models.keys())[i]}")

    print(f"Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train ROC AUC: {train_rocauc:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test ROC AUC: {test_rocauc:.4f}")
    print("\n" + "="*35 + "\n")


Model: Decision Tree
Train Accuracy: 1.0000, Train F1 Score: 1.0000, Train Precision: 1.0000, Train Recall: 1.0000, Train ROC AUC: 1.0000
Test Accuracy: 0.9162, Test F1 Score: 0.9155, Test Precision: 0.7978, Test Recall: 0.7644, Test ROC AUC: 0.8587


Model: Random Forest
Train Accuracy: 1.0000, Train F1 Score: 1.0000, Train Precision: 1.0000, Train Recall: 1.0000, Train ROC AUC: 1.0000
Test Accuracy: 0.9233, Test F1 Score: 0.9170, Test Precision: 0.9603, Test Recall: 0.6335, Test ROC AUC: 0.8136




In [36]:
# hyperparameter tuning
rf_params = {
    'max_depth': [5, 8,15, None, 10],
    'n_estimators': [100,200,500,1000],
    'min_samples_split': [2, 8,15,20],
    'max_features': [5, 7, "auto", 8],
}

In [37]:
randomcv_models = [
    ("RF", RandomForestClassifier(),rf_params)
]

In [38]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random_search = RandomizedSearchCV(model, params, n_iter=10, cv=3, verbose=2, n_jobs=-1)
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\

Best parameters for RF: {'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 5, 'max_depth': None}


In [39]:
models = {
    "Random Forest": RandomForestClassifier(**model_param['RF'])
}

In [40]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # train the model
    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # train set metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_rocauc = roc_auc_score(y_train, y_train_pred)
    # test set metrics
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_rocauc = roc_auc_score(y_test, y_test_pred)
    print(f"Model: {list(models.keys())[i]}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train ROC AUC: {train_rocauc:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test ROC AUC: {test_rocauc:.4f}")
    print("\n" + "="*35 + "\n")
    

Model: Random Forest
Train Accuracy: 1.0000, Train F1 Score: 1.0000, Train Precision: 1.0000, Train Recall: 1.0000, Train ROC AUC: 1.0000
Test Accuracy: 0.9305, Test F1 Score: 0.9251, Test Precision: 0.9767, Test Recall: 0.6597, Test ROC AUC: 0.8279




## Random FOrest Regression Implementation:
### Used Car Price Prediction

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
df = pd.read_csv(r'cardekho_imputated.csv', index_col=[0])
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [43]:
# Data Cleaning:
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [44]:
df.drop(['car_name', 'brand'], axis=1, inplace=True)

In [45]:
df.head(1)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


In [46]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [47]:
# Getting all different Types of Features
num_features = df.select_dtypes(exclude=['object']).columns
cat_features = df.select_dtypes(include=['object']).columns
discrete_features = [feature for feature in num_features if len(df[feature].unique()) < 25]
continuous_features = [feature for feature in num_features if feature not in discrete_features]
print("Numerical Features:", len(num_features))
print("Categorical Features:", len(cat_features))
print("Discrete Features:", len(discrete_features))
print("Continuous Features:", len(continuous_features))

Numerical Features: 7
Categorical Features: 4
Discrete Features: 2
Continuous Features: 5


In [48]:
# independent and dependent features
X = df.drop(columns=['selling_price'], axis=1)
y = df['selling_price']

In [49]:
# Feature Engineering:
# Scaling and Encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [50]:
df['model'].value_counts()

model
i20             906
Swift Dzire     890
Swift           781
Alto            778
City            757
               ... 
Altroz            1
C                 1
Ghost             1
Quattroporte      1
Gurkha            1
Name: count, Length: 120, dtype: int64

In [51]:
# best to assign label encoder to the model column
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

In [52]:
# Create column transformer
from sklearn.compose import ColumnTransformer
num_features = X.select_dtypes(exclude=['object']).columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScaler', numeric_transformer, num_features),
        ('OneHotEncoder', oh_transformer, onehot_columns),
    ], remainder='passthrough'
)

In [53]:
X = preprocessor.fit_transform(X)


In [54]:
# separte dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((12328, 14), (3083, 14), (12328,), (3083,))

In [55]:
## Machine Learning Model - Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [56]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

In [57]:
models = {
    'Random Forest': RandomForestRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'KNN Regressor': KNeighborsRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor()
}

In [58]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: Random Forest
Train Set Metrics:
MAE: 40285.1083, MSE: 20008721421.3230, RMSE: 141452.1878, R2: 0.9753
Test Set Metrics:
MAE: 101910.1597, MSE: 52158343083.8269, RMSE: 228382.0113, R2: 0.9307


Model: Linear Regression
Train Set Metrics:
MAE: 268101.6071, MSE: 306756099359.7596, RMSE: 553855.6665, R2: 0.6218
Test Set Metrics:
MAE: 279618.5794, MSE: 252550062888.5654, RMSE: 502543.5930, R2: 0.6645


Model: Ridge Regression
Train Set Metrics:
MAE: 268059.8015, MSE: 306756818740.9266, RMSE: 553856.3160, R2: 0.6218
Test Set Metrics:
MAE: 279557.2169, MSE: 252540243247.9686, RMSE: 502533.8230, R2: 0.6645


Model: Lasso Regression
Train Set Metrics:
MAE: 268099.2219, MSE: 306756104248.6379, RMSE: 553855.6710, R2: 0.6218
Test Set Metrics:
MAE: 279614.7453, MSE: 252549134941.5630, RMSE: 502542.6698, R2: 0.6645


Model: KNN Regressor
Train Set Metrics:
MAE: 91426.3628, MSE: 106193489990.2661, RMSE: 325873.4263, R2: 0.8691
Test Set Metrics:
MAE: 112526.3461, MSE: 64021344520.1508, RMSE: 2

In [59]:
# initialize hyperparameter tuning - for Random Forest and knn
knn_params = {
    'n_neighbors': [2, 3, 10, 30, 40, 50]}
rf_params = {
    "max_depth": [5, 8, 15, None, 10],
    "n_estimators": [100, 200, 500, 1000],
    "min_samples_split": [2, 8, 15, 20],
    "max_features": [5, 7, "auto", 8]
}


In [60]:
randomcv_models = [
    ("RF", RandomForestRegressor(), rf_params),
    ("KNN", KNeighborsRegressor(), knn_params)
]

In [61]:
from sklearn.model_selection import RandomizedSearchCV
model_param = {}
for name, model, params in randomcv_models:
    random_search = RandomizedSearchCV(model, params, n_iter=10, cv=3, verbose=2, n_jobs=-1)
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\stuar\Desktop\Data Science Learning\venv\lib\site-packages\sklearn\

Best parameters for RF: {'n_estimators': 100, 'min_samples_split': 2, 'max_features': 8, 'max_depth': 10}
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for KNN: {'n_neighbors': 10}


In [62]:
# Retraining model with best parameters
models = {
    "RandomForestClassifier":RandomForestClassifier(**model_param['RF']),
    "KNN Regressor": KNeighborsRegressor(**model_param['KNN'])
}

In [63]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: RandomForestClassifier
Train Set Metrics:
MAE: 68451.5737, MSE: 35252265006.4893, RMSE: 187755.8654, R2: 0.9565
Test Set Metrics:
MAE: 133573.4269, MSE: 107608576797.1537, RMSE: 328037.4625, R2: 0.8571


Model: KNN Regressor
Train Set Metrics:
MAE: 103470.4818, MSE: 132103417091.9857, RMSE: 363460.3377, R2: 0.8371
Test Set Metrics:
MAE: 117496.2131, MSE: 69636909424.4648, RMSE: 263888.0623, R2: 0.9075




### Adaboost Classifier


In [None]:
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [80]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    #'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'Adaboost': AdaBoostClassifier()
}


In [81]:
import warnings
warnings.filterwarnings("ignore")

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: Logistic Regression
Train Set Metrics:
MAE: 192239.2927, MSE: 202705074302.4010, RMSE: 450227.8027, R2: 0.7501
Test Set Metrics:
MAE: 210308.5063, MSE: 228848147183.1414, RMSE: 478380.7554, R2: 0.6960


Model: Decision Tree Classifier
Train Set Metrics:
MAE: 5591.3368, MSE: 903904445.1655, RMSE: 30065.0037, R2: 0.9989
Test Set Metrics:
MAE: 134779.4762, MSE: 96559824040.0989, RMSE: 310740.7666, R2: 0.8717


Model: Random Forest Classifier
Train Set Metrics:
MAE: 5417.9104, MSE: 864040882.5438, RMSE: 29394.5723, R2: 0.9989
Test Set Metrics:
MAE: 126462.1716, MSE: 302712804497.4457, RMSE: 550193.4246, R2: 0.5979


Model: Adaboost
Train Set Metrics:
MAE: 352332.0896, MSE: 802169388870.8630, RMSE: 895639.0952, R2: 0.0109
Test Set Metrics:
MAE: 373618.5939, MSE: 746449139722.8755, RMSE: 863972.8814, R2: 0.0084




In [82]:
rf_params = {
    "max_depth":[5,8,15,None,10],
    "n_estimators":[100,200,500,1000],
    "min_samples_split":[2,8,15,20],
    "max_features":[5,7,"auto",8]
}

adaboost_params = {
    "n_estimators": [50, 60, 70, 80, 90],
    "algorithm": ['SAMME', 'SAMME.R']
}

In [83]:
randomcv_models = [
    ("RF", RandomForestClassifier(), rf_params),
    ("Adaboost", AdaBoostClassifier(), adaboost_params)
]


In [85]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random_search = RandomizedSearchCV(model, params, n_iter=10, cv=3, verbose=2, n_jobs=-1)
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RF: {'n_estimators': 100, 'min_samples_split': 20, 'max_features': 7, 'max_depth': 15}
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Adaboost: {'n_estimators': 90, 'algorithm': 'SAMME'}


In [86]:
models = {
    "Random Forest": RandomForestClassifier(**model_param['RF']),
    "Adaboost": AdaBoostClassifier(**model_param['Adaboost'])
}


In [87]:
import warnings
warnings.filterwarnings("ignore")

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: Random Forest
Train Set Metrics:
MAE: 59023.3615, MSE: 23613955223.8806, RMSE: 153668.3286, R2: 0.9709
Test Set Metrics:
MAE: 125926.6542, MSE: 86443274494.2021, RMSE: 294012.3713, R2: 0.8852


Model: Adaboost
Train Set Metrics:
MAE: 346608.2090, MSE: 797764637086.3076, RMSE: 893176.7110, R2: 0.0164
Test Set Metrics:
MAE: 367123.6215, MSE: 741486408616.8099, RMSE: 861096.0507, R2: 0.0150




## Adaboost Regression Implementation:

In [90]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [89]:
# Create a function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    

In [91]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor()
}

In [92]:
import warnings
warnings.filterwarnings("ignore")

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: LinearRegression
Train Set Metrics:
MAE: 268101.6071, MSE: 306756099359.7596, RMSE: 553855.6665, R2: 0.6218
Test Set Metrics:
MAE: 279618.5794, MSE: 252550062888.5654, RMSE: 502543.5930, R2: 0.6645


Model: Ridge
Train Set Metrics:
MAE: 268059.8015, MSE: 306756818740.9266, RMSE: 553856.3160, R2: 0.6218
Test Set Metrics:
MAE: 279557.2169, MSE: 252540243247.9686, RMSE: 502533.8230, R2: 0.6645


Model: Lasso
Train Set Metrics:
MAE: 268099.2219, MSE: 306756104248.6379, RMSE: 553855.6710, R2: 0.6218
Test Set Metrics:
MAE: 279614.7453, MSE: 252549134941.5630, RMSE: 502542.6698, R2: 0.6645


Model: KNeighborsRegressor
Train Set Metrics:
MAE: 91426.3628, MSE: 106193489990.2661, RMSE: 325873.4263, R2: 0.8691
Test Set Metrics:
MAE: 112526.3461, MSE: 64021344520.1508, RMSE: 253024.3951, R2: 0.9150


Model: DecisionTreeRegressor
Train Set Metrics:
MAE: 5164.8199, MSE: 432524990.5364, RMSE: 20797.2352, R2: 0.9995
Test Set Metrics:
MAE: 122640.0611, MSE: 89321244682.9612, RMSE: 298866.6001, R

In [93]:
knn_params = {
    'n_neighbors': [2, 3, 10, 20, 40, 50]
}
rf_params = {
    "max_depth":[5,8,15,None,10],
    "n_estimators":[100,200,500,1000],
    "min_samples_split":[2,8,15,20],
    "max_features":[5,7,"auto",8]
}

adaboost_params = {
    "n_estimators": [50, 60, 70, 80, 90],
    "loss": ['linear', 'square', 'exponential'],
}

In [94]:
randomcv_models = [
    ("RF", RandomForestClassifier(), rf_params),
    ("KNN", KNeighborsRegressor(), knn_params),
    ("Adaboost", AdaBoostRegressor(), adaboost_params)
]

In [95]:
from sklearn.model_selection import RandomizedSearchCV  
model_param = {}
for name, model, params in randomcv_models:
    random_search = RandomizedSearchCV(model, params, n_iter=10, cv=3, verbose=2, n_jobs=-1)
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for RF: {'n_estimators': 200, 'min_samples_split': 15, 'max_features': 7, 'max_depth': None}
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for KNN: {'n_neighbors': 10}
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Adaboost: {'n_estimators': 90, 'loss': 'linear'}


In [102]:
# Retraining the models with best parameter
#Best parameters for Adaboost: {'n_estimators': 90, 'loss': 'linear'}
# Remove 'loss' key for AdaBoostClassifier, as it's only valid for AdaBoostRegressor
adaboost_clf_params = {k: v for k, v in model_param['Adaboost'].items() if k != 'loss'}

models = {
    "Random Forest": RandomForestClassifier(**model_param['RF']),
    "KNN Regressor": KNeighborsRegressor(**model_param['KNN']),
    "Adaboost": AdaBoostClassifier(**adaboost_clf_params)
}


In [103]:
import warnings
warnings.filterwarnings("ignore")

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # train the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f"Model: {list(models.keys())[i]}")
    
    print("Train Set Metrics:")
    evaluate_model(y_train, y_train_pred)
    
    print("Test Set Metrics:")
    evaluate_model(y_test, y_test_pred)
    
    print("\n" + "="*35 + "\n")

Model: Random Forest
Train Set Metrics:
MAE: 38074.5457, MSE: 8663187946.1389, RMSE: 93076.2480, R2: 0.9893
Test Set Metrics:
MAE: 116372.6484, MSE: 71967901480.9033, RMSE: 268268.3386, R2: 0.9044


Model: KNN Regressor
Train Set Metrics:
MAE: 103470.4818, MSE: 132103417091.9857, RMSE: 363460.3377, R2: 0.8371
Test Set Metrics:
MAE: 117496.2131, MSE: 69636909424.4648, RMSE: 263888.0623, R2: 0.9075


Model: Adaboost
Train Set Metrics:
MAE: 346608.2090, MSE: 797764637086.3076, RMSE: 893176.7110, R2: 0.0164
Test Set Metrics:
MAE: 367123.6215, MSE: 741486408616.8099, RMSE: 861096.0507, R2: 0.0150


