In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

In [None]:
# Read the preprocessed dataset from the notebook's working directory.
csv_path = 'Preprocessed_data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (the default for head()).
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Ram,Weight,Price,Touchscreen,Ips,ppi,Cpu_brand,HDD,SSD,Gpu_brand,os
0,0,Apple,Ultrabook,8,1.37,71378,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,1,Apple,Ultrabook,8,1.34,47895,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,2,HP,Notebook,8,1.86,30636,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No Os/Linux
3,3,Apple,Ultrabook,16,1.83,135195,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,4,Apple,Ultrabook,8,1.37,96095,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [None]:
# Remove every leftover index column ('Unnamed: 0', 'Unnamed: 0.1', ...).
# The head() preview lists both, and the positional column indices used by the
# ColumnTransformer cells below only line up when the first feature column is
# 'Company'. Rebinding instead of dropping in place also keeps this cell safe
# to re-run (an in-place drop raises KeyError the second time).
data = data.loc[:, ~data.columns.str.startswith('Unnamed')]

In [None]:
# The target is modelled on the natural-log scale; every other column is a feature.
y = np.log(data['Price'])
X = data.drop(columns=['Price'])

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=2,
)

In [None]:
# Baseline: one-hot encode the columns at positions 0, 1, 7, 10 and 11, pass the
# rest through untouched, and fit ordinary least squares on the log-price target.
step1 = ColumnTransformer(
    transformers=[
        (
            'col_tnf',
            OneHotEncoder(sparse_output=False, drop='first'),
            [0, 1, 7, 10, 11],
        ),
    ],
    remainder='passthrough',
)
step2 = LinearRegression()

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

# Score on the held-out rows (both metrics are on the log scale).
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.7862080925646102
MAE:  0.2163334731718258


# Linear Regression with Feature Scaling

In [None]:
# Same linear model, but the columns that are not one-hot encoded are
# standardised, so `remainder` has nothing left to pass through.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
        ('scaler', StandardScaler(), [2, 3, 4, 5, 6, 8, 9]),
    ],
    remainder='passthrough',
)
step2 = LinearRegression()

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.7862080925646104
MAE:  0.2163334731718193


In [None]:
# The target is log(Price), so exponentiating the log-scale MAE of the most
# recent predictions gives the typical multiplicative error on the price.
# Computed from y_test / y_pred rather than a hand-rounded constant so the
# figure cannot go stale when the model above changes.
np.exp(mean_absolute_error(y_test, y_pred))

np.float64(1.2336780599567432)

# Ridge Regression

In [None]:
# Ridge (L2-penalised) regression on one-hot encoded + untouched numeric columns.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
step1 = ColumnTransformer(
    transformers=[('col_tnf', one_hot, [0, 1, 7, 10, 11])],
    remainder='passthrough',
)
step2 = Ridge(alpha=10)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.7959733696273845
MAE:  0.21356205771679765


# Ridge Regression with Feature Scaling

In [None]:
# Ridge again, this time with the non-encoded columns standardised before the
# penalty is applied.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
        ('scaler', StandardScaler(), [2, 3, 4, 5, 6, 8, 9]),
    ],
    remainder='passthrough',
)
step2 = Ridge(alpha=10)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.7953387070027891
MAE:  0.21376277500203758


# Lasso Regressor

In [None]:
# Lasso (L1-penalised) regression with a small penalty; preprocessing is the
# plain one-hot encoding with the remaining columns passed through.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
step1 = ColumnTransformer(
    transformers=[('col_tnf', one_hot, [0, 1, 7, 10, 11])],
    remainder='passthrough',
)
step2 = Lasso(alpha=0.001)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.7882740626131124
MAE:  0.21689203195160014


# KNN

In [None]:
# KNN is distance-based, so numeric columns with large ranges would dominate
# the neighbour search if left unscaled. Standardise the non-encoded columns
# (same positions as the "with Feature Scaling" cells) next to the one-hot
# encoding so every feature contributes on a comparable scale.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11]),
    ('scaler', StandardScaler(),[2,3,4,5,6,8,9])
], remainder='passthrough')

step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8029947631133791
MAE:  0.19361346461561793


In [None]:
# Decision Tree
# One-hot encode the columns at positions 0, 1, 7, 10 and 11 and fit a
# depth-limited regression tree on the rest as-is.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

# random_state pins the feature permutation the tree uses when choosing
# between splits, so the reported score is reproducible on a fresh run.
step2 = DecisionTreeRegressor(max_depth=8, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8403520632288226
MAE:  0.18111378213763368


# RandomForest Regression

In [None]:
# Random forest: 100 seeded trees, each grown on 70% of the training rows and
# considering a single candidate feature per split (max_features=1 is a count).
forest_params = dict(
    n_estimators=100,
    random_state=3,
    max_samples=0.7,
    max_features=1,
    max_depth=30,
)
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = RandomForestRegressor(**forest_params)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.8998496300596834
MAE:  0.14980883933241448


# ExtraTrees

In [None]:
# Extra-trees ensemble; bootstrap=True is what allows max_samples to take effect.
extra_trees_params = dict(
    n_estimators=100,
    random_state=3,
    max_samples=0.75,
    max_features=0.75,
    max_depth=40,
    bootstrap=True,
)
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = ExtraTreesRegressor(**extra_trees_params)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.8812497181893258
MAE:  0.1592935555837184


# AdaBoost

In [None]:
# AdaBoost on one-hot encoded + passthrough columns.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

# AdaBoost resamples the training set at every boosting round; seeding it makes
# the reported metrics reproducible on a fresh run.
step2 = AdaBoostRegressor(n_estimators=50, learning_rate=0.9, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.7962973872438783
MAE:  0.2305153422215676


# Gradient Boosting

In [None]:
# Gradient-boosted trees on one-hot encoded + passthrough columns.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

# Seeded so the component trees (and therefore the score) are identical on
# every run; the other ensemble cells already fix random_state=3.
step2 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8837434667229287
MAE:  0.15837088859596263


# XGBoost

In [None]:
# XGBoost: a small ensemble of shallow trees with a fairly high learning rate.
one_hot = OneHotEncoder(sparse_output=False, drop='first')
step1 = ColumnTransformer(
    transformers=[('col_tnf', one_hot, [0, 1, 7, 10, 11])],
    remainder='passthrough',
)
step2 = XGBRegressor(
    n_estimators=45,
    max_depth=5,
    learning_rate=0.2,
)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.8925766226545009
MAE:  0.15596005684855418


# Further Recommendations
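1. Feature scaling can be applied to the scale-sensitive models (e.g. KNN, Ridge, Lasso) for better performance; it leaves plain Linear Regression (see the identical scores above) and the tree-based models essentially unchanged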
2. Feature Selection can also be considered

# Voting Regressor

In [None]:
# Weighted average of four tree ensembles; the random forest carries 5x the
# weight of each other member. VotingRegressor is imported in the top import cell.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

rf = RandomForestRegressor(n_estimators=350, random_state=3, max_samples=0.5, max_features=0.75, max_depth=30)
# max_features=0.5 makes gradient boosting sample features at every split, so it
# needs a seed like the other members for the ensemble score to be reproducible.
gbdt = GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3)
xgb = XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)
et = ExtraTreesRegressor(n_estimators=100, bootstrap=True,random_state=3, max_samples=0.5, max_features=0.75, max_depth=10)

step2 = VotingRegressor([('rf', rf),('gbdt', gbdt), ('xgb', xgb),('et', et)], weights=[5,1,1,1])

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8900693205391415
MAE:  0.15977205953620843


# Stacking

In [None]:
# Stack the same four tree ensembles and let a strongly regularised Ridge model
# combine their predictions. StackingRegressor is imported in the top import cell.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

estimators = [
    ('rf', RandomForestRegressor(n_estimators=350, random_state=3, max_samples=0.5, max_features=0.75, max_depth=30)),
    # Seeded like the other base learners: max_features=0.5 samples features at
    # every split, which would otherwise make the stacked score vary run to run.
    ('gbdt', GradientBoostingRegressor(n_estimators=100, max_features=0.5, random_state=3)),
    ('xgb', XGBRegressor(n_estimators=25, learning_rate=0.3, max_depth=5)),
    ('et', ExtraTreesRegressor(n_estimators=100, bootstrap=True,random_state=3, max_samples=0.5, max_features=0.75, max_depth=10))
]

step2 = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=100))

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8861765743444663
MAE:  0.1646085560364657


# Hyperparameter Tuning

In [None]:
# Search space for the random-forest tuning below.
# `random_state` is deliberately not part of the space: it is a seed, not a
# hyperparameter, and picking the "best" seed only fits cross-validation noise.
param_grid = {
    'n_estimators': [50,100,120,150,200,500],
    'max_samples':[0.1,0.3,0.4,0.5,0.6,0.7,0.8],
    # integers are absolute feature counts considered per split
    'max_features': [1,2,3,4,5,6],
    'max_depth': [20,30,40,50,60]
}

In [None]:
# Randomized search over `param_grid` with 5-fold CV, using all cores.
# Both the search (which candidates get sampled) and the forest are seeded so
# repeated runs evaluate the same candidates and return the same scores.
grid = RandomizedSearchCV(
    RandomForestRegressor(random_state=3),
    param_distributions=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=3,
)

In [None]:
# Run the randomized search as the final pipeline step: the encoder is fitted
# once on the full training set and the search cross-validates on its output.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'),[0,1,7,10,11])
], remainder='passthrough')

# Seed both the candidate sampling and the forest so the selected parameters
# and best_score_ are reproducible after Restart & Run All.
step2 = RandomizedSearchCV(
    RandomForestRegressor(random_state=3),
    param_distributions=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=3,
)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Show the forest refitted with the best parameters found by the search step.
pipe.named_steps['step2'].best_estimator_

In [None]:
# Mean cross-validated score of the best candidate from the search step.
pipe.named_steps['step2'].best_score_

np.float64(0.8678367328056387)

In [None]:
# Final model: a seeded random forest with hand-set parameters. This is the
# `pipe` object that the export cell pickles.
final_forest_params = dict(
    n_estimators=200,
    random_state=42,
    max_samples=0.8,
    max_features=6,
    max_depth=60,
)
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0, 1, 7, 10, 11]),
    ],
    remainder='passthrough',
)
step2 = RandomForestRegressor(**final_forest_params)

pipe = Pipeline(steps=[('step1', step1), ('step2', step2)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R2 Score: ', r2)
mae = mean_absolute_error(y_test, y_pred)
print('MAE: ', mae)

R2 Score:  0.9010292667270332
MAE:  0.14771788940565544


# Exporting model

In [None]:
# Persist the `data` frame and the most recently fitted `pipe`.
# The context managers close each file handle (flushing the pickle to disk)
# even if serialisation raises; a bare open() inside dump() never closes it.
# `pickle` is imported in the top import cell.
with open('data.pkl', 'wb') as data_file:
    pickle.dump(data, data_file)

with open('pipe.pkl', 'wb') as pipe_file:
    pickle.dump(pipe, pipe_file)