## Model training and Evaluation

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import os
import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Read the cleaned dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [3]:
# Work on an independent (deep) copy so `data` keeps the values as loaded
df = data.copy(deep=True)

In [4]:
#pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)

In [5]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(1016, 15)

In [6]:
# Number of missing values in each column
df.isnull().sum()

brand_names           0
price                 0
score               420
processor brand       0
processor type        4
processor gen       101
type of core         23
no of threads         0
Ram                   0
ram type              1
storage type          0
storage capacity      8
ppi                  34
OS                    0
Warranty             86
dtype: int64

In [7]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016 entries, 0 to 1015
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand_names       1016 non-null   object 
 1   price             1016 non-null   int64  
 2   score             596 non-null    float64
 3   processor brand   1016 non-null   object 
 4   processor type    1012 non-null   object 
 5   processor gen     915 non-null    float64
 6   type of core      993 non-null    float64
 7   no of threads     1016 non-null   float64
 8   Ram               1016 non-null   int64  
 9   ram type          1015 non-null   object 
 10  storage type      1016 non-null   object 
 11  storage capacity  1008 non-null   float64
 12  ppi               982 non-null    float64
 13  OS                1016 non-null   object 
 14  Warranty          930 non-null    float64
dtypes: float64(7), int64(2), object(6)
memory usage: 119.2+ KB


In [8]:
# Partition the columns by dtype: object ('O') columns are treated as
# categorical, every other column as numeric (this includes the target).
is_object = (df.dtypes == 'O').to_numpy()
num_col = df.columns[~is_object].tolist()
cat_col = df.columns[is_object].tolist()

In [9]:
# Columns whose dtype is not object
num_col

['price',
 'score',
 'processor gen',
 'type of core',
 'no of threads',
 'Ram',
 'storage capacity',
 'ppi',
 'Warranty']

In [10]:
# Same list without its first entry
num_col[1:]

['score',
 'processor gen',
 'type of core',
 'no of threads',
 'Ram',
 'storage capacity',
 'ppi',
 'Warranty']

In [11]:
# Columns whose dtype is object
cat_col

['brand_names',
 'processor brand',
 'processor type',
 'ram type',
 'storage type',
 'OS']

In [12]:
# Separate the target column ('price') from the feature columns
y = df['price']
X = df.drop(columns=['price'])

In [13]:
# Numeric features: robust-scale first, then fill the remaining NaNs with the
# mean of the (already scaled) column.
numeric_preprocessor = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the cell runs on both old and new releases.
encoder_options = dict(drop='first', handle_unknown="ignore")
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, **encoder_options)

# Categorical features: fill missing entries with a constant placeholder, then
# one-hot encode (first level dropped, unseen categories ignored at transform).
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("encoder", one_hot_encoder),
])

# Exclude the target by name instead of slicing num_col[1:], which only works
# while 'price' happens to be the first numeric column of the frame.
numeric_features = [col for col in num_col if col != 'price']

preprocessor = ColumnTransformer([
    ("categorical", categorical_preprocessor, cat_col),
    ("numerical", numeric_preprocessor, numeric_features),
])

In [14]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
split = train_test_split(X, y, test_size=0.20, random_state=10)
X_train, X_test, y_train, y_test = split

In [15]:
# Shape of the training features before preprocessing
X_train.shape

(812, 14)

In [16]:
# Learn the preprocessing on the training split only, then apply it to both splits.
# NOTE(review): X_train / X_test are rebound to the transformed output, so
# re-running this cell feeds already-transformed data back into the
# preprocessor; re-run the train/test split cell first.
preprocessor.fit(X_train)
X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [17]:
# Shape of the training features after preprocessing
X_train.shape

(812, 44)

In [18]:
# Build the regression network: four ReLU hidden layers and one linear output.
# The input width is read from the preprocessed training matrix instead of being
# hard-coded, because the number of one-hot columns depends on which categories
# are present in the training split. X_train must already be transformed here.
n_features = X_train.shape[1]
model = Sequential([
    Dense(256, kernel_initializer='he_uniform', activation='relu', input_dim=n_features),
    Dense(128, kernel_initializer='he_uniform', activation='relu'),
    Dense(128, kernel_initializer='he_uniform', activation='relu'),
    Dense(96, kernel_initializer='he_uniform', activation='relu'),
    Dense(1, activation='linear'),
])

In [19]:
# Configure training: Adam optimizer minimizing mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

In [20]:
# Wrap the Keras model as the only step of a scikit-learn Pipeline.
# The preprocessor is NOT part of this pipeline, so inputs must be
# transformed before they are passed in.
regressor = Pipeline([('model', model)])

In [29]:
# Train the network and keep the Keras History object.
# model.fit is called directly because Pipeline.fit returns the pipeline
# object, not the History, so assigning its result would not capture the
# per-epoch losses. A one-step pipeline forwards to this same call.
model_history = model.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=32,
    validation_split=0.15,  # last 15% of the training rows are used for validation
    verbose=0,
)

In [30]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               11520     
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dense_3 (Dense)             (None, 96)                12384     
                                                                 
 dense_4 (Dense)             (None, 1)                 97        
                                                                 
Total params: 73,409
Trainable params: 73,409
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Loss of the compiled model (mean squared error) on the held-out test split
loss = model.evaluate(x=X_test, y=y_test)
loss



500195168.0

In [32]:
# Predict prices for the training split first, then for the test split
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)



In [33]:
# Root-mean-squared error per split (square root of the MSE)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_train, rmse_test = np.sqrt(mse_train), np.sqrt(mse_test)

# Coefficient of determination per split
r2_train, r2_test = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)

# Report: training metrics, a separator line, then test metrics
for label, value in (("RMSE_train: ", rmse_train), ("R2_train: ", r2_train)):
    print(label, value)
print('**************************')
for label, value in (("RMSE_test: ", rmse_test), ("R2_test: ", r2_test)):
    print(label, value)

RMSE_train:  24131.024769798823
R2_train:  0.8479778688933461
**************************
RMSE_test:  22365.044196159924
R2_test:  0.8175589933275735


In [34]:
# First six predicted prices for the test split
y_pred_test[:6]

array([[ 48620.605],
       [ 74108.97 ],
       [ 55578.715],
       [233764.84 ],
       [ 98010.234],
       [ 18446.457]], dtype=float32)

In [35]:
# First six actual prices of the test split, for comparison with the predictions
y_test[:6]

998     56990
304     63350
271     51018
625    159990
951     68999
353     21833
Name: price, dtype: int64

In [36]:
def saveModel_path(model_dir="SAVED_MODELS"):
    """Build a timestamped ``.h5`` file path inside ``model_dir``.

    The directory is created when it does not exist yet. The chosen path is
    printed and returned; the file itself is not created here.

    Args:
        model_dir: Directory that should contain the model file.

    Returns:
        ``model_dir`` joined with a file name of the form
        ``Model_<YYYY_mm_dd_HH_MM_SS>_.h5`` built from the current time.
    """
    # Name the file after the moment of the call
    model_path = os.path.join(
        model_dir,
        time.strftime("Model_%Y_%m_%d_%H_%M_%S_.h5"),
    )
    # Make sure the target folder is there before the caller writes to it
    os.makedirs(model_dir, exist_ok=True)
    print(f"your model will be saved at the following location\n{model_path}")
    return model_path

In [37]:
# Pick a timestamped path (creating the folder if needed) and write the model there
model_path = saveModel_path()
model.save(model_path)

your model will be saved at the following location
SAVED_MODELS\Model_2023_02_18_10_23_40_.h5
