In [None]:
'''
Machine Learning Project
Abbas Yazdan Mehr - Reyhane Naseri Moghadam
'''

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Feature Engineering**

In [None]:
data =pd.read_csv("drive/My Drive/Machine Learning/FinalProject/dataset.csv")

In [None]:
# Check the properties of each column of dataset
'''
Results:
*   total number of rows : 1152
1.  need to deal with missing values :
    hdd          -> 188
    graphic_ram  -> 896
    ssd          -> 956
2.  If we have fixed the missing values,
    we will have at least "1037" rows of data to check

3.  drop the "link" column
4.  make "price" column as target

5.  cpu : one-hot encoding
6. standardize the price values
7. stock_status : categorical -> numerical

8. omit "gb/tb/mb" -> ram,hdd,ssd,graphic_ram , "inch" -> screen_size
'''
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1152 entries, 0 to 1151
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   link          1152 non-null   object
 1   cpu           1089 non-null   object
 2   ram           1037 non-null   object
 3   hdd           188 non-null    object
 4   ssd           956 non-null    object
 5   graphic_ram   896 non-null    object
 6   screen_size   1053 non-null   object
 7   stock_status  1152 non-null   object
 8   price         1152 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 81.1+ KB


In [None]:
# Step-1: Drop the "link" column
data = data.drop(['link'], axis=1)

In [None]:
'''
# Step-2 : Take the values of "price" column as target
target =  list(data["price"])
'''
# Step-2: Replacement

# Part.1 : convert

# NaN     -> 0 (in every column)
# data = data.replace(np.nan,0,regex=True)
# unified -> 0 (in graphic_ram)
data['graphic_ram'] = data['graphic_ram'].str.replace('unified','0')

# Part.2 : omit

# ram,hdd,ssd,graphic_ram -> omit "gb/tb"
data = data.replace('gb','',regex=True)
data = data.replace('tb','',regex=True)
data = data.replace('mb','',regex=True)
data = data.replace('kb','',regex=True)
# screen_size -> omit "inch"
data['screen_size'] = data['screen_size'].str.replace('inch','')

In [None]:
# Step-3 : Imputing the Missing Values
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
item_list = list(data.columns.values)
# Attention that 'cpu' and 'stock_status' are categorical and haven't changed to numerical yet
column_list = [e for e in item_list if e not in ('cpu', 'stock_status')]
for column in column_list:
  data[column] = imp.fit_transform(data[column].values.reshape(-1, 1))

imp = SimpleImputer(missing_values=0.0, strategy='most_frequent')
data['graphic_ram'] = imp.fit_transform(data['graphic_ram'].values.reshape(-1, 1) )

In [None]:
data.head(10)

Unnamed: 0,ram,hdd,ssd,graphic_ram,screen_size,price,cpu_A4,cpu_A6,cpu_Athlon,cpu_Celeron,...,cpu_M1,cpu_M2,cpu_Pentium,cpu_Ryzen 3,cpu_Ryzen 5,cpu_Ryzen 7,cpu_Ryzen 9,cpu_Xeon,stock_status_new,stock_status_stock
0,4.0,1.0,261.453975,10.546875,15.6,-0.740562,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,16.0,113.946809,512.0,8.0,15.6,0.231511,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,16.0,113.946809,512.0,4.0,15.6,-0.21498,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,8.0,113.946809,512.0,4.0,15.6,-0.257538,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,8.0,113.946809,256.0,10.546875,13.0,-0.164147,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,12.0,1.0,256.0,10.546875,15.6,-0.593948,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,4.0,1.0,128.0,10.546875,15.6,-0.736144,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,8.0,113.946809,512.0,4.0,15.6,-0.31277,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,8.0,113.946809,512.0,4.0,15.6,-0.174611,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,16.0,113.946809,512.0,4.0,15.6,-0.072382,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Step-4 :  One-Hot Encoding (Alternative : Lable Encoding)

data = pd.get_dummies(data, columns = ['cpu'])
data['stock_status'] = data['stock_status'].astype('category').cat.codes # !!! IT CAN GET DESCENDING !!!

"""
Alternative :
# Label Encoding for "cpu" values (faster way)
data['cpu'] = data['cpu'].astype('category').cat.codes
data = pd.get_dummies(data, columns = ['stock_status'])

# Another way for One-Hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder
enc_data = OneHotEncoder().fit_transform(['cpu']).toarray()
data = data.join(enc_data)
"""

'\nAlternative :\n# Label Encoding for "cpu" values (faster way)\ndata[\'cpu\'] = data[\'cpu\'].astype(\'category\').cat.codes\ndata = pd.get_dummies(data, columns = [\'stock_status\'])\n\n# Another way for One-Hot encoding with sklearn\nfrom sklearn.preprocessing import OneHotEncoder\nenc_data = OneHotEncoder().fit_transform([\'cpu\']).toarray()\ndata = data.join(enc_data)\n'

In [None]:
# Step-5 : Standardization the price values
import scipy
from scipy import stats
# Z-Score using scipy
data['price'] = stats.zscore(data['price'])

In [None]:
data.head(10)

Unnamed: 0,ram,hdd,ssd,graphic_ram,screen_size,stock_status,price,cpu_A4,cpu_A6,cpu_Athlon,...,cpu_Core i7,cpu_Core i9,cpu_M1,cpu_M2,cpu_Pentium,cpu_Ryzen 3,cpu_Ryzen 5,cpu_Ryzen 7,cpu_Ryzen 9,cpu_Xeon
0,4.0,1.0,261.453975,10.546875,15.6,0,-0.740562,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16.0,113.946809,512.0,8.0,15.6,0,0.231511,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,16.0,113.946809,512.0,4.0,15.6,0,-0.21498,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8.0,113.946809,512.0,4.0,15.6,0,-0.257538,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8.0,113.946809,256.0,10.546875,13.0,0,-0.164147,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,12.0,1.0,256.0,10.546875,15.6,0,-0.593948,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4.0,1.0,128.0,10.546875,15.6,0,-0.736144,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,8.0,113.946809,512.0,4.0,15.6,0,-0.31277,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8.0,113.946809,512.0,4.0,15.6,0,-0.174611,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,16.0,113.946809,512.0,4.0,15.6,0,-0.072382,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
item_list = list(data.columns.values)
# Attention that 'cpu' and 'stock_status' are categorical and haven't changed to numerical yet
column_list = [e for e in item_list if e not in ('price')]

X = data[column_list]
y = data['price']

In [None]:
# here we use the "train_test_split" to divide our dataset into train and test set -> train=0.7 dataset + test=0.3 dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

regr = RandomForestRegressor(max_depth=20, random_state=20)
regr.fit(X_train, y_train)
predictions = regr.predict(X_test)
score = regr.score(X_test,y_test)
print(score)

0.5855287852351527


# XGBoost

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

le = LabelEncoder()
y_train = le.fit_transform(y_train)

model = XGBRegressor(n_estimators = 400, learning_rate = 0.0000001, max_depth = 10)
model.fit(X_train.values, y_train)

In [None]:
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("mse: ", mse)
print("rmse: ", rmse)

mse:  10.043265474407098
rmse:  3.169111148951248
