# Assignment

## The task is to define and train a machine learning model that predicts the price of a laptop (the `buynow_price` column in the dataset) from its attributes. When testing and comparing models, the aim is to minimize the RMSE (root-mean-square error).

<a id="import"></a>
## <b><span style="color:navy">Step 1.1 | </span><span style="color:red">Import Libraries</span></b>

In [2]:
# Numpy
import numpy as np

# Pandas
import pandas as pd

# Matplotlib
import matplotlib.pyplot as plt

# Seaborn
import seaborn as sns

# Warnings
import warnings

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Color output
from termcolor import colored

# Datetime
import datetime

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers
from tensorflow.keras.utils import to_categorical

# Reached only if every import above succeeded; prints a green confirmation message
print(colored('\nAll libraries imported succesfully', 'green'))

[32m
All libraries imported succesfully[0m


<a id="config"></a>
## <b><span style="color:navy">Step 1.2 | </span><span style="color:red">Library configurations</span></b>

In [3]:
# pd.options.mode.copy_on_write = True # Allow re-write on variable
# pd.set_option('display.max_columns', None) # Setting this option will print all columns of a dataframe
# pd.set_option('display.max_colwidth', None) # Setting this option will print all of the data in a feature
# pd.options.mode.copy_on_write = True

# sns.set_style('darkgrid') # Seaborn style

# warnings.filterwarnings('ignore') # Ignore warnings

# print(colored('\nAll libraries configed succesfully', 'green'))

<a id="load_data"></a>
## <b><span style="color:navy">Step 2.1 | </span><span style="color:red">Importing Data</span></b>

In [4]:
# Load the training split from its JSON file into a DataFrame
train = pd.read_json(path_or_buf='train_dataset.json')

# Preview the first five rows
train.head(n=5)

Unnamed: 0,graphic card type,communications,resolution (px),CPU cores,RAM size,operating system,drive type,input devices,multimedia,RAM type,CPU clock speed (GHz),CPU model,state,drive memory size (GB),warranty,screen size,buynow_price
7233,dedicated graphics,"[bluetooth, lan 10/100/1000 mbps]",1920 x 1080,4,32 gb,[no system],ssd + hdd,"[keyboard, touchpad, illuminated keyboard, num...","[SD card reader, camera, speakers, microphone]",ddr4,2.6,intel core i7,new,1250.0,producer warranty,"17"" - 17.9""",4999.0
5845,dedicated graphics,"[wi-fi, bluetooth, lan 10/100 mbps]",1366 x 768,4,8 gb,[windows 10 home],ssd,"[keyboard, touchpad, numeric keyboard]","[SD card reader, camera, speakers, microphone]",ddr3,2.4,intel core i7,new,256.0,seller warranty,"15"" - 15.9""",2649.0
10303,,"[bluetooth, nfc (near field communication)]",1920 x 1080,2,8 gb,[windows 10 home],hdd,,[SD card reader],ddr4,1.6,intel core i7,new,1000.0,producer warranty,"15"" - 15.9""",3399.0
10423,,,,2,,,,,,,,,new,,producer warranty,,1599.0
5897,integrated graphics,"[wi-fi, bluetooth]",2560 x 1440,4,8 gb,[windows 10 home],ssd,"[keyboard, touchpad, illuminated keyboard]","[SD card reader, camera, speakers, microphone]",ddr4,1.2,other CPU,new,256.0,producer warranty,"12"" - 12.9""",4499.0


In [5]:
# Show each training column's dtype and non-null count to spot missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4711 entries, 7233 to 6037
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       4417 non-null   object 
 1   communications          4261 non-null   object 
 2   resolution (px)         4361 non-null   object 
 3   CPU cores               4711 non-null   object 
 4   RAM size                4457 non-null   object 
 5   operating system        4335 non-null   object 
 6   drive type              4454 non-null   object 
 7   input devices           4321 non-null   object 
 8   multimedia              4310 non-null   object 
 9   RAM type                4212 non-null   object 
 10  CPU clock speed (GHz)   4181 non-null   float64
 11  CPU model               4389 non-null   object 
 12  state                   4711 non-null   object 
 13  drive memory size (GB)  4439 non-null   float64
 14  warranty                4711 non-null

In [6]:
# Load the test split from its JSON file into a DataFrame
test = pd.read_json(path_or_buf='test_dataset.json')

# Preview the first five rows
test.head(n=5)

Unnamed: 0,graphic card type,communications,resolution (px),CPU cores,RAM size,operating system,drive type,input devices,multimedia,RAM type,CPU clock speed (GHz),CPU model,state,drive memory size (GB),warranty,screen size,buynow_price
5124,dedicated graphics,"[bluetooth, lan 10/100 mbps]",1920 x 1080,2,8 gb,[windows 10 home],hdd,"[keyboard, touchpad]","[SD card reader, camera, microphone]",ddr4,2.0,intel core i3,new,1000.0,producer warranty,"15"" - 15.9""",2369.0
9595,integrated graphics,"[bluetooth, lan 10/100/1000 mbps, lan 10/100 m...",1920 x 1080,2,4 gb,[windows 10 home],ssd,"[keyboard, touchpad]","[SD card reader, camera, microphone]",ddr4,2.0,intel core i3,new,240.0,producer warranty,"15"" - 15.9""",2299.0
4926,dedicated graphics,"[wi-fi, bluetooth, lan 10/100/1000 mbps]",1920 x 1080,4,16 gb,[windows 10 home],ssd + hdd,"[keyboard, touchpad, illuminated keyboard, num...","[SD card reader, camera, speakers, microphone]",ddr4,2.8,intel core i7,new,1500.0,producer warranty,"15"" - 15.9""",5999.0
2607,integrated graphics,"[lan 10/100/1000 mbps, nfc (near field communi...",3200 x 1800,2,8 gb,[windows 10 home],ssd,"[keyboard, touchpad]","[camera, speakers, microphone]",ddr3l,,intel core i5,new,256.0,producer warranty,"13"" - 13.9""",3399.0
1185,integrated graphics,"[wi-fi, bluetooth, lan 10/100 mbps]",1366 x 768,2,4 gb,[windows 10 home],hdd,"[keyboard, touchpad, numeric keyboard]","[SD card reader, camera, speakers, microphone]",ddr3,1.6,intel celeron dual-core,new,500.0,producer warranty,"15"" - 15.9""",1299.0


In [7]:
# Show each test column's dtype and non-null count to spot missing values
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1571 entries, 5124 to 1371
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       1469 non-null   object 
 1   communications          1401 non-null   object 
 2   resolution (px)         1442 non-null   object 
 3   CPU cores               1571 non-null   object 
 4   RAM size                1468 non-null   object 
 5   operating system        1431 non-null   object 
 6   drive type              1479 non-null   object 
 7   input devices           1428 non-null   object 
 8   multimedia              1415 non-null   object 
 9   RAM type                1383 non-null   object 
 10  CPU clock speed (GHz)   1361 non-null   float64
 11  CPU model               1464 non-null   object 
 12  state                   1571 non-null   object 
 13  drive memory size (GB)  1466 non-null   float64
 14  warranty                1571 non-null

In [8]:
# Load the validation split from its JSON file into a DataFrame
validation = pd.read_json(path_or_buf='val_dataset.json')

# Preview the first five rows
validation.head(n=5)

Unnamed: 0,graphic card type,communications,resolution (px),CPU cores,RAM size,operating system,drive type,input devices,multimedia,RAM type,CPU clock speed (GHz),CPU model,state,drive memory size (GB),warranty,screen size,buynow_price
3849,dedicated graphics,"[bluetooth, lan 10/100/1000 mbps, lan 10/100 m...",1920 x 1080,4,8 gb,[windows 10 home],ssd + hdd,"[keyboard, touchpad, illuminated keyboard]","[SD card reader, camera, speakers, microphone]",ddr4,2.5,intel core i5,new,1128.0,producer warranty,"15"" - 15.9""",3829.0
3904,dedicated graphics,"[bluetooth, lan 10/100 mbps]",1366 x 768,4,8 gb,[windows 10 home],ssd,"[keyboard, touchpad, numeric keyboard]","[SD card reader, camera, speakers, microphone]",ddr3,2.2,intel core i7,new,256.0,seller warranty,"15"" - 15.9""",2786.5
8356,dedicated graphics,"[bluetooth, lan 10/100/1000 mbps, lan 10/100 m...",1920 x 1080,4,16 gb,[windows 10 home],ssd + hdd,"[keyboard, touchpad, illuminated keyboard]","[SD card reader, camera, speakers, microphone]",ddr4,2.5,intel core i5,new,1256.0,producer warranty,"15"" - 15.9""",4269.0
6022,dedicated graphics,"[bluetooth, lan 10/100 mbps]",1920 x 1080,2,8 gb,[windows 10 home],ssd,"[keyboard, touchpad]","[SD card reader, camera, microphone]",ddr4,2.5,intel core i5,new,480.0,producer warranty,"15"" - 15.9""",4239.0
9678,,"[bluetooth, nfc (near field communication)]",1920 x 1080,3,8 gb,[windows 10 home],hdd,[touchpad],[SD card reader],ddr4,1.6,intel core i5,new,1000.0,producer warranty,"15"" - 15.9""",3799.0


In [9]:
# Show each validation column's dtype and non-null count to spot missing values
validation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1571 entries, 3849 to 4277
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   graphic card type       1471 non-null   object 
 1   communications          1409 non-null   object 
 2   resolution (px)         1442 non-null   object 
 3   CPU cores               1571 non-null   object 
 4   RAM size                1478 non-null   object 
 5   operating system        1437 non-null   object 
 6   drive type              1485 non-null   object 
 7   input devices           1426 non-null   object 
 8   multimedia              1420 non-null   object 
 9   RAM type                1394 non-null   object 
 10  CPU clock speed (GHz)   1375 non-null   float64
 11  CPU model               1467 non-null   object 
 12  state                   1571 non-null   object 
 13  drive memory size (GB)  1467 non-null   float64
 14  warranty                1571 non-null

<a id="info"></a>
## <b><span style="color:navy">Step 2.2 | </span><span style="color:red">Data Information</span></b>

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns
train.describe()

Unnamed: 0,CPU clock speed (GHz),drive memory size (GB),buynow_price
count,4181.0,4439.0,4711.0
mean,2.342057,652.619284,3495.831195
std,0.386298,467.657354,1727.933306
min,0.8,0.0,429.0
25%,2.1,250.0,2222.075
50%,2.5,500.0,3184.0
75%,2.6,1000.0,4399.0
max,3.9,2960.0,15472.65


#### Several columns hold values of type `list`: `communications`, `operating system`, `input devices`, and `multimedia`.

In [19]:
# Tally how often each distinct `communications` value (a list of features) occurs in the training data
train['communications'].value_counts()

[wi-fi, bluetooth, lan 10/100/1000 mbps]                                                                                              1128
[bluetooth, lan 10/100 mbps]                                                                                                           656
[bluetooth, lan 10/100/1000 mbps, lan 10/100 mbps, intel wireless display (widi), nfc (near field communication), modem 3g (wwan)]     429
[bluetooth, lan 10/100/1000 mbps]                                                                                                      395
[wi-fi, bluetooth, lan 10/100 mbps]                                                                                                    306
                                                                                                                                      ... 
[wi-fi 802.11 a/b/g/n/ac, wi-fi 802.11 b/g/n, wi-fi 802.11 a/b/g/n, bluetooth, lan 10/100 mbps]                                          1
[lan 10/100/1000 mbps, nfc 

In [18]:
# Tally how often each distinct `communications` value (a list of features) occurs in the test data
test['communications'].value_counts()

[wi-fi, bluetooth, lan 10/100/1000 mbps]                                                                                              371
[bluetooth, lan 10/100 mbps]                                                                                                          203
[bluetooth, lan 10/100/1000 mbps, lan 10/100 mbps, intel wireless display (widi), nfc (near field communication), modem 3g (wwan)]    141
[wi-fi, bluetooth, lan 10/100 mbps]                                                                                                   136
[bluetooth, lan 10/100/1000 mbps]                                                                                                     115
                                                                                                                                     ... 
[wi-fi, bluetooth, lan 10/100 mbps, modem 4g (lte)]                                                                                     1
[wi-fi, lan 10/100/1000 mbps]     