# Amazon Monitor EDA

In [2]:
import pandas as pd #type: ignore
import numpy as np #type: ignore
import matplotlib.pyplot as plt #type: ignore
import seaborn as sns #type: ignore

In [3]:
# Extracted Amazon product listings; the path is resolved relative to the
# notebook's working directory.
file = 'extracted_product_info_amazon.csv'
df = pd.read_csv(filepath_or_buffer=file)

# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8 Inches,FHD 1080p,16:9,4.4,94.99
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5 Inches,FHD 1080p,16:9,4.6,259.99
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27 Inches,FHD 1080p,16:9,4.5,99.99
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30 Inches,FHD 1080p Ultra Wide,21:9,4.5,199.97
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5 Inches,4K UHD 2160p,16:9,4.3,279.99


In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(947, 7)

The dataset has 947 rows and 7 columns.

In [5]:
# Per-column dtype and non-null count, to see which columns need type conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947 entries, 0 to 946
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         947 non-null    object
 1   Brand         947 non-null    object
 2   Screen Size   947 non-null    object
 3   Resolution    947 non-null    object
 4   Aspect Ratio  947 non-null    object
 5   Rating        947 non-null    object
 6   Price         947 non-null    object
dtypes: object(7)
memory usage: 51.9+ KB


All of the columns have the object dtype. Price, Rating, and Screen Size need to be converted to the float dtype.

In [6]:
# Summary per column; while every column is still object dtype this reports
# count / unique / top / freq rather than numeric statistics
df.describe()

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
count,947,947,947,947,947,947.0,947.0
unique,296,62,57,41,16,25.0,166.0
top,"ASUS 31.5” 1080P Monitor (VA329HE) - Full HD, ...",acer,27 Inches,FHD 1080p,16:9,4.6,199.99
freq,84,532,260,566,833,368.0,122.0


In [7]:
# Change price to float.
# Cast to str first (as the Rating and Screen Size cells do) so the `.str`
# accessor also works when this cell is re-run after Price has already become
# numeric; without the cast a second run raises AttributeError.
# The thousands separator is matched literally (regex=False).
df['Price'] = (
    df['Price']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [8]:
# Normalise every rating to a string so the substring search below sees all cells.
df['Rating'] = df['Rating'].astype(str)

# Index labels of the rows whose rating text contains 'Previous'; those rows
# cannot be parsed as a number and are discarded.
index = df.loc[df['Rating'].str.contains('Previous')].index
df = df.drop(index=index)

# What remains is parsed as float.
df['Rating'] = df['Rating'].astype(float)

In [9]:
# Screen size arrives as text such as '23.8 Inches': strip the unit and parse
# the remainder as float.
df['Screen Size'] = (
    df['Screen Size']
    .astype(str)
    .str.replace('Inches', '')
    .astype(float)
)

In [10]:
# removing all numeric outliers
def remove_outliers2(df, whisker=1.5):
    """Drop every row that holds an outlier in at least one numeric column.

    A value counts as an outlier when it lies more than ``whisker``
    interquartile ranges below its column's first quartile or above its
    third quartile. Non-numeric columns play no part in the decision but
    are kept in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    whisker : float, default 1.5
        Multiple of the interquartile range used to place the lower and
        upper bounds.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without any numeric outlier. Index labels and
        column order are those of ``df``; the index is not reset.
    """
    # Select the numeric columns once instead of once per use.
    numeric = df.select_dtypes('number')
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # Comparisons against NaN are False, so a missing value never flags a row.
    is_outlier = ((numeric < lower_bound) | (numeric > upper_bound)).any(axis=1)
    return df[~is_outlier]

# Filter the cleaned frame into df2. The function returns a selection of df's
# rows, so df2 keeps df's original index labels (they are not renumbered).
df2 = remove_outliers2(df)
df2

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8,FHD 1080p,16:9,4.4,94.99
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5,FHD 1080p,16:9,4.6,259.99
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27.0,FHD 1080p,16:9,4.5,99.99
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30.0,FHD 1080p Ultra Wide,21:9,4.5,199.97
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5,4K UHD 2160p,16:9,4.3,279.99
...,...,...,...,...,...,...,...
941,"Acer Nitro 31.5"" WQHD 2560 x 1440 1500R Curved...",acer,31.5,QHD Wide 1440p,16:9,4.6,219.99
942,"acer SB270 G0bi 27"" IPS Full HD (1920 x 1080) ...",acer,27.0,FHD 1080p,16:9,4.5,147.35
943,"ASUS 31.5” 1080P Monitor (VA329HE) - Full HD, ...",ASUS,32.0,FHD 1080p,16:9,4.5,157.99
944,"Acer Predator XB273K V3bmiiprx 27"" UHD 3840x21...",acer,27.0,4K UHD 2160p,16:9,4.4,399.99


In [12]:
# Distinct values of each categorical column in df2, taken from the index of
# value_counts() (which by default orders values from most to least frequent).
brands_list = df2['Brand'].value_counts().index
res_list = df2['Resolution'].value_counts().index
aspect_list = df2['Aspect Ratio'].value_counts().index

## Correlation

### One-Hot Encoding

In [28]:
from sklearn import preprocessing

# One-hot encode the categorical columns and attach the indicator columns to
# the remaining columns of df2.
categorical_cols = ['Brand', 'Resolution','Aspect Ratio']
enc = preprocessing.OneHotEncoder(sparse_output=False)
one_hot = enc.fit_transform(df2[categorical_cols])

# fit_transform returns a plain array, so the frame built from it must be given
# df2's index explicitly. df2's index has gaps (rows were dropped during the
# rating cleanup and outlier removal without resetting the index); with a
# default RangeIndex, pd.concat(axis=1) aligns on labels and yields misaligned,
# NaN-padded rows.
one_hot_df = pd.DataFrame(
    one_hot,
    columns=enc.get_feature_names_out(categorical_cols),
    index=df2.index,
)
df2_encoded = pd.concat([df2.drop(categorical_cols, axis=1), one_hot_df], axis=1)

# Guard against the misalignment described above: no rows may be added.
assert len(df2_encoded) == len(df2), "one-hot rows are misaligned with df2"

# Displayed only: the result is not assigned, so df2 itself is unchanged.
df2.dropna()

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price,acer,ASUS,AOPEN,...,kasorey,Alienware,Philips Computer Monitors,CIDETTY,cocopar,Cevaton,KYY,SANSUI,Teamgee,Atdec
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8,FHD 1080p,16:9,4.4,94.99,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5,FHD 1080p,16:9,4.6,259.99,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27.0,FHD 1080p,16:9,4.5,99.99,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30.0,FHD 1080p Ultra Wide,21:9,4.5,199.97,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5,4K UHD 2160p,16:9,4.3,279.99,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,"Acer Nitro 31.5"" WQHD 2560 x 1440 1500R Curved...",acer,31.5,QHD Wide 1440p,16:9,4.6,219.99,1,0,0,...,0,0,0,0,0,0,0,0,0,0
942,"acer SB270 G0bi 27"" IPS Full HD (1920 x 1080) ...",acer,27.0,FHD 1080p,16:9,4.5,147.35,1,0,0,...,0,0,0,0,0,0,0,0,0,0
943,"ASUS 31.5” 1080P Monitor (VA329HE) - Full HD, ...",ASUS,32.0,FHD 1080p,16:9,4.5,157.99,0,1,0,...,0,0,0,0,0,0,0,0,0,0
944,"Acer Predator XB273K V3bmiiprx 27"" UHD 3840x21...",acer,27.0,4K UHD 2160p,16:9,4.4,399.99,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Number of distinct Brand values left in df2
df2['Brand'].unique().size

37

## Linear Regression Model

In [251]:
# NOTE(review): the output recorded for this cell shows columns (e.g. 'UHD',
# 'FHD', per-brand and per-screen-size indicators) that no visible cell adds to
# df2 -- presumably created by cells that are missing from this view or were
# deleted. Confirm the notebook survives Restart Kernel -> Run All.
df2.head()

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price,UHD,FHD,acer,...,31.5,23.8,32.0,34.0,24.5,24.0,21.5,14.0,15.6,22.0
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8,FHD 1080p,16:9,4.4,94.99,False,True,1,...,0,1,0,0,0,0,0,0,0,0
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5,FHD 1080p,16:9,4.6,259.99,False,True,1,...,1,0,0,0,0,0,0,0,0,0
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27.0,FHD 1080p,16:9,4.5,99.99,False,True,1,...,0,0,0,0,0,0,0,0,0,0
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30.0,FHD 1080p Ultra Wide,21:9,4.5,199.97,False,True,0,...,0,0,0,0,0,0,0,0,0,0
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5,4K UHD 2160p,16:9,4.3,279.99,True,False,0,...,1,0,0,0,0,0,0,0,0,0


In [252]:
# creating inputs and targets
# Build a new list rather than aliasing col_list: `input_list = col_list`
# followed by `.remove('Price')` mutated col_list itself, so re-running this
# cell raised ValueError and any other user of col_list silently lost 'Price'.
# NOTE(review): col_list is not defined in any visible cell -- presumably the
# list of model column names; confirm where it is created.
input_list = [col for col in col_list if col != 'Price']
inputs = df2[input_list]
targets = df2['Price']


In [253]:
from sklearn.linear_model import LinearRegression #type: ignore
from sklearn.metrics import mean_squared_error

# Baseline: fit on the full dataset and evaluate on the very same rows, so the
# figure below is a training error and is optimistic.
model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

# mean_squared_error is documented as (y_true, y_pred); the square root turns
# the MSE into an RMSE in the same units as Price.
loss = mean_squared_error(targets, predictions)**0.5
loss


38.09422909176828

In [254]:
# Coefficient of determination (R^2) of `model` on inputs/targets -- the same
# data it was fitted on when the cells are run in order, so not a held-out score.
print(model.score(inputs, targets))

0.8287116537677413


In [256]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows to measure out-of-sample error.
# random_state pins the shuffle so the split -- and therefore the reported
# RMSE -- is the same on every Restart Kernel -> Run All.
train_input, test_input, train_target, test_target = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)

# Refit from scratch on the training portion only (rebinds `model`).
model = LinearRegression()
model.fit(train_input, train_target)
predictions = model.predict(test_input)

# mean_squared_error is documented as (y_true, y_pred); RMSE on the held-out rows.
loss = mean_squared_error(test_target, predictions)**0.5
loss

53.51153606214052