# Amazon Monitor EDA

In [329]:
import pandas as pd #type: ignore
import numpy as np #type: ignore
import matplotlib.pyplot as plt #type: ignore
import seaborn as sns #type: ignore

In [330]:
file = 'extracted_product_info_amazon.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8 Inches,FHD 1080p,16:9,4.4,94.99
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5 Inches,FHD 1080p,16:9,4.6,259.99
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27 Inches,FHD 1080p,16:9,4.5,99.99
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30 Inches,FHD 1080p Ultra Wide,21:9,4.5,199.97
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5 Inches,4K UHD 2160p,16:9,4.3,279.99


In [331]:
df.shape

(947, 7)

The dataset has 947 rows and 7 columns.

In [332]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947 entries, 0 to 946
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         947 non-null    object
 1   Brand         947 non-null    object
 2   Screen Size   947 non-null    object
 3   Resolution    947 non-null    object
 4   Aspect Ratio  947 non-null    object
 5   Rating        947 non-null    object
 6   Price         947 non-null    object
dtypes: object(7)
memory usage: 51.9+ KB


All of the columns are object datatypes. Price, rating, and size need to converted into float datatype.

In [333]:
df.describe()

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
count,947,947,947,947,947,947.0,947.0
unique,296,62,57,41,16,25.0,166.0
top,"ASUS 31.5” 1080P Monitor (VA329HE) - Full HD, ...",acer,27 Inches,FHD 1080p,16:9,4.6,199.99
freq,84,532,260,566,833,368.0,122.0


In [334]:
#change price to float
df['Price'] = df['Price'].str.replace(',', '').astype(float)

In [335]:
#dropping all rows that contain 'previous' for the rating
df['Rating'] = df['Rating'].astype(str)
index = df[df['Rating'].str.contains('Previous')].index

df = df.drop(index)

#changing rating to float
df['Rating'] = df['Rating'].astype(float)

In [336]:
#changing screen size to float
df['Screen Size'] = df['Screen Size'].astype(str)
df['Screen Size'] = df['Screen Size'].str.replace('Inches','').astype(float)

In [337]:
# removing all numeric outliers
def remove_outliers2(df):
    q1 = df.select_dtypes('number').quantile(0.25)
    q3 = df.select_dtypes('number').quantile(0.75)
    iqr = q3-q1
    lower_bound = q1 - 1.5*iqr
    upper_bound = q3 + 1.5*iqr
    return df[~((df.select_dtypes('number') < lower_bound) | (df.select_dtypes('number') > upper_bound)).any(axis=1)]

df2 = remove_outliers2(df)
df2

Unnamed: 0,Title,Brand,Screen Size,Resolution,Aspect Ratio,Rating,Price
0,"acer SB240Y G0bi 23.8"" IPS Full HD Ultra-Slim ...",acer,23.8,FHD 1080p,16:9,4.4,94.99
1,"acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",acer,31.5,FHD 1080p,16:9,4.6,259.99
2,"Acer SB272 EBI 27"" Full HD (1920 x 1080) IPS Z...",acer,27.0,FHD 1080p,16:9,4.5,99.99
3,Sceptre 30-inch Curved Gaming Monitor 21:9 256...,Sceptre,30.0,FHD 1080p Ultra Wide,21:9,4.5,199.97
4,"SAMSUNG 32"" UJ59 Series 4K UHD (3840x2160) Com...",SAMSUNG,31.5,4K UHD 2160p,16:9,4.3,279.99
...,...,...,...,...,...,...,...
941,"Acer Nitro 31.5"" WQHD 2560 x 1440 1500R Curved...",acer,31.5,QHD Wide 1440p,16:9,4.6,219.99
942,"acer SB270 G0bi 27"" IPS Full HD (1920 x 1080) ...",acer,27.0,FHD 1080p,16:9,4.5,147.35
943,"ASUS 31.5” 1080P Monitor (VA329HE) - Full HD, ...",ASUS,32.0,FHD 1080p,16:9,4.5,157.99
944,"Acer Predator XB273K V3bmiiprx 27"" UHD 3840x21...",acer,27.0,4K UHD 2160p,16:9,4.4,399.99


In [338]:
df2['UHD'] = df2['Resolution'].str.contains('UHD')
df2['FHD'] = df2['Resolution'].str.contains('FHD' or 'Full HD')
df2['Gaming'] = df2['Title'].str.contains('Gaming')
df2['Curved'] = df2['Title'].str.contains('Curved')
df2['HDR'] = df2['Title'].str.contains('HDR')
df2['Adaptive'] = df2['Title'].str.contains('Adaptive')
df2['Blue_Light'] = df2['Title'].str.contains('Blue Light')
df2['AMD_FreeSync'] = df2['Title'].str.contains('AMD FreeSync')
df2['Height_Adjust'] = df2['Title'].str.contains('Height Adjustable')
df2['Tilt'] = df2['Title'].str.contains('Tilt')
df2['Ultra_Thin'] = df2['Title'].str.contains('Ultra Thin' or 'Ultra-Thin')
# df2['Ultra_Wide'] = df2['Title'].str.contains('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['UHD'] = df2['Resolution'].str.contains('UHD')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['FHD'] = df2['Resolution'].str.contains('FHD' or 'Full HD')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Gaming'] = df2['Title'].str.contains('Gaming')
A value is trying to be set on a copy 

## Correlation

### One-Hot Encoding

In [339]:
from sklearn import preprocessing

categorical_cols = ['Brand', 'Resolution','Aspect Ratio']
enc = preprocessing.OneHotEncoder(sparse_output=False)

one_hot = enc.fit_transform(df2[categorical_cols])
new_columns = enc.get_feature_names_out(categorical_cols)
one_hot_df = pd.DataFrame(one_hot, columns=new_columns)

one_hot_df = one_hot_df.reset_index(drop=True)
df2 = df2.reset_index(drop=True)

df2_encoded = pd.concat([df2.drop(columns = categorical_cols), one_hot_df], axis=1)

df2_encoded.shape

(822, 82)

## Linear Regression Model

In [340]:
df2_encoded = df2_encoded.drop(columns='Title')
df2_encoded.head()

Unnamed: 0,Screen Size,Rating,Price,UHD,FHD,Gaming,Curved,HDR,Adaptive,Blue_Light,...,Resolution_XGA,Resolution_XGA+ Wide,Aspect Ratio_1.27:1,Aspect Ratio_1.77:1,Aspect Ratio_1.78:1,Aspect Ratio_16:10,Aspect Ratio_16:9,Aspect Ratio_2.35:1,Aspect Ratio_21:9,Aspect Ratio_Unknown
0,23.8,4.4,94.99,False,True,True,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,31.5,4.6,259.99,False,True,True,True,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,27.0,4.5,99.99,False,True,True,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,30.0,4.5,199.97,False,True,True,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,31.5,4.3,279.99,True,False,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [341]:
# creating inputs and targets
inputs = df2_encoded.loc[:, df2_encoded.columns != 'Price']
targets = df2_encoded['Price']

In [342]:
from sklearn.linear_model import LinearRegression #type: ignore
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)
loss = mean_squared_error(predictions, targets)**0.5
loss


25.914902087739073

In [343]:
print(model.score(inputs, targets))

0.9207300503222319


In [350]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(inputs, targets, test_size=0.2)
model = LinearRegression()
model.fit(train_input, train_target)
predictions = model.predict(test_input)

print(model.score(inputs, targets))

# loss = mean_squared_error(predictions, test_target)**0.5
# loss

0.8972148547738183
