# Laptop Price prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick

In [None]:
# Load the raw laptop listings. The encoding is passed explicitly
# (presumably because the file is not valid UTF-8 -- confirm).
df = pd.read_csv(
    '../input/laptop-price/laptop_price.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

# Feature Engineering

In [None]:
# Describe every column that is neither float64 nor int64.
non_numeric_raw = df.select_dtypes(exclude=['float64', 'int64'])
non_numeric_raw.describe()

In [None]:
# Discard the 'Product' column, then display the resulting frame.
df = df.drop(columns=['Product'])
df

In [None]:
# Pull the numeric part out of the weight text. The integer part may have
# several digits and the decimal point is matched literally, so a value like
# "11.1kg" yields "11.1" (the previous pattern, with an unescaped '.', cut it
# down to "1").
df['Weight'] = df['Weight'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)

In [None]:
# Convert the extracted weight text to floating point.
df = df.astype({'Weight': 'float64'})

In [None]:
# Inspect the distinct raw CPU descriptions before parsing them.
df['Cpu'].unique()

In [None]:
# First CPU family keyword found in the description; NaN when none matches.
cpu_family_pattern = r'(i\d|AMD|Samsung|Pentium|Celeron|Atom|Core M|Xeon)'
df['Cpu_model'] = df['Cpu'].str.extract(cpu_family_pattern, expand=False)

In [None]:
# Number of rows for which no CPU family was recognised.
df['Cpu_model'].isna().sum()

In [None]:
# Clock speed token including its unit (decimal or whole-number GHz).
cpu_freq_pattern = r'(\d+\.\d+GHz|\d+GHz)'
df['Cpu_freq'] = df['Cpu'].str.extract(cpu_freq_pattern, expand=False)

In [None]:
# Strip the unit suffix, then store the clock speed as a float.
cpu_freq_text = df['Cpu_freq'].str.replace('GHz', '')
df['Cpu_freq'] = cpu_freq_text.astype('float64')

In [None]:
# Number of rows without a parsed clock speed.
df['Cpu_freq'].isna().sum()

In [None]:
# Inspect the distinct raw screen descriptions before parsing them.
df['ScreenResolution'].unique()

In [None]:
# Keep only the "<width>x<height>" part of the screen description.
df['Resolution'] = df['ScreenResolution'].str.extract(r'(\d+x\d+)', expand=False)

In [None]:
# Number of rows without a parsed resolution.
df['Resolution'].isna().sum()

In [None]:
# Inspect the distinct raw storage descriptions before parsing them.
df['Memory'].unique()

In [None]:
df['Memory_Size'] = df.Memory.str.extract(r'(\d+GB|\d+TB)')

In [None]:
df['Memory_Size'] = df.Memory_Size.str.extract(r'(\d+)').astype('int64')

In [None]:
df['Memory_Size'] = list((map((lambda x: x*1000 if x < 16 else x),df['Memory_Size'])))

In [None]:
# Storage technology. A description containing '+' is kept whole by the first
# alternative; otherwise the first matching keyword is taken.
storage_pattern = r'(.+[+].+|SSD|HDD|Flash Storage|Hybrid)'
df['Storage_type'] = df['Memory'].str.extract(storage_pattern, expand=False)

In [None]:
# Number of rows without a recognised storage type.
df['Storage_type'].isna().sum()

In [None]:
# Inspect the distinct raw GPU descriptions before parsing them.
df['Gpu'].unique()

In [None]:
# GPU vendor; descriptions naming none of these three vendors become NaN.
gpu_vendor_pattern = r'(Nvidia|AMD|Intel)'
df['Gpu_brand'] = df['Gpu'].str.extract(gpu_vendor_pattern, expand=False)

In [None]:
# List the columns that still contain at least one missing value.
missing_per_column = df.isna().sum()
print([name for name, n_missing in missing_per_column.items() if n_missing > 0])

In [None]:
# Inspect the distinct raw RAM descriptions before parsing them.
df['Ram'].unique()

In [None]:
# Keep the first run of digits of the RAM text and store it as an integer.
ram_digits = df['Ram'].str.extract(r'(\d+)', expand=False)
df['Ram'] = ram_digits.astype('int64')

In [None]:
# Re-check dtypes and non-null counts now that the derived columns exist.
df.info()

In [None]:
# Working frame without the raw text columns that were parsed above and
# without the 'laptop_ID' column.
parsed_or_id_cols = ['ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'laptop_ID']
df1 = df.drop(columns=parsed_or_id_cols)

In [None]:
# Describe every column of the working frame that is neither float64 nor int64.
non_numeric_df1 = df1.select_dtypes(exclude=['float64', 'int64'])
non_numeric_df1.describe()

In [None]:
# Remove every row that still has a missing value in any column.
df1 = df1.dropna()

In [None]:
# Show the columns that remain in the working frame.
df1.columns

# Exploratory data analysis

## Which company has the highest sales?

In [None]:
# Sum of listed prices per manufacturer, highest first. Only 'Price_euros' is
# aggregated, so the text and unrelated numeric columns are not summed.
company_sales = (
    df1.groupby('Company')[['Price_euros']]
    .sum()
    .sort_values(by='Price_euros', ascending=False)
)
plt.figure(figsize=(15, 5))
sns.barplot(data=company_sales, x=company_sales.index, y='Price_euros')
# Manufacturer names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)

## How much RAM do customers prefer when buying a notebook?

In [None]:
# Number of laptops per RAM size, smallest group first.
ram_sales = df1.groupby('Ram').count().sort_values(by='Company', ascending=True)
# Share of each RAM size in percent. The denominator is the actual number of
# rows instead of a hard-coded 1302, so the shares always add up to 100.
sales_pct = (ram_sales['Company'] / len(df1) * 100).tolist()

fig4, ax4 = plt.subplots(figsize=(10, 8))
ax4.pie(
    sales_pct,
    startangle=90,
    labels=[f'{share:.1f}%' for share in sales_pct],
    rotatelabels=True,
)
# NOTE(review): the legend previously said "Mb"; laptop RAM values of this
# magnitude are gigabytes -- confirm against the raw 'Ram' strings.
ax4.legend([f'{ram} GB RAM' for ram in ram_sales.index], loc='best')
plt.show()


## Which OS is preferred?

In [None]:
# Number of laptops per operating system, most common first.
os_sales = df1.groupby('OpSys').count().sort_values(by='Company', ascending=False)
# Share in percent, relative to the actual row count rather than a
# hard-coded 1302.
pct = (os_sales['Company'] / len(df1) * 100).tolist()

fig, ax1 = plt.subplots(figsize=(10, 5))
sns.barplot(data=os_sales, x=os_sales.index, y=pct, ax=ax1)
ax1.yaxis.set_major_formatter(mtick.PercentFormatter())

plt.show()

## Which CPU frequency do customers choose?

In [None]:
# Number of laptops per CPU clock speed, most common first.
freq_sales = df1.groupby('Cpu_freq').count().sort_values(by='Company', ascending=False)

# Share in percent, relative to the actual row count rather than a
# hard-coded 1302.
freq_pct = (freq_sales['Company'] / len(df1) * 100).tolist()

fig2, ax2 = plt.subplots(figsize=(10, 5))
sns.barplot(x=freq_sales.index, y=freq_pct, ax=ax2)
ax2.yaxis.set_major_formatter(mtick.PercentFormatter())

## Which resolution do the customers prefer?

In [None]:
# Number of laptops per screen resolution, most common first.
resolution_sales = df1.groupby(by='Resolution').count().sort_values(by='Company', ascending=False)

# Share in percent, relative to the actual row count rather than a
# hard-coded 1302.
res_pct = (resolution_sales['Company'] / len(df1) * 100).tolist()

fig, ax3 = plt.subplots(figsize=(10, 5))
sns.barplot(data=resolution_sales, x=resolution_sales.index, y=res_pct, ax=ax3)
ax3.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.xticks(rotation=90)
plt.show()

## Which variables are correlated to the price?

In [None]:
# Pairwise correlation of the numeric columns. The text columns are filtered
# out first: DataFrame.corr() on a frame that still contains object columns
# raises on pandas >= 2.0.
numeric_corr = df1.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(numeric_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Final check of dtypes and non-null counts before modelling.
df1.info()

# Preprocessing and Modeling

In [None]:
# Target is the price column; every other column is a candidate feature.
y = df1['Price_euros']
x = df1.drop(columns=['Price_euros'])

In [None]:
# Hold out a validation set (default split proportions). The seed is fixed so
# that the scores below can be reproduced from run to run.
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=0)

In [None]:
# Split the feature columns by dtype and cardinality:
#   good_cat_cols -- text columns with few distinct values (one-hot encoded)
#   bad_cat_cols  -- the remaining text columns
#   num_cols      -- everything that is not an object column
# The two text groups are now complementary: a column with exactly
# `max_onehot_levels` distinct values used to fall into neither list.
max_onehot_levels = 10
object_cols = [col for col in x_train.columns if x_train[col].dtype == 'object']
good_cat_cols = [col for col in object_cols if x_train[col].nunique() < max_onehot_levels]
bad_cat_cols = [col for col in object_cols if x_train[col].nunique() >= max_onehot_levels]
num_cols = [col for col in x_train.columns if x_train[col].dtype != 'object']

In [None]:
# Dense one-hot encoding; categories unseen during fit are encoded as all zeros.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older releases.
try:
    cat_transf = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_transf = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Rescale numeric features to the range [-1, 1].
num_transf = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Scale the numeric columns and one-hot encode the low-cardinality text
# columns. Columns in neither list are not passed to the transformers.
column_steps = [
    ('num', num_transf, num_cols),
    ('cat', cat_transf, good_cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Display the numeric feature columns that go through the scaler.
num_cols

## XGboost model

In [None]:
# Gradient-boosted regressor limited to ten boosting rounds.
xgb_settings = {'n_estimators': 10}
model = XGBRegressor(**xgb_settings)

In [None]:
# Chain preprocessing and the XGBoost regressor into a single estimator.
xgb_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(steps=xgb_steps)

In [None]:
# Fit the preprocessing steps and the XGBoost model on the training split.
clf.fit(X=x_train, y=y_train)

In [None]:
# Predicted prices for the validation split (XGBoost pipeline).
preds = clf.predict(X=x_val)

In [None]:
# Coefficient of determination of the XGBoost predictions on the validation split.
xgb_r2 = r2_score(y_true=y_val, y_pred=preds)

In [None]:
# Mean absolute error of the XGBoost predictions, in the units of the target.
mean_absolute_error(y_true=y_val, y_pred=preds)

## Random Forest model

In [None]:
# Random forest with ten trees. The seed is fixed so that the reported R2 does
# not change between runs on identical data.
model1 = RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
# Chain the same preprocessor object with the random forest regressor.
rf_steps = [
    ('preprocessor', preprocessor),
    ('model1', model1),
]
clf2 = Pipeline(steps=rf_steps)

In [None]:
# Fit the preprocessing steps and the random forest on the training split.
clf2.fit(X=x_train, y=y_train)

In [None]:
# Predicted prices for the validation split (random forest pipeline).
preds2 = clf2.predict(X=x_val)

In [None]:
# Coefficient of determination of the random forest predictions on the validation split.
rf_r2 = r2_score(y_true=y_val, y_pred=preds2)

In [None]:
# Report both validation R2 scores, one per line.
r2_report = '\n'.join([
    f'R2 for Random forest regression = {rf_r2:.2f}',
    f'R2 for XGBoost Regressor = {xgb_r2:.2f}',
])
print(r2_report)

In [None]:
# Names of the one-hot columns produced by the fitted 'cat' encoder.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
# Passing the input column names labels each feature "<column>_<category>"
# instead of "x0_<category>".
fitted_encoder = preprocessor.named_transformers_['cat']
if hasattr(fitted_encoder, 'get_feature_names_out'):
    cat_features = fitted_encoder.get_feature_names_out(good_cat_cols).tolist()
else:
    cat_features = fitted_encoder.get_feature_names(good_cat_cols).tolist()

In [None]:
# Start from a copy of the numeric column names. Binding the same list object
# would let later in-place additions to feature_names also change num_cols,
# which the preprocessor uses as its numeric column selection.
feature_names = list(num_cols)

In [None]:
# Append the one-hot column names after the numeric ones (in place).
feature_names += cat_features

## What is the most important feature for price?

In [None]:
# Bar chart of the XGBoost model's importance score for each transformed feature.
importance_scores = model.feature_importances_
plt.figure(figsize=(10, 5))
sns.barplot(x=feature_names, y=importance_scores)
plt.title('Feature importance')
plt.xticks(rotation=90)
plt.show()