In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw laptop listings (file is read with latin-1 encoding) and preview them.
df = pd.read_csv(
    'laptop_price.csv',
    encoding='latin-1',
)
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


`laptop_ID` is only a row identifier and carries no predictive information, so it is dropped.

In [4]:
# Remove the identifier column in place.
df.drop(columns='laptop_ID', inplace=True)

In [5]:
# Print column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   Product           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price_euros       1303 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


## Data Preprocessing
1. Changing currency from euros to rupees
2. Converting weight to float dtype.
3. Splitting memory column into storage and type (+extra memory).
4. Converting RAM into int dtype
5. Reducing the processor column to the CPU manufacturer name.
6. Just keeping the resolution and not the screen type(for the sake of simplicity of model).


In [6]:
# Convert the price from euros to rupees and store it as a whole number.
# NOTE(review): the original comment claimed 1 EUR = 78 INR, but the code has
# always multiplied by 56. The factor is kept at 56 so that every downstream
# number (and the trained model) stays the same -- confirm the intended rate.
EUR_TO_INR = 56
df['Price_rupees'] = (df['Price_euros'] * EUR_TO_INR).round(0).astype(int)
df.drop('Price_euros', axis=1, inplace=True)

In [7]:
# Average laptop price in rupees.
df.Price_rupees.mean()

62926.45126630852

In [8]:
# Convert weight to float dtype: keep the text before the first 'kg' and parse it.
# `n` is passed by keyword because pandas >= 2.0 only accepts `pat` positionally
# in Series.str.split; the positional form raises TypeError there.
df['Weight'] = df['Weight'].str.split('kg', n=1, expand=True)[0].astype(float)

In [9]:
# Split the Memory column into a capacity token and a type (+ extra memory).
# The first space separates e.g. '128GB' from 'SSD'; anything after it
# (including a second drive) stays in Memory_type.
# `n` is passed by keyword because pandas >= 2.0 rejects it positionally.
df[['Memory', 'Memory_type']] = df['Memory'].str.split(' ', n=1, expand=True)

def getmemory(str):
    """Return every run of digits found in ``str`` as a list of strings."""
    return re.findall(r'[0-9]+', str)

# Numeric part of the capacity: the first digit run of each token.
# The helper is applied once to the whole column instead of once per row.
df['Memory_num'] = df['Memory'].apply(getmemory).str[0].astype(int)

# Unit of the capacity: the first alphabetic run ('GB' or 'TB').
memory_unit = df['Memory'].str.extract(r'([a-zA-Z]+)', expand=False)

# Express terabyte capacities in gigabytes (1 TB is treated as 1000 GB).
df.loc[memory_unit == 'TB', 'Memory_num'] *= 1000

# The raw capacity token is no longer needed once the numeric column exists.
df.drop(columns=['Memory'], inplace=True)

df.rename(columns={'Memory_num': 'Memory_in_GB'}, inplace=True)

In [10]:
# Convert RAM to int dtype by extracting the leading number from values like '8GB'.
# The pattern uses [0-9] (not [1-9]) so that sizes containing a zero, e.g. '10GB',
# are parsed as 10 rather than 1.
df['Ram'] = df['Ram'].str.extract(r'([0-9]+)', expand=False).astype(int)

df.rename(columns={'Ram': 'Ram_in_GB'}, inplace=True)

In [11]:
# Reduce the screen description to its last space-separated token, i.e. the
# pixel resolution written as width x height (e.g. '1920x1080').
# The split is computed once for the whole column rather than once per row.
df['ScreenResolution'] = df['ScreenResolution'].str.split(' ').str[-1]

In [12]:
# Reduce the Cpu description to its first word (the manufacturer name).
# The split is computed once for the whole column rather than once per row.
df['Cpu'] = df['Cpu'].str.split(' ').str[0]

In [13]:
# Keep only the first two words of the Gpu description (e.g. 'Nvidia GeForce').
# `n` is passed by keyword because pandas >= 2.0 rejects it positionally, and
# the split is computed once instead of twice.
gpu_parts = df['Gpu'].str.split(' ', n=2, expand=True)
df['Gpu'] = gpu_parts[0] + ' ' + gpu_parts[1]

In [14]:
#Summing up opsys
def get_os(a):
    """Collapse a raw operating-system label into one of three groups.

    Returns 'Windows' for the listed Windows editions, 'Mac' for the two
    macOS spellings, and 'Others/No OS/Linux' for anything else.
    """
    if a in ('Windows 10', 'Windows 10 S', 'Windows 7'):
        return 'Windows'
    if a in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others/No OS/Linux'
    
# Replace each raw OS label with its group.
df['OpSys'] = df['OpSys'].map(get_os)

In [15]:
# Count the distinct product names listed under each company.
df['Product'].groupby(df['Company']).nunique()

Company
Acer          55
Apple          4
Asus         127
Chuwi          3
Dell          62
Fujitsu        3
Google         1
HP           122
Huawei         1
LG             3
Lenovo       138
MSI           47
Mediacom       6
Microsoft      1
Razer          2
Samsung        4
Toshiba       36
Vero           4
Xiaomi         1
Name: Product, dtype: int64

In [16]:
# Build an 'Others' bucket: take a reproducible 20% sample of df and overwrite
# its Company, Product and OpSys values with the literal 'Others'.
others = df.sample(frac=0.2, random_state=2).reset_index(drop=True)
others = others.assign(Company='Others', Product='Others', OpSys='Others')

# Append the relabelled rows to the original data, then shuffle everything
# (fixed seed) and renumber the index.
df_final = (
    pd.concat([df, others], ignore_index=True)
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

In [17]:
# Product is not used as a model feature; remove it in place.
df_final.drop(columns='Product', inplace=True)

In [18]:
# Preview the first rows of the shuffled, augmented frame.
df_final.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram_in_GB,Gpu,OpSys,Weight,Price_rupees,Memory_type,Memory_in_GB
0,HP,Workstation,15.6,1920x1080,Intel,8,Nvidia Quadro,Windows,2.59,124824,SSD,256
1,Dell,Notebook,15.6,1366x768,Intel,8,Intel HD,Windows,2.14,30237,HDD,1000
2,Lenovo,Notebook,15.6,1920x1080,Intel,8,Intel HD,Others/No OS/Linux,2.3,32088,SSD,256
3,Lenovo,Notebook,15.6,1920x1080,Intel,8,Nvidia Quadro,Windows,2.67,117040,SSD,256
4,MSI,Gaming,15.6,1920x1080,Intel,8,Nvidia GeForce,Windows,2.2,57553,SSD,256


In [35]:
# List every distinct value of each column of df, one column per section.
for col, values in df.items():
    print(f'unique values in column {col}')
    print(values.unique().tolist())
    print('-'*50)

unique values in column Company
['Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI', 'Microsoft', 'Toshiba', 'Huawei', 'Xiaomi', 'Vero', 'Razer', 'Mediacom', 'Samsung', 'Google', 'Fujitsu', 'LG']
--------------------------------------------------
unique values in column Product
['MacBook Pro', 'Macbook Air', '250 G6', 'Aspire 3', 'ZenBook UX430UN', 'Swift 3', 'Inspiron 3567', 'MacBook 12"', 'IdeaPad 320-15IKB', 'XPS 13', 'Vivobook E200HA', 'Legion Y520-15IKBN', '255 G6', 'Inspiron 5379', '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)', 'MacBook Air', 'Inspiron 5570', 'Latitude 5590', 'ProBook 470', 'LapBook 15.6"', 'E402WA-GA010T (E2-6110/2GB/32GB/W10)', '17-ak001nv (A6-9220/4GB/500GB/Radeon', 'IdeaPad 120S-14IAP', 'Inspiron 5770', 'ProBook 450', 'X540UA-DM186 (i3-6006U/4GB/1TB/FHD/Linux)', 'Inspiron 7577', 'X542UQ-GO005 (i5-7200U/8GB/1TB/GeForce', 'Aspire A515-51G', 'Inspiron 7773', 'IdeaPad 320-15ISK', 'Rog Strix', 'X751NV-TY001T (N4200/4GB/1TB/GeForce', 'Yoga Book', 'ProBook 

In [19]:
# Print column names, non-null counts and dtypes of the frame used for modelling.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1564 entries, 0 to 1563
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1564 non-null   object 
 1   TypeName          1564 non-null   object 
 2   Inches            1564 non-null   float64
 3   ScreenResolution  1564 non-null   object 
 4   Cpu               1564 non-null   object 
 5   Ram_in_GB         1564 non-null   int32  
 6   Gpu               1564 non-null   object 
 7   OpSys             1564 non-null   object 
 8   Weight            1564 non-null   float64
 9   Price_rupees      1564 non-null   int32  
 10  Memory_type       1564 non-null   object 
 11  Memory_in_GB      1564 non-null   int32  
dtypes: float64(2), int32(3), object(7)
memory usage: 128.4+ KB


## Model Building

### Using Label Encoder to encode Data

In [20]:
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

In [21]:
# Names of the object-dtype (categorical) columns of df_final, in column order.
df_cat = [col for col in df_final.columns if df_final[col].dtype == 'O']

In [22]:
# Names of the numeric columns of df_final, in column order.
# Selecting by the generic 'number' family instead of the exact strings
# 'int32'/'float64' matters because the width produced by `astype(int)` depends
# on the platform / NumPy version: where it yields int64, an exact 'int32'
# check would skip every integer column (including Price_rupees).
# Run this before the categorical columns are label-encoded, otherwise the
# encoded integer columns are picked up as well.
num_cols = df_final.select_dtypes(include='number').columns.tolist()

In [23]:
# The target must not be scaled with the features: take it out of the list.
del num_cols[num_cols.index('Price_rupees')]

In [24]:
# Label-encode every categorical column.
# A separate encoder is fitted per column and kept in `encoders`, so each
# column's category <-> integer mapping stays available (e.g. to encode new
# rows the same way or to invert predictions' inputs). Refitting one shared
# encoder would discard every mapping except the last column's.
# After the loop `le` is still bound to the encoder of the last column.
encoders = {}
for col in df_cat:
    le = LabelEncoder()
    df_final[col] = le.fit_transform(df_final[col])
    encoders[col] = le

In [25]:
# Separate the target (price) from the feature matrix; no train/test split is
# made here.
y = df_final['Price_rupees']
X = df_final.drop(columns='Price_rupees')

In [26]:
# Standardise the numeric feature columns; the scaler is fitted on all of X.
sc = StandardScaler()
scaled_values = sc.fit_transform(X[num_cols])
X[num_cols] = scaled_values

In [27]:
# Extra-trees regressor with a fixed seed so that fitting is reproducible.
xtr = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=15,
    max_features=0.75,
    random_state=3,
)

In [28]:
# Fit the model on the complete feature matrix and target.
xtr.fit(X=X, y=y)

In [29]:
from joblib import dump, load
# Serialise the fitted regressor to disk; only the model object is written.
dump(value=xtr, filename='pkkavla.joblib')

['pkkavla.joblib']