In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Cleaning

In [None]:
# Read laptop_data.csv from the read-only Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/laptop-price-dataset/laptop_data.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.duplicated().sum()

In [None]:
df.isna().sum()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Strip the unit suffixes so the columns can be cast to numeric dtypes.
# regex=False forces a literal match: the default of `regex` differs between
# pandas 1.x and 2.x, so being explicit keeps the result version-independent.
df['Ram'] = df['Ram'].str.replace('GB', '', regex=False)
df['Weight'] = df['Weight'].str.replace('kg', '', regex=False)

In [None]:
df.head()

In [None]:
# Cast both cleaned text columns to compact numeric dtypes in a single call.
df = df.astype({'Ram': 'int32', 'Weight': 'float32'})

In [None]:
df.info()

# EDA & Feature Engineering

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# `sns.distplot` is deprecated and slated for removal; a density-scaled
# histogram with a KDE overlay is the documented replacement.
sns.histplot(df['Price'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Number of rows per manufacturer.
df['Company'].value_counts().plot.bar()

In [None]:
# Price per manufacturer; rotate the tick labels so the names do not overlap.
sns.barplot(data=df, x='Company', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Number of rows per laptop type.
df['TypeName'].value_counts().plot.bar()

In [None]:
# Price per laptop type, with vertical tick labels for readability.
sns.barplot(data=df, x='TypeName', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# `sns.distplot` is deprecated and slated for removal; a density-scaled
# histogram with a KDE overlay is the documented replacement.
sns.histplot(df['Inches'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Screen size against price.
sns.scatterplot(data=df, x='Inches', y='Price')

In [None]:
# Number of rows per distinct screen-resolution string.
df['ScreenResolution'].value_counts().plot.bar()

In [None]:
df['ScreenResolution'].value_counts()

In [None]:
# 1 when the resolution text mentions 'Touchscreen' (literal, case-sensitive), else 0.
df['Touchscreen'] = (
    df['ScreenResolution']
    .str.contains('Touchscreen', regex=False)
    .astype('int64')
)

In [None]:
df.sample(10)

In [None]:
# Number of rows with and without a touchscreen.
df['Touchscreen'].value_counts().plot.bar()

In [None]:
# Price for touchscreen (1) versus non-touchscreen (0) rows.
sns.barplot(data=df, x='Touchscreen', y='Price')


In [None]:
# 1 when the resolution text mentions 'IPS' (literal, case-sensitive), else 0.
df['IPS'] = (
    df['ScreenResolution']
    .str.contains('IPS', regex=False)
    .astype('int64')
)

In [None]:
df.sample(5)

In [None]:
# Number of rows with and without the IPS flag.
df['IPS'].value_counts().plot.bar()

In [None]:
# Price for IPS (1) versus non-IPS (0) rows.
sns.barplot(data=df, x='IPS', y='Price')


In [None]:
# Split each resolution string at its first 'x' (n=1): column 0 holds the text
# before the 'x' and column 1 holds whatever follows it.
new = df['ScreenResolution'].str.split('x',n=1,expand=True)

In [None]:
# Attach both halves of the split as new columns (aligned on the row index).
df = df.assign(x_res=new[0], y_res=new[1])

In [None]:
df.head()

In [None]:
# Keep only the number that ends the text: the horizontal pixel count sits
# directly before the 'x' that was split off. Anchoring at the end of the
# string avoids picking up an earlier numeric token; the previous pattern took
# the first run of two or more digits wherever it appeared.
df['x_res'] = (
    df['x_res']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)\s*$', expand=False)
)

In [None]:
df.head()

In [None]:
# Convert both resolution components from text to integers in one call.
df = df.astype({'x_res': 'int', 'y_res': 'int'})


In [None]:
df.info()

In [None]:
# Pixels per inch: length of the pixel diagonal divided by `Inches`
# (assumes `Inches` is the screen diagonal - TODO confirm against the data source).
diagonal_px = ((df['x_res']**2) + (df['y_res']**2)) ** .5
df['ppi'] = (diagonal_px / df['Inches']).astype('float')

In [None]:
# The raw resolution text has been turned into features above; remove it.
df = df.drop(columns=['ScreenResolution'])

In [None]:
# Remove the three columns that were combined into `ppi`.
df = df.drop(columns=['Inches', 'x_res', 'y_res'])

In [None]:
df['Cpu'].value_counts()

In [None]:
# Keep the first three whitespace-separated tokens of each CPU description.
df['Cpu Name'] = [" ".join(cpu.split()[:3]) for cpu in df['Cpu']]

In [None]:
df.head()

In [None]:
def fetch_processor(text):
    """Collapse a CPU name into one of five coarse processor categories.

    Returns `text` unchanged when it is exactly 'Intel Core i7',
    'Intel Core i5' or 'Intel Core i3'. Otherwise returns
    'Other Intel Processor' when the first whitespace-separated token is
    'Intel', and 'AMD Processor' for everything else (so any non-Intel
    name, whatever its vendor, lands in the AMD bucket).
    """
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text
    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'
        

In [None]:
# Map every shortened CPU name to its coarse processor category.
df['Cpu Brand'] = df['Cpu Name'].map(fetch_processor)

In [None]:
# Number of rows per processor category.
df['Cpu Brand'].value_counts().plot.bar()

In [None]:
# Price per processor category, with vertical tick labels for readability.
sns.barplot(data=df, x='Cpu Brand', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Only the derived `Cpu Brand` is kept; remove the raw and intermediate columns.
df = df.drop(columns=['Cpu', 'Cpu Name'])

In [None]:
df.head()

In [None]:
# Number of rows per RAM size.
df['Ram'].value_counts().plot.bar()

In [None]:
# Price per RAM size, with vertical tick labels for readability.
sns.barplot(data=df, x='Ram', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df['Memory'].value_counts()


In [None]:
# Remove the `Memory` column; no feature is derived from it in this notebook.
df = df.drop(columns=['Memory'])

In [None]:
df.head()

In [None]:
df['Gpu'].value_counts()

In [None]:
# The GPU brand is the first whitespace-separated token of the GPU description.
df['Gpu Brand'] = [gpu.split()[0] for gpu in df['Gpu']]

In [None]:
df['Gpu Brand'].value_counts()

In [None]:
# Exclude rows whose GPU brand is 'ARM'. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not trigger
# SettingWithCopyWarning or write to a temporary view.
df = df[df['Gpu Brand'] != 'ARM'].copy()

In [None]:
# Price per GPU brand, with vertical tick labels for readability.
sns.barplot(data=df, x='Gpu Brand', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Only the derived `Gpu Brand` is kept; remove the raw GPU description.
df = df.drop(columns=['Gpu'])

In [None]:
df['OpSys'].value_counts()

In [None]:
# Price per raw operating-system label, with vertical tick labels for readability.
sns.barplot(data=df, x='OpSys', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:


def cat_os(inp):
    """Group a raw operating-system label into one of three families.

    'Windows 10', 'Windows 7' and 'Windows 10 S' map to 'Windows';
    'macOS' and 'Mac OS X' map to 'Mac'; every other value maps to
    'Others/No OS/Linux'. Matching is exact and case-sensitive.
    """
    windows_labels = ('Windows 10', 'Windows 7', 'Windows 10 S')
    mac_labels = ('macOS', 'Mac OS X')

    if inp in windows_labels:
        return 'Windows'
    if inp in mac_labels:
        return 'Mac'
    return 'Others/No OS/Linux'



In [None]:
# Map every raw operating-system label to its family.
df['os'] = df['OpSys'].map(cat_os)

In [None]:
# Only the grouped `os` column is kept; remove the raw label.
df = df.drop(columns=['OpSys'])

In [None]:
# Price per operating-system family, with vertical tick labels for readability.
sns.barplot(data=df, x='os', y='Price')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# `sns.distplot` is deprecated and slated for removal; a density-scaled
# histogram with a KDE overlay is the documented replacement.
sns.histplot(df['Weight'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Weight against price.
sns.scatterplot(data=df, x='Weight', y='Price')

In [None]:
# Distribution of log(Price), the scale later used as the model target.
# `sns.distplot` is deprecated and slated for removal; a density-scaled
# histogram with a KDE overlay is the documented replacement.
sns.histplot(np.log(df['Price']), kde=True, stat='density', kde_kws=dict(cut=3))

# Splitting Data

In [None]:
# Target: natural log of the price. Features: every remaining column.
y = np.log(df['Price'])
x = df.drop(columns=['Price'])

In [None]:
x

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.15,random_state=2)

# Train Models

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error


from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor



In [None]:
# Linear regression

# One-hot encode the categorical features, selected by name rather than by
# position so the encoding cannot silently shift if the column order changes.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and later removed (requires scikit-learn >= 1.2).
# All remaining (numeric) columns are passed through untouched.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first'),
     ['Company', 'TypeName', 'Cpu Brand', 'Gpu Brand', 'os'])
], remainder='passthrough')

step2 = LinearRegression()

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# The target is log(Price), so both metrics are on the log scale.
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))




In [None]:

# Ridge Regression

# One-hot encode the categorical features, selected by name rather than by
# position so the encoding cannot silently shift if the column order changes.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and later removed (requires scikit-learn >= 1.2).
# All remaining (numeric) columns are passed through untouched.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first'),
     ['Company', 'TypeName', 'Cpu Brand', 'Gpu Brand', 'os'])
], remainder='passthrough')

step2 = Ridge(alpha=10)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# The target is log(Price), so both metrics are on the log scale.
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))



In [None]:
# Random Forest

# One-hot encode the categorical features, selected by name rather than by
# position so the encoding cannot silently shift if the column order changes.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and later removed (requires scikit-learn >= 1.2).
# All remaining (numeric) columns are passed through untouched.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first'),
     ['Company', 'TypeName', 'Cpu Brand', 'Gpu Brand', 'os'])
], remainder='passthrough')

# random_state pins the bootstrap/feature sampling so the scores are repeatable.
step2 = RandomForestRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# The target is log(Price), so both metrics are on the log scale.
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))

In [None]:
# Gradient Boost

# One-hot encode the categorical features, selected by name rather than by
# position so the encoding cannot silently shift if the column order changes.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and later removed (requires scikit-learn >= 1.2).
# All remaining (numeric) columns are passed through untouched.
step1 = ColumnTransformer(transformers=[
    ('col_tnf',
     OneHotEncoder(sparse_output=False, drop='first'),
     ['Company', 'TypeName', 'Cpu Brand', 'Gpu Brand', 'os'])
], remainder='passthrough')

# random_state added so repeated runs report the same scores; without it the
# model's internal randomness is seeded differently on every fit.
step2 = GradientBoostingRegressor(n_estimators=500, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# The target is log(Price), so both metrics are on the log scale.
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))