In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score 

In [None]:
df = pd.read_csv('laptop_data.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.duplicated().sum()

In [None]:
df.isnull().sum()

In [None]:
# Discard the 'Unnamed: 0' column; the result is rebound to df rather than mutated in place.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
df.head()

In [None]:
# Remove the unit suffix from each column so only the numeric text remains.
for column, unit in (('Ram', 'GB'), ('Weight', 'kg')):
    df[column] = df[column].str.replace(unit, '')

In [None]:
df.head()

In [None]:
# Cast the cleaned text columns to compact numeric dtypes.
for column, dtype in (('Ram', 'int32'), ('Weight', 'float32')):
    df[column] = df[column].astype(dtype)

In [None]:
df.info()

In [None]:
import seaborn as sns

In [None]:
# Distribution of 'Price'. sns.distplot is deprecated; histplot with a KDE overlay
# on a density scale is the replacement that draws the same kind of figure.
sns.histplot(df['Price'], kde=True, stat='density')

In [None]:
df['Company'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Company'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df['TypeName'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['TypeName'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Distribution of 'Inches'. sns.distplot is deprecated; histplot with a KDE overlay
# on a density scale is the replacement that draws the same kind of figure.
sns.histplot(df['Inches'], kde=True, stat='density')

In [None]:
sns.scatterplot(x=df['Inches'],y=df['Price'])

In [None]:
df['ScreenResolution'].value_counts()

In [None]:
# 1 when the resolution text contains 'Touchscreen', otherwise 0.
df['Touchscreen'] = df['ScreenResolution'].map(lambda resolution: int('Touchscreen' in resolution))


In [None]:
df.sample(5)

In [None]:
df['Touchscreen'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Touchscreen'],y=df['Price'])

In [None]:
# 1 when the resolution text contains 'IPS', otherwise 0.
df['Ips'] = df['ScreenResolution'].map(lambda resolution: int('IPS' in resolution))

In [None]:
df.head()

In [None]:
df['Ips'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ips'],y=df['Price'])

In [None]:
# Split each resolution string at the first 'x' only (n=1): column 0 holds everything
# before it and column 1 everything after it.
new = df['ScreenResolution'].str.split('x',n=1,expand=True)

In [None]:
df['X_res'] = new[0]
df['Y_res'] = new[1]

In [None]:
df.sample(5)

In [None]:
# Strip commas, then keep the first regex match as the horizontal resolution.
# The pattern requires at least two digits (optionally with one '.' between them), so a
# lone digit (e.g. in a label such as "4K") is never matched.
# x[0] raises IndexError for any row that yields no match at all.
df['X_res'] = df['X_res'].str.replace(',','').str.findall(r'(\d+\.?\d+)').apply(lambda x:x[0])

In [None]:
df.head()

In [None]:
# Both resolution components must be integers before the pixel density is computed.
for axis in ('X_res', 'Y_res'):
    df[axis] = df[axis].astype('int')

# ppi = length of the screen diagonal in pixels divided by the 'Inches' value.
diagonal_px = (df['X_res']**2 + df['Y_res']**2)**0.5
df['ppi'] = (diagonal_px / df['Inches']).astype('float')

In [None]:
df.info()

In [None]:
df.drop(columns=['ScreenResolution','Inches','X_res','Y_res'],inplace=True)

In [None]:
df.head()

In [None]:
df['Cpu'].value_counts()

In [None]:
# Keep only the first three whitespace-separated words of the CPU description.
df['Cpu Name'] = df['Cpu'].map(lambda cpu: " ".join(cpu.split()[:3]))

In [None]:
df.head()

In [None]:
def fetch_processor(text):
    """Map a shortened CPU name to one of five processor categories.

    'Intel Core i7', 'Intel Core i5' and 'Intel Core i3' are returned unchanged.
    Any other name whose first word is 'Intel' becomes 'Other Intel Processor';
    every remaining name (whatever the vendor) becomes 'AMD Processor'.
    """
    if text in ('Intel Core i7', 'Intel Core i5', 'Intel Core i3'):
        return text

    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'

In [None]:
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)

In [None]:
df.head()

In [None]:
df['Cpu brand'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Cpu brand'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df.drop(columns=['Cpu','Cpu Name'],inplace=True)

In [None]:
df.head()

In [None]:
df['Ram'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ram'],y=df['Price'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
df['Memory'].value_counts()

In [None]:
# Normalise the storage description: drop ".0" from sizes, remove the GB unit and
# express TB in GB ("1TB" -> "1000"). Raw strings keep the regex escapes valid.
memory = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
memory = memory.str.replace('GB', '', regex=False)
memory = memory.str.replace('TB', '000', regex=False)

# A '+' separates an optional second drive from the first one.
layers = memory.str.split('+', n=1, expand=True)
first = layers[0].str.strip()
# Column 1 is absent when no row lists a second drive; rows without one hold a missing
# value. Both cases become the placeholder "0" (no drive, zero capacity).
if 1 in layers.columns:
    second = layers[1].fillna('0')
else:
    second = pd.Series('0', index=df.index)


def _capacity(layer):
    """Return the first run of digits in each entry as an int, or 0 when there is none."""
    return layer.str.extract(r'(\d+)', expand=False).astype(float).fillna(0).astype(int)


first_size = _capacity(first)
second_size = _capacity(second)

# Capacity per storage type = size of each drive times a 0/1 flag saying whether that
# drive's text mentions the type. Columns are appended in this fixed order.
storage_types = (
    ('HDD', 'HDD'),
    ('SSD', 'SSD'),
    ('Hybrid', 'Hybrid'),
    ('Flash_Storage', 'Flash Storage'),
)
for column, label in storage_types:
    in_first = first.apply(lambda text: 1 if label in text else 0)
    in_second = second.apply(lambda text: 1 if label in text else 0)
    df[column] = first_size * in_first + second_size * in_second

# The raw text column is no longer needed once the per-type capacities exist.
df = df.drop(columns=['Memory'])

In [None]:
df.head()

In [None]:
df.select_dtypes(include=[np.number]).corr()["Price"]

In [None]:
df.drop(columns=['Hybrid','Flash_Storage'],inplace=True)

In [None]:
df["Gpu"].value_counts()

In [None]:
df["Gpu"] = df["Gpu"].apply(lambda x:x.split()[0])

In [None]:
df["Gpu"].value_counts()

In [None]:
# Drop the rows whose GPU vendor is "ARM". Taking an explicit copy makes the filtered
# frame independent, so the column assignment below does not write into a slice of the
# old frame (which triggers SettingWithCopyWarning).
df = df[df["Gpu"] != "ARM"].copy()
# Append 'Gpu brand' as the last column and then remove 'Gpu'. An in-place rename is
# deliberately avoided: it would change the column order, and the modelling cells
# select the categorical features by position.
df["Gpu brand"] = df["Gpu"]
df = df.drop(columns=["Gpu"])

In [None]:
df["Gpu brand"].value_counts()

In [None]:
sns.barplot(x=df["Gpu brand"],y=df["Price"])

In [None]:
df.head()

In [None]:
df["OpSys"].value_counts()

In [None]:
sns.barplot(x=df["OpSys"],y=df["Price"])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
def extract_os(inp):
    """Collapse an operating-system label into one of three families.

    Parameters
    ----------
    inp : str
        Raw operating-system label.

    Returns
    -------
    str
        "Windows" if the label contains "Windows"; "Mac" if it starts with "mac" in
        any letter case (so "macOS", "Mac" and "Mac OS X" all match);
        otherwise "Linux/Other/No OS".
    """
    if "Windows" in inp:
        return "Windows"
    # Prefix match instead of exact equality, so versioned names such as "Mac OS X"
    # are not misfiled under the catch-all family.
    if inp.lower().startswith("mac"):
        return "Mac"
    return "Linux/Other/No OS"

In [None]:
df["os"] = df["OpSys"].apply(extract_os)
df.drop(columns=['OpSys'],inplace=True)

In [None]:
df["os"].value_counts()

In [None]:
sns.barplot(x=df["os"],y=df["Price"])
plt.xticks(rotation="vertical")
plt.show()

In [None]:
numeric_df = df.select_dtypes(include=[np.number]) 
correlations = numeric_df.corr()

In [None]:
sns.heatmap(correlations)

In [None]:
# Distribution of the raw target. sns.distplot is deprecated; histplot with a KDE
# overlay on a density scale is the replacement that draws the same kind of figure.
sns.histplot(df["Price"], kde=True, stat='density')

In [None]:
# Distribution of the natural log of 'Price'. sns.distplot is deprecated; histplot with
# a KDE overlay on a density scale is the replacement that draws the same kind of figure.
sns.histplot(np.log(df["Price"]), kde=True, stat='density')

In [None]:
X = df.drop(columns=["Price"])
y = np.log(df["Price"])

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
X_train

In [None]:
y_train

# Decision Tree

In [None]:
# Column positions in X that are one-hot encoded; every other column is passed through.
# Positions (not names) are used, so the fitted pipeline also accepts plain NumPy rows.
categorical_idx = [0, 1, 7, 10, 11]

# sparse_threshold=0 makes the transformer always return a dense array. This replaces
# OneHotEncoder(sparse=False): the `sparse` keyword was renamed in scikit-learn 1.2 and
# later removed, whereas sparse_threshold works regardless of the installed version.
step1 = ColumnTransformer(
    transformers=[('col_tnf', OneHotEncoder(drop='first'), categorical_idx)],
    remainder='passthrough',
    sparse_threshold=0,
)

# A fixed random_state makes the fitted tree, and the scores below, repeatable.
step2 = DecisionTreeRegressor(max_depth=8, random_state=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# y was log-transformed before the split, so both metrics are on the log-price scale.
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

# Random Forest

In [None]:
# Column positions in X that are one-hot encoded; every other column is passed through.
# Positions (not names) are used, so the fitted pipeline also accepts plain NumPy rows.
categorical_idx = [0, 1, 7, 10, 11]

# sparse_threshold=0 makes the transformer always return a dense array. This replaces
# OneHotEncoder(sparse=False): the `sparse` keyword was renamed in scikit-learn 1.2 and
# later removed, whereas sparse_threshold works regardless of the installed version.
step1 = ColumnTransformer(
    transformers=[('col_tnf', OneHotEncoder(drop='first'), categorical_idx)],
    remainder='passthrough',
    sparse_threshold=0,
)

step2 = RandomForestRegressor(n_estimators=100,
                              random_state=3,
                              max_samples=0.5,
                              max_features=0.75,
                              max_depth=15)

pipe_rf = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe_rf.fit(X_train, y_train)

# Later cells pickle and query `pipe`; bind it to the fitted random-forest pipeline
# instead of building a second, never-fitted Pipeline around the same step objects.
pipe = pipe_rf

# Score the pipeline that was actually fitted.
y_pred = pipe_rf.predict(X_test)

# y was log-transformed before the split, so both metrics are on the log-price scale.
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

In [None]:
import pickle

# Serialise the fitted pipeline to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
df.head()

In [None]:
z = np.array(X_test)[0]

In [None]:
# Turn the single sample into a 2-D array with one row, as predict() expects.
# -1 lets NumPy infer the number of features instead of hard-coding 12.
z = z.reshape(1, -1)

In [None]:
pipe.predict(z)

In [None]:
# Hand-written single sample of 12 values, stored with dtype=object so the strings and
# numbers keep their own types.
# NOTE(review): presumably the values follow the column order of X — confirm before
# passing this to the pipeline (it would also need reshaping to one row first).
q = np.array(['HP', 'Notebook', 4, 1.4900000095367432, 0, 1, 165.6321180513006,
       'Intel Core i5', 500, 0, 'Intel', 'Windows'], dtype=object)

In [None]:
q

In [None]:
# Reload the saved pipeline. pickle.load takes only the file object, and the file must
# be opened for reading ('rb'); opening it with 'wb' would truncate pipe.pkl.
# NOTE: only unpickle files you created yourself; unpickling can run arbitrary code.
with open('pipe.pkl', 'rb') as model_file:
    w = pickle.load(model_file)

In [None]:
print(w)