# Use all the regression algorithms to predict the price. Use data cleaning techniques & other data analysis techniques to make the dataset more effective.

In [321]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [322]:
df = pd.read_excel('global_laptop_selling_data.xlsx')
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [323]:
string_columns = ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys']
df[string_columns] = df[string_columns].apply(lambda x: x.str.lower())

In [324]:
df[['SR_width', 'SR_height']] = df['ScreenResolution'].str.extract(r'(\d+)\D+(\d+)').astype(int)
df['ScreenName'] = df['ScreenResolution'].str.extract(r'([A-Za-z\s]+)')
df.drop('ScreenResolution', axis=1, inplace=True)
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName
0,1,apple,macbook pro,ultrabook,13.3,intel core i5 2.3ghz,8GB,128gb ssd,intel iris plus graphics 640,macos,1.37kg,1339.69,2560,1600,ips panel retina display
1,2,apple,macbook air,ultrabook,13.3,intel core i5 1.8ghz,8GB,128gb flash storage,intel hd graphics 6000,macos,1.34kg,898.94,1440,900,x
2,3,hp,250 g6,notebook,15.6,intel core i5 7200u 2.5ghz,8GB,256gb ssd,intel hd graphics 620,no os,1.86kg,575.0,1920,1080,full hd
3,4,apple,macbook pro,ultrabook,15.4,intel core i7 2.7ghz,16GB,512gb ssd,amd radeon pro 455,macos,1.83kg,2537.45,2880,1800,ips panel retina display
4,5,apple,macbook pro,ultrabook,13.3,intel core i5 3.1ghz,8GB,256gb ssd,intel iris plus graphics 650,macos,1.37kg,1803.6,2560,1600,ips panel retina display


In [325]:
df['PBrand'] = df['Cpu'].str.split(n=1).str[0]
df['ProcessorDetails'] = df['Cpu'].str.split(n=1).str[1]
df[['PSeries', 'PSpeed']] = df['ProcessorDetails'].str.split(n=1, expand=True)
df['PSpeed'] = df['PSpeed'].str.extract(r'(\d+\.\d+)').astype(float)
df['PSeries'] = df['Cpu'].str.extract(r'(\b[Cc]ore\s[iI]\d+)\b')
df.drop(['Cpu','ProcessorDetails'], axis=1, inplace=True)

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed
0,1,apple,macbook pro,ultrabook,13.3,8GB,128gb ssd,intel iris plus graphics 640,macos,1.37kg,1339.69,2560,1600,ips panel retina display,intel,core i5,2.3
1,2,apple,macbook air,ultrabook,13.3,8GB,128gb flash storage,intel hd graphics 6000,macos,1.34kg,898.94,1440,900,x,intel,core i5,1.8
2,3,hp,250 g6,notebook,15.6,8GB,256gb ssd,intel hd graphics 620,no os,1.86kg,575.0,1920,1080,full hd,intel,core i5,2.5
3,4,apple,macbook pro,ultrabook,15.4,16GB,512gb ssd,amd radeon pro 455,macos,1.83kg,2537.45,2880,1800,ips panel retina display,intel,core i7,2.7
4,5,apple,macbook pro,ultrabook,13.3,8GB,256gb ssd,intel iris plus graphics 650,macos,1.37kg,1803.6,2560,1600,ips panel retina display,intel,core i5,3.1


In [326]:
df[['S_capacity', 'S_type']] = df['Memory'].str.split(n=1, expand=True)
df['S_capacity'] = df['S_capacity'].str.extract(r'(\d+)').astype(int)
df.drop('Memory', axis=1, inplace=True)

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,Gpu,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed,S_capacity,S_type
0,1,apple,macbook pro,ultrabook,13.3,8GB,intel iris plus graphics 640,macos,1.37kg,1339.69,2560,1600,ips panel retina display,intel,core i5,2.3,128,ssd
1,2,apple,macbook air,ultrabook,13.3,8GB,intel hd graphics 6000,macos,1.34kg,898.94,1440,900,x,intel,core i5,1.8,128,flash storage
2,3,hp,250 g6,notebook,15.6,8GB,intel hd graphics 620,no os,1.86kg,575.0,1920,1080,full hd,intel,core i5,2.5,256,ssd
3,4,apple,macbook pro,ultrabook,15.4,16GB,amd radeon pro 455,macos,1.83kg,2537.45,2880,1800,ips panel retina display,intel,core i7,2.7,512,ssd
4,5,apple,macbook pro,ultrabook,13.3,8GB,intel iris plus graphics 650,macos,1.37kg,1803.6,2560,1600,ips panel retina display,intel,core i5,3.1,256,ssd


In [327]:
df[['G_model', 'G_number']] = df['Gpu'].str.rsplit(n=1, expand=True)
df.drop('Gpu', axis=1, inplace=True)

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed,S_capacity,S_type,G_model,G_number
0,1,apple,macbook pro,ultrabook,13.3,8GB,macos,1.37kg,1339.69,2560,1600,ips panel retina display,intel,core i5,2.3,128,ssd,intel iris plus graphics,640
1,2,apple,macbook air,ultrabook,13.3,8GB,macos,1.34kg,898.94,1440,900,x,intel,core i5,1.8,128,flash storage,intel hd graphics,6000
2,3,hp,250 g6,notebook,15.6,8GB,no os,1.86kg,575.0,1920,1080,full hd,intel,core i5,2.5,256,ssd,intel hd graphics,620
3,4,apple,macbook pro,ultrabook,15.4,16GB,macos,1.83kg,2537.45,2880,1800,ips panel retina display,intel,core i7,2.7,512,ssd,amd radeon pro,455
4,5,apple,macbook pro,ultrabook,13.3,8GB,macos,1.37kg,1803.6,2560,1600,ips panel retina display,intel,core i5,3.1,256,ssd,intel iris plus graphics,650


In [328]:
df['Weight'] = df['Weight'].astype(str)
df['Weight'] = df['Weight'].str.extract(r'(\d+\.\d+|\d+)').astype(float)

df['Ram'] = df['Ram'].astype(str)
df['Ram'] = df['Ram'].str.extract(r'(\d+\.\d+|\d+)')
df['Ram'] = df['Ram'].astype(float).fillna(0).astype(int)

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed,S_capacity,S_type,G_model,G_number
0,1,apple,macbook pro,ultrabook,13.3,8,macos,1.37,1339.69,2560,1600,ips panel retina display,intel,core i5,2.3,128,ssd,intel iris plus graphics,640
1,2,apple,macbook air,ultrabook,13.3,8,macos,1.34,898.94,1440,900,x,intel,core i5,1.8,128,flash storage,intel hd graphics,6000
2,3,hp,250 g6,notebook,15.6,8,no os,1.86,575.0,1920,1080,full hd,intel,core i5,2.5,256,ssd,intel hd graphics,620
3,4,apple,macbook pro,ultrabook,15.4,16,macos,1.83,2537.45,2880,1800,ips panel retina display,intel,core i7,2.7,512,ssd,amd radeon pro,455
4,5,apple,macbook pro,ultrabook,13.3,8,macos,1.37,1803.6,2560,1600,ips panel retina display,intel,core i5,3.1,256,ssd,intel iris plus graphics,650


In [329]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
string_columns = ['Company', 'Product','TypeName', 'ScreenName', 'PBrand','PSeries','S_type','OpSys','G_model', 'G_number','PSpeed']

for col in string_columns:
        df[col] = label_encoder.fit_transform(df[col])
df.head()


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed,S_capacity,S_type,G_model,G_number
0,1,1,289,4,13.3,8,4,1.37,1339.69,2560,1600,4,1,1,14,128,5,16,29
1,2,1,288,4,13.3,8,4,1.34,898.94,1440,900,9,1,1,8,128,0,14,25
2,3,7,50,3,15.6,8,5,1.86,575.0,1920,1080,0,1,1,16,256,5,14,27
3,4,1,289,4,15.4,16,4,1.83,2537.45,2880,1800,4,1,2,18,512,5,4,12
4,5,1,289,4,13.3,8,4,1.37,1803.6,2560,1600,4,1,1,21,256,5,16,30


In [330]:
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 laptop_ID      0
Company        0
Product        0
TypeName       0
Inches         0
Ram            0
OpSys          0
Weight         0
Price_euros    0
SR_width       0
SR_height      0
ScreenName     0
PBrand         0
PSeries        0
PSpeed         0
S_capacity     0
S_type         0
G_model        0
G_number       0
dtype: int64


In [331]:
from sklearn.ensemble import IsolationForest
outlier_detector = IsolationForest(contamination=0.05)  # Adjust contamination as needed
outlier_detector.fit(df[['Price_euros']])
outliers = outlier_detector.predict(df[['Price_euros']])
outliers

array([1, 1, 1, ..., 1, 1, 1])

In [332]:
df = df[outliers != -1]

In [333]:
df.to_excel('dataset.xlsx', index=False)
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,Ram,OpSys,Weight,Price_euros,SR_width,SR_height,ScreenName,PBrand,PSeries,PSpeed,S_capacity,S_type,G_model,G_number
0,1,1,289,4,13.3,8,4,1.37,1339.69,2560,1600,4,1,1,14,128,5,16,29
1,2,1,288,4,13.3,8,4,1.34,898.94,1440,900,9,1,1,8,128,0,14,25
2,3,7,50,3,15.6,8,5,1.86,575.0,1920,1080,0,1,1,16,256,5,14,27
3,4,1,289,4,15.4,16,4,1.83,2537.45,2880,1800,4,1,2,18,512,5,4,12
4,5,1,289,4,13.3,8,4,1.37,1803.6,2560,1600,4,1,1,21,256,5,16,30


# Linear Regression

In [334]:
ldf = df.copy()
x = ldf.drop(['laptop_ID','Price_euros'],axis='columns')
y = ldf['Price_euros']

In [335]:
from sklearn.model_selection import train_test_split as tts
xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)

In [336]:
from sklearn.linear_model import LinearRegression
li = LinearRegression()
li.fit(xtrain,ytrain)

In [337]:
from sklearn import metrics
import numpy as np

y_pred = li.predict(xtest)
y_test = np.ravel(ytest)
table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
table.head(10)

Unnamed: 0,Actual,Predicted
0,928.0,951.955544
1,379.0,500.221845
2,1193.0,1394.510919
3,1895.0,1124.50596
4,1191.0,1230.444465
5,1867.85,1615.11581
6,649.0,845.968092
7,800.0,768.334644
8,295.0,273.741829
9,573.0,1155.188449


In [338]:
li_accuracy = li.score(xtest,ytest)
li_accuracy

0.6130330263038939

In [339]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
li_mse = mean_squared_error(ytest,y_pred)
li_mse

116823.48516761347

In [340]:
li_mae = mean_absolute_error(ytest,y_pred)
li_mae

262.9646682393406

# Decision Tree Regressor

In [341]:
ddf = df.copy()
x = ddf.drop(['laptop_ID','Price_euros'],axis='columns')
y = ddf['Price_euros']

In [342]:
xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(xtrain,ytrain)

In [343]:
y_pred = dt.predict(xtest)
y_test = np.ravel(ytest)
table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
table.head(10)

Unnamed: 0,Actual,Predicted
0,928.0,923.0
1,379.0,369.0
2,1193.0,2229.0
3,1895.0,1099.0
4,1191.0,1199.0
5,1867.85,1340.0
6,649.0,522.99
7,800.0,797.41
8,295.0,379.0
9,573.0,750.0


In [344]:
dt_accuracy = dt.score(xtest,ytest)
dt_accuracy

0.7156378936965968

In [345]:
dt_mse = mean_squared_error(ytest,y_pred)
dt_mse

85847.56469180108

In [346]:
dt_mae = mean_absolute_error(ytest,y_pred)
dt_mae

189.0249193548387

# SVR

In [347]:
sdf = df.copy()
x = sdf.drop(['laptop_ID','Price_euros'],axis='columns')
y = sdf['Price_euros']

In [348]:
from sklearn.svm import SVR
xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)

svr = SVR()
svr.fit(xtrain, ytrain)

In [349]:
y_pred = svr.predict(xtest)
y_test = np.ravel(ytest)
table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
table.head(10)

Unnamed: 0,Actual,Predicted
0,928.0,960.552966
1,379.0,928.988329
2,1193.0,961.908752
3,1895.0,922.773392
4,1191.0,961.692322
5,1867.85,989.919627
6,649.0,919.486061
7,800.0,959.228954
8,295.0,918.63911
9,573.0,962.28697


In [350]:
svr_accuracy = svr.score(xtest,ytest)
svr_accuracy

0.027695191123136786

In [351]:
svr_mse = mean_squared_error(ytest,y_pred)
svr_mse

293534.1880297741

In [352]:
svr_mae = mean_absolute_error(ytest,y_pred)
svr_mae

429.1109021657149

# Random Forest

In [353]:
rdf = df.copy()
x = rdf.drop(['laptop_ID','Price_euros'],axis='columns')
y = rdf['Price_euros']

In [354]:
from sklearn.ensemble import RandomForestRegressor
xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)

rd = RandomForestRegressor(n_estimators=100, random_state=42)
rd.fit(xtrain, ytrain)

In [355]:
y_pred = rd.predict(xtest)
y_test = np.ravel(ytest)
table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
table.head(10)

Unnamed: 0,Actual,Predicted
0,928.0,941.8016
1,379.0,357.572518
2,1193.0,1553.0551
3,1895.0,1296.6778
4,1191.0,1294.5404
5,1867.85,1486.8838
6,649.0,614.1258
7,800.0,800.0922
8,295.0,352.827
9,573.0,957.0459


In [356]:
rd_accuracy = rd.score(xtest,ytest)
rd_accuracy

0.8436229167315704

In [357]:
rd_mse = mean_squared_error(ytest,y_pred)
rd_mse

47209.49618328597

In [358]:
rd_mae = mean_absolute_error(ytest,y_pred)
rd_mae

150.37122040770612

In [359]:
import pickle
filename = 'model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rd, file)

# KNN

In [360]:
kdf = df.copy()
x = kdf.drop(['laptop_ID','Price_euros'],axis='columns')
y = kdf['Price_euros']

In [361]:
from sklearn.neighbors import KNeighborsRegressor
xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(xtrain, ytrain)


In [362]:
y_pred = rd.predict(xtest)
y_test = np.ravel(ytest)
table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
table.head(10)

Unnamed: 0,Actual,Predicted
0,928.0,941.8016
1,379.0,357.572518
2,1193.0,1553.0551
3,1895.0,1296.6778
4,1191.0,1294.5404
5,1867.85,1486.8838
6,649.0,614.1258
7,800.0,800.0922
8,295.0,352.827
9,573.0,957.0459


In [363]:
knn_accuracy = knn.score(xtest,ytest)
knn_accuracy

0.6693755015822919

In [364]:
knn_mse = mean_squared_error(ytest,y_pred)
knn_mse

47209.49618328597

In [365]:
knn_mae = mean_absolute_error(ytest,y_pred)
knn_mae

150.37122040770612

# Best Model = Random Forest

In [388]:
import gradio as gr
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

df = pd.read_excel('dataset.xlsx')

x = df.drop(['laptop_ID','Price_euros'],axis='columns')
y = df['Price_euros']


xtrain,xtest,ytrain,ytest = tts(x,y,test_size=0.3,random_state=50)
rd = RandomForestRegressor(n_estimators=100, random_state=42)
rd.fit(xtrain, ytrain)

# Define the prediction function
def predict_price(company, product, typename, inches, ram, opsys, weight, sr_width, sr_height, screenname, pbrand, pseries, pspeed, s_capacity, s_type, g_model, g_number):
    # Create a new dataframe with the input features
    input_data = pd.DataFrame({
        "Company": [company],
        "Product": [product],
        "TypeName": [typename],
        "Inches": [inches],
        "Ram": [ram],
        "OpSys": [opsys],
        "Weight": [weight],
        "SR_width": [sr_width],
        "SR_height": [sr_height],
        "ScreenName": [screenname],
        "PBrand": [pbrand],
        "PSeries": [pseries],
        "PSpeed": [pspeed],
        "S_capacity": [s_capacity],
        "S_type": [s_type],
        "G_model": [g_model],
        "G_number": [g_number]
    })

    # Make predictions
    price_prediction = rd.predict(input_data)

    return price_prediction[0]

# Define the Gradio interface
inputs = [
    gr.inputs.Textbox(label="Company"),
    gr.inputs.Textbox(label="Product"),
    gr.inputs.Textbox(label="TypeName"),
    gr.inputs.Number(label="Inches"),
    gr.inputs.Number(label="Ram"),
    gr.inputs.Textbox(label="OpSys"),
    gr.inputs.Number(label="Weight"),
    gr.inputs.Number(label="SR_width"),
    gr.inputs.Number(label="SR_height"),
    gr.inputs.Textbox(label="ScreenName"),
    gr.inputs.Textbox(label="PBrand"),
    gr.inputs.Textbox(label="PSeries"),
    gr.inputs.Textbox(label="PSpeed"),
    gr.inputs.Number(label="S_capacity"),
    gr.inputs.Textbox(label="S_type"),
    gr.inputs.Textbox(label="G_model"),
    gr.inputs.Textbox(label="G_number")
]

output = gr.outputs.Textbox(label="Price Prediction")

# Create the Gradio interface
gr.Interface(fn=predict_price, inputs=inputs, outputs=output, title="Laptop Price Predictor").launch()


Running on local URL:  http://127.0.0.1:7884

To create a public link, set `share=True` in `launch()`.




In [389]:
rd_accuracy = rd.score(xtest,ytest)
rd_accuracy

0.8436229167315704