# imports

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler , LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score , mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression , LogisticRegression
from scipy import stats


# Load Dataset and Data Preprocessing 

In [7]:
import chardet

# Detect file encoding
with open('laptop_price.csv', 'rb') as f:
    result = chardet.detect(f.read())
    print(result)  # Check the detected encoding

# Use the detected encoding to read the file
df = pd.read_csv('laptop_price.csv', encoding=result['encoding'])

# laptop_price
df.head()

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [8]:
df.drop('laptop_ID', axis=1, inplace=True)
df.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [9]:
print(df.isnull().sum())

Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64


# Extraction, Feature Engineering and Encoding

In [11]:
# Extract numeric value from `Ram` and convert to numeric and filling failed extarctions with 0
df['Ram'] = df['Ram'].astype(str).str.extract('(\d+)').astype(float).fillna(0)

# Extract numeric value from `Weight` and convert to numeric
df['Weight'] = df['Weight'].astype(str).str.extract('(\d+\.?\d*)').astype(float).fillna(0)

# Extract screen width, height and resolution from `ScreenResolution` after extracting only the numeric part of the string
df['ScreenResolution'] = df['ScreenResolution'].astype(str).str.replace(r'[^0-9x]', '', regex=True)
df[['ScreenWidth', 'ScreenHeight']] = df['ScreenResolution'].str.split('x', expand=True).astype(float).fillna(0)
df['Resolution'] = df['ScreenWidth'] * df['ScreenHeight']
df.drop('ScreenResolution', axis=1, inplace=True)

# Extract CPU brand, speed and name from `Cpu`
df['Cpu Brand'] = df['Cpu'].str.split().str[0].fillna('Unknown')
df['Cpu Speed'] = df['Cpu'].str.extract('(\d+\.?\d*)GHz').astype(float).fillna('Unknown')
df['Cpu Name'] = df['Cpu'].str.split(n=2).str[2].fillna('Unknown')
df.drop('Cpu', axis=1, inplace=True)

# Extract memory type and storage size from `Memory`
df['Memory Type'] = df['Memory'].str.split().str[-1].fillna('Unknown')
df['Memory Size'] = df['Memory'].astype(str).str.extract('(\d+)').astype(float).fillna(0)
df.drop('Memory', axis=1, inplace=True)

# Extract GPU brand and name from `Gpu`
df['Gpu Brand'] = df['Gpu'].str.split().str[0].fillna('Unknown')
# Drop the `Gpu Name` column
df.drop('Gpu', axis=1, inplace=True)

# One-hot encode categorical columns
categorical_cols = ['Company', 'Product', 'TypeName', 'OpSys', 'Cpu Brand', 'Cpu Name', 'Memory Type', 'Gpu Brand']
df = pd.get_dummies(df, columns=categorical_cols)

  df['Ram'] = df['Ram'].astype(str).str.extract('(\d+)').astype(float).fillna(0)
  df['Weight'] = df['Weight'].astype(str).str.extract('(\d+\.?\d*)').astype(float).fillna(0)
  df['Cpu Speed'] = df['Cpu'].str.extract('(\d+\.?\d*)GHz').astype(float).fillna('Unknown')
  df['Memory Size'] = df['Memory'].astype(str).str.extract('(\d+)').astype(float).fillna(0)


In [12]:
df

Unnamed: 0,Inches,Ram,Weight,Price_euros,ScreenWidth,ScreenHeight,Resolution,Cpu Speed,Memory Size,Company_Acer,...,Cpu Name_x5-Z8350 1.44GHz,Cpu Name_x5-Z8550 1.44GHz,Memory Type_HDD,Memory Type_Hybrid,Memory Type_SSD,Memory Type_Storage,Gpu Brand_AMD,Gpu Brand_ARM,Gpu Brand_Intel,Gpu Brand_Nvidia
0,13.3,8.0,1.37,1339.69,2560.0,1600.0,4096000.0,2.3,128.0,False,...,False,False,False,False,True,False,False,False,True,False
1,13.3,8.0,1.34,898.94,1440.0,900.0,1296000.0,1.8,128.0,False,...,False,False,False,False,False,True,False,False,True,False
2,15.6,8.0,1.86,575.00,1920.0,1080.0,2073600.0,2.5,256.0,False,...,False,False,False,False,True,False,False,False,True,False
3,15.4,16.0,1.83,2537.45,2880.0,1800.0,5184000.0,2.7,512.0,False,...,False,False,False,False,True,False,True,False,False,False
4,13.3,8.0,1.37,1803.60,2560.0,1600.0,4096000.0,3.1,256.0,False,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,14.0,4.0,1.80,638.00,1920.0,1080.0,2073600.0,2.5,128.0,False,...,False,False,False,False,True,False,False,False,True,False
1299,13.3,16.0,1.30,1499.00,3200.0,1800.0,5760000.0,2.5,512.0,False,...,False,False,False,False,True,False,False,False,True,False
1300,14.0,2.0,1.50,229.00,1366.0,768.0,1049088.0,1.6,64.0,False,...,False,False,False,False,False,True,False,False,True,False
1301,15.6,6.0,2.19,764.00,1366.0,768.0,1049088.0,2.5,1.0,False,...,False,False,True,False,False,False,True,False,False,False


# Splitting Data and Fitting to model

In [13]:
# Split data into features and target variable
X = df.drop('Price_euros', axis=1)
y = df['Price_euros']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Score

In [18]:
# print("Accuracy Score: ", accuracy_score(y_test,y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("RMSE: ", mean_squared_error(y_test, y_pred))


R2 Score:  0.8289484139872731
RMSE:  86881.63763937163
