<a href="https://colab.research.google.com/github/benasphy/Linear-Regression/blob/main/Laptop%20Price%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the Dataset
# Replace 'path_to_csv' with the actual path to the downloaded dataset
# Specifying the encoding to 'latin-1' to handle potential encoding issues
df = pd.read_csv('/content/laptop_price.csv', encoding='latin-1')

# Data Preprocessing
# Handle Missing Values
df.dropna(inplace=True)

# Encode Categorical Variables
le = LabelEncoder()
categorical_features = ['Company', 'Product', 'TypeName', 'Cpu', 'Gpu', 'OpSys']
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# ---Changes start here---
# Preprocessing for 'Memory', 'Ram' and 'Weight' Features

# 1. Extract numeric value and unit from 'Memory' column
df['Memory_Num'] = df['Memory'].str.extract('(\d+)').astype(float)  # Extract numeric part
df['Memory_Unit'] = df['Memory'].str.extract('([a-zA-Z]+)')  # Extract unit

# 2. Extract numeric value from 'Ram' column
df['Ram'] = df['Ram'].str.extract('(\d+)').astype(float) # Extract numeric part of Ram

# 3. Create dummy variables for Memory_Unit, handling missing categories
# Get all unique memory units
all_memory_units = ['GB', 'HDD', 'Hybrid', 'SSD', 'TB']

# Create dummy variables, ensuring all expected categories are present
memory_unit_dummies = pd.get_dummies(df['Memory_Unit'], prefix='Memory_Unit')
for unit in all_memory_units:
    if f'Memory_Unit_{unit}' not in memory_unit_dummies.columns:
        memory_unit_dummies[f'Memory_Unit_{unit}'] = 0  # Add missing columns with 0 values

df = pd.concat([df, memory_unit_dummies], axis=1)

# 4. Drop the original 'Memory' and 'Memory_Unit' columns
df.drop(['Memory', 'Memory_Unit'], axis=1, inplace=True)

# 5. Convert 'Weight' column to numeric (extract numeric part and convert to float)
df['Weight'] = df['Weight'].str.extract('(\d+\.?\d*)').astype(float)

# ---Changes end here---

# Feature Selection
# Include new memory features in X
X = df[['Company', 'TypeName', 'Inches', 'Cpu', 'Ram', 'Memory_Num',
        'Memory_Unit_GB', 'Memory_Unit_HDD', 'Memory_Unit_Hybrid',
        'Memory_Unit_SSD', 'Memory_Unit_TB', 'Gpu', 'OpSys', 'Weight']]
y = df['Price_euros']

# Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate the Model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R^2 Score: {r2}')

RMSE: 422.36332651454927
R^2 Score: 0.6487862498100904
