# Laptop Price Prediction

In [271]:
import kagglehub
import shutil
import os
import pandas as pd



In [272]:
# Load the raw laptop dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('laptops.csv')

# EDA

In [273]:
#from ydata_profiling import ProfileReport

#profile = ProfileReport(data, title="Laptop Specs And Price Report", explorative=True)
#profile.to_notebook_iframe()

In [274]:
#import dtale

#dtale.show(data)

In [275]:
import plotly.express as px
import plotly.graph_objects as go

def numerical_eda(data, column_name):
    """
    Explore one numerical column with summary statistics and two Plotly figures.

    Prints the ``describe()`` summary of the column, then shows a 10-bin
    histogram (with a marginal boxplot) followed by a standalone boxplot that
    also marks the mean. If the column is missing, a message is printed and
    nothing is plotted.

    Parameters:
    - data: pd.DataFrame
        The dataset containing the column.
    - column_name: str
        The name of the numerical column to analyze.

    Returns:
    - None
    """
    # Guard clause: report the missing column and stop before any plotting.
    if column_name not in data.columns:
        print(f"The column '{column_name}' does not exist in the dataset.")
        return

    values = data[column_name]

    # Summary statistics are computed first, then printed under a header.
    stats = values.describe()
    print(f"Summary Statistics for {column_name}:")
    print(stats)

    # Histogram with a boxplot drawn in the top margin.
    histogram = px.histogram(
        data,
        x=column_name,
        nbins=10,
        title=f"Interactive Histogram: {column_name} Distribution",
        labels={column_name: column_name},
        color_discrete_sequence=['orange'],
        marginal='box',
    )
    histogram.update_layout(
        xaxis={"title": column_name},
        yaxis={"title": "Frequency"},
        title={"font": {"size": 20}},
        template="plotly_white",
    )
    histogram.show()

    # Standalone boxplot; boxmean=True additionally draws the mean.
    mean_box = go.Box(
        y=values,
        name=column_name,
        marker_color='lightblue',
        boxmean=True,
    )
    boxplot = go.Figure()
    boxplot.add_trace(mean_box)
    boxplot.update_layout(
        title=f"Interactive Boxplot: {column_name} Distribution",
        yaxis={"title": column_name},
        xaxis={"title": ""},
        template="plotly_white",
    )
    boxplot.show()

In [276]:
import pandas as pd
import plotly.express as px

def categorical_eda(data, column_name):
    """
    Explore one categorical column with a frequency table and two Plotly figures.

    Prints the per-category counts, then shows a bar chart (one color per
    category, counts written above the bars) followed by a pie chart labelled
    with percentages. If the column is missing, a message is printed and
    nothing is plotted.

    Parameters:
    - data: pd.DataFrame
        The dataset containing the column.
    - column_name: str
        The name of the categorical column to analyze.

    Returns:
    - None
    """
    # Guard clause: report the missing column and stop before any plotting.
    if column_name not in data.columns:
        print(f"The column '{column_name}' does not exist in the dataset.")
        return

    # Frequency table with two positionally renamed columns: <column_name>, 'Count'.
    counts = (
        data[column_name]
        .value_counts()
        .reset_index()
        .set_axis([column_name, 'Count'], axis=1)
    )

    print(f"Summary of {column_name} Distribution:")
    print(counts)

    # Bar chart: one bar (and color) per category, count shown outside each bar.
    bars = px.bar(
        counts,
        x=column_name,
        y='Count',
        title=f"Interactive Bar Chart: {column_name} Distribution",
        labels={column_name: column_name, 'Count': 'Count'},
        text='Count',
        color=column_name,
    )
    bars.update_traces(textposition='outside')
    bars.update_layout(
        xaxis={"title": column_name},
        yaxis={"title": "Count"},
        title={"font": {"size": 20}},
        template="plotly_white",
    )
    bars.show()

    # Pie chart: share of each category, labelled with percent and category name.
    pie = px.pie(
        counts,
        names=column_name,
        values='Count',
        title=f"Interactive Pie Chart: {column_name} Distribution",
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    pie.update_traces(textinfo='percent+label')
    pie.update_layout(
        title={"font": {"size": 20}},
        template="plotly_white",
    )
    pie.show()

In [None]:
import plotly.express as px

# Frequency table of the 'brand' column with columns renamed to 'Brand' / 'Count'.
brand_summary = (
    data['brand']
    .value_counts()
    .reset_index()
    .set_axis(['Brand', 'Count'], axis=1)
)

# Build the bar chart, then apply trace and layout tweaks in one chain
# (the Plotly update_* methods return the figure itself).
brand_bar_chart = (
    px.bar(
        brand_summary,
        x='Brand',
        y='Count',
        title="Interactive Bar Chart: Brand Distribution",
        labels={'Brand': 'Brand', 'Count': 'Count'},
        text='Count',  # write the count on each bar
    )
    .update_traces(textposition='outside')
    .update_layout(
        xaxis={"title": "Brand"},
        yaxis={"title": "Count"},
        title={"font": {"size": 20}},
        template="plotly_white",
    )
)

# Render the figure.
brand_bar_chart.show()

In [None]:
# Summary statistics, histogram and boxplot for the 'Price' column.
numerical_eda(data, 'Price')

In [None]:
# Summary statistics, histogram and boxplot for the 'Rating' column.
numerical_eda(data, 'Rating')

In [None]:
# Category counts, bar chart and pie chart for 'processor_brand'.
categorical_eda(data, 'processor_brand')

In [None]:
# Category counts, bar chart and pie chart for 'processor_tier'.
categorical_eda(data, 'processor_tier')

In [None]:
# Summary statistics, histogram and boxplot for 'num_cores'.
numerical_eda(data, 'num_cores')

In [None]:
# Summary statistics, histogram and boxplot for 'ram_memory'.
numerical_eda(data, 'ram_memory')

In [None]:
# Summary statistics, histogram and boxplot for 'primary_storage_capacity'.
numerical_eda(data, 'primary_storage_capacity')

In [None]:
# Summary statistics, histogram and boxplot for 'secondary_storage_capacity'.
numerical_eda(data, 'secondary_storage_capacity')

In [None]:
# Category counts, bar chart and pie chart for 'gpu_type'.
categorical_eda(data, 'gpu_type')

In [None]:
# Category counts, bar chart and pie chart for 'OS'.
categorical_eda(data, 'OS')

# Data Preprocessing

In [None]:
# Remove the 'index' column from the frame (presumably a leftover row counter
# from the CSV export — confirm against the raw file).
del data['index']

# Preview the first five rows after the drop.
data.head(5)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Move 'Model' out of the regular columns and use it as the row index (in place).
data.set_index(['Model'], inplace=True)

# Preview the re-indexed frame.
data.head()

In [None]:
# Define the tier mapping: each processor family found in 'processor_tier' is
# assigned a coarse performance tier; 'other' goes to the explicit 'unknown' bucket.
# NOTE(review): 'm3' is ranked 'Mid' while 'm1' and 'm2' are 'High' — confirm
# that this ordering is intended.
processor_tier_mapping = {
    'core i3': 'Entry',
    'core i5': 'Mid',
    'core i7': 'High',
    'core i9': 'Ultra High',
    'core ultra 7': 'Ultra High',
    'ryzen 3': 'Entry',
    'ryzen 5': 'Mid',
    'ryzen 7': 'High',
    'ryzen 9': 'Ultra High',
    'celeron': 'Entry',
    'pentium': 'Entry',
    'm1': 'High',
    'm2': 'High',
    'm3': 'Mid',
    'other': 'unknown',
}

# Map the processor column to its corresponding tier.
# Series.map() returns NaN for every value that is not a key of the mapping, which
# would silently turn unexpected processor names into missing data. Such values
# are sent to the mapping's own 'unknown' bucket instead and reported once.
# Values that were already missing stay NaN so the later null check still
# surfaces them.
raw_processor_tier = data['processor_tier']
mapped_processor_tier = raw_processor_tier.map(processor_tier_mapping)
unmapped_mask = mapped_processor_tier.isna() & raw_processor_tier.notna()
if unmapped_mask.any():
    print(
        "Unmapped processor_tier values set to 'unknown':",
        sorted(raw_processor_tier[unmapped_mask].astype(str).unique()),
    )
data['processor_tier'] = mapped_processor_tier.mask(unmapped_mask, 'unknown')

data.head()

In [None]:
# Replace categorical values with numerical codes. Each replacement is applied to
# the whole frame (every column), one mapping after the other and in place:
#   1. processor tiers          -> ordinal codes 0..4
#   2. storage types            -> HDD = 1, SSD = 2
#   3. absent secondary storage -> 0
tier_codes = {'unknown': 0, 'Entry': 1, 'Mid': 2, 'High': 3, 'Ultra High': 4}
storage_type_codes = {'HDD': 1, 'SSD': 2}
no_secondary_storage_code = {'No secondary storage': 0}

for value_codes in (tier_codes, storage_type_codes, no_secondary_storage_code):
    data.replace(value_codes, inplace=True)

data.head()

In [None]:
# Show how the encoded secondary storage types are distributed. The counts are
# printed explicitly: a notebook cell only auto-displays its last expression, so
# a bare value_counts() call in the middle of the cell would be discarded.
print(data['secondary_storage_type'].value_counts())

# 'No secondary storage' was already replaced by 0 by the frame-wide replace in
# the preceding cell, so that replacement is not repeated here.

data.head(5)

In [295]:
# Convert the touch-screen flag to integers (True/False -> 1/0).
# Assumes 'is_touch_screen' holds booleans — TODO confirm against the raw data.
data['is_touch_screen'] = data['is_touch_screen'].astype(int)

In [296]:
# One-hot encode the listed categorical columns; dtype=int makes the indicator
# columns 0/1 integers. The original columns are replaced by the indicators.
data = pd.get_dummies(data, columns=['brand', 'processor_brand', 'gpu_brand', 'gpu_type', 'OS'], dtype=int)

In [297]:
# Replace the 'No information' placeholder in year_of_warranty with 0, then cast
# the whole column to int.
data['year_of_warranty'] = data['year_of_warranty'].replace('No information', 0).astype(int)

In [None]:
# Inspect the column dtypes after the conversions above.
data.dtypes

In [None]:
# Re-check missing values per column after preprocessing.
data.isnull().sum()

In [300]:
# Persist the cleaned frame; the row index is written as the first CSV column.
data.to_csv('laptops_cleaned.csv')

In [None]:
# Divide the data into features (every column except 'Price') and the target ('Price').
X = data.drop(columns='Price')
y = data['Price']
# Preview the feature frame.
X.head()

In [302]:
#perfomr scaling only to the features 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [305]:
X_scaled.to_csv('laptops_scaled.csv')

# Modeling

In [266]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets: 20% of the rows are held out as
# the test set; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Split the remaining training rows again: 20% of them become the validation set,
# i.e. roughly 64% train / 16% validation / 20% test of the full dataset.
# NOTE(review): X_val / y_val are not referenced in the visible cells of this
# notebook — confirm whether the validation split is still needed.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import shap
from lime.lime_tabular import LimeTabularExplainer
import numpy as np

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Expanded parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],  # L1 regularization
    'reg_lambda': [1, 1.5, 2]  # L2 regularization
}

# Initialize XGBoost regressor
xgb_model = XGBRegressor(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,  # Increased to 5 folds for more robust evaluation
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

# Perform the grid search on the training data
grid_search.fit(X_train_scaled, y_train)

# Extract the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test_scaled)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print("Best Parameters:", best_params)
print("Best RMSE:", rmse_best)
print("Best R²:", r2_best)

# Interpret the model with SHAP
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test_scaled)

# Visualize SHAP explanations
shap.summary_plot(shap_values, X_test_scaled)

# Visualize SHAP for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0], X_test_scaled[0])

# Initialize LIME explainer
lime_explainer = LimeTabularExplainer(
    X_train_scaled,
    training_labels=y_train.to_numpy(),
    feature_names=X.columns.tolist(),
    class_names=['Price'],  # Target variable
    mode='regression'
)

# Explain a single prediction with LIME
exp = lime_explainer.explain_instance(
    X_test_scaled[0],  
    best_model.predict,  
    num_features=10
)

# Display the explanation
exp.show_in_notebook()

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import shap
from lime.lime_tabular import LimeTabularExplainer

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Parameter grid for LightGBM
param_grid = {
    'n_estimators': [100],
    'max_depth': [-1],  # -1 indicates no limit
    'learning_rate': [0.01],
    'num_leaves': [30],
    'min_data_in_leaf': [20],
    'feature_fraction': [0.8],  # Subsampling ratio of features
    'bagging_fraction': [0.8],  # Subsampling ratio of rows
    'bagging_freq': [0]  # Frequency of bagging
}

# Initialize LightGBM regressor
lgb_model = lgb.LGBMRegressor(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

# Perform the grid search on the training data
grid_search.fit(X_train_scaled, y_train)

# Extract the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test_scaled)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print("Best Parameters:", best_params)
print("Best RMSE:", rmse_best)
print("Best R²:", r2_best)

# Interpret the model with SHAP
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test_scaled)

# Visualize SHAP explanations
shap.summary_plot(shap_values, X_test_scaled)

# Visualize SHAP for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0], X_test_scaled[0])

# Initialize LIME explainer
lime_explainer = LimeTabularExplainer(
    X_train_scaled,
    training_labels=y_train.to_numpy(),
    feature_names=X.columns.tolist(),
    class_names=['Price'],  # Target variable
    mode='regression'
)

# Explain a single prediction with LIME
exp = lime_explainer.explain_instance(
    X_test_scaled[0],  
    best_model.predict,  
    num_features=10
)

# Display the explanation
exp.show_in_notebook()

In [None]:
import joblib

# Assuming xgboost_model and lgbm_model are your trained models
joblib.dump(xgb_model, 'xgb_model.pkl')
joblib.dump(lgb_model, 'lgb_model.pkl')
