# **Backpack Price Prediction**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

##  **Import Libraries**

We begin by importing the necessary libraries for data manipulation, visualization, and statistical analysis.

In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
import math
from IPython.display import display  
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# **Load the training csv dataset**
We load the training, test, and extra-training CSV files into Pandas DataFrames.

In [None]:
# Read the three competition CSVs (train, test, extra training rows) into DataFrames.
train_data, test_data, extra_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e2/train.csv',
        '/kaggle/input/playground-series-s5e2/test.csv',
        '/kaggle/input/playground-series-s5e2/training_extra.csv',
    )
)

# **Overview Dataset**
The dataset includes the following features:


* Brand: The brand of the bag (Nike,Under Armour).
* Material: The primary material used (Leather,Polyester,Canvas).
* Size: The size of the bag (Small,Medium,Large).
* Compartments: The number of compartments in the bag (pockets).
* Laptop Compartment: Whether a laptop compartment is available (Yes/No).
* Waterproof: waterproof (Yes/No).
* Style: The style of the bag (Backpack, Messenger, Tote).
* Color: The color of the bag.
* Weight Capacity (kg): The maximum weight capacity.
* Price: The price of the bag in USD.

In [None]:
# Stack the main and extra training rows into one frame with a fresh 0..n-1 index.
training_frames = (train_data, extra_data)
data = pd.concat(training_frames, ignore_index=True)

In [None]:
# Print the column dtypes and non-null counts of the combined training frame.
data.info()

**Note: the dataset also contains null values.**

In [None]:
# Preview the first rows of the combined training frame.
data.head()

In [None]:
# Drop the 'id' column as it is not useful for training.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError because 'id' is already gone.
data = data.drop(columns=["id"], errors="ignore")

In [None]:
# Re-check the column list, dtypes and non-null counts now that 'id' has been dropped.
data.info()

#  **Checking Duplicate Rows** 
Check for duplicate rows in the dataset (the cell below only counts them; no rows are removed here).

In [None]:
# Flag rows that are exact duplicates of an earlier row, then count them.
# Series.sum() is vectorized; the builtin sum() would iterate over every row in Python,
# which is needlessly slow on a frame of this size.
duplicated_rows = data.duplicated()
duplicated_rows.sum()

# **Checking Missing Values and filling the records**
Handles missing values efficiently. Ensures numerical consistency (integer vs. float). Encodes categorical data for compatibility with ML models.

In [None]:
 # Count the missing values in each column
data.isna().sum()

In [None]:
# Fill missing values and convert column types

# Binary-encode the Yes/No columns: "Yes" -> 1; "No", missing, or any other value -> 0.
# Values that are already 0/1 map to themselves, so re-running this cell does not
# reset the encoded columns to all zeros (the previous `x == "Yes"` lambda did).
binary_map = {"Yes": 1, "No": 0, 1: 1, 0: 0}
for col in ["Laptop Compartment", "Waterproof"]:
    data[col] = data[col].map(binary_map).fillna(0).astype(int)

# Convert decimal (float) values to integers
data['Compartments'] = data['Compartments'].astype(int)

# Fill missing values in the categorical columns with the explicit label 'Unknown'
categories = ["Brand", "Material", "Size", "Style", "Color"]
present_categories = [col for col in categories if col in data.columns]  # skip absent columns
data[present_categories] = data[present_categories].fillna("Unknown")

# Fill missing Weight Capacity (kg) values with the column median
weight_col = "Weight Capacity (kg)"
data[weight_col] = data[weight_col].fillna(data[weight_col].median())

data.head()

# **Data Visualization in Plot** 
Graphical representation of data to help identify patterns, trends, and insights more easily

In [None]:
def plot_feature_distributions(data, target='Price', n_cols=3, categorical_override=('Compartments',)):
    """Plot the distribution of the target and of every other column on one grid.

    The first panel is a histogram (with KDE) of ``target``. Each remaining column
    gets a bar chart of value counts when it is treated as categorical, otherwise
    a histogram with KDE.

    A column is treated as categorical when it is listed in ``categorical_override``,
    is not numeric, or has at most two distinct values. The last rule makes 0/1
    encoded flags render as bars instead of a two-spike histogram.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and feature columns.
    target : str, default 'Price'
        Name of the target column; plotted first and excluded from the features.
    n_cols : int, default 3
        Number of panels per row.
    categorical_override : iterable of str, default ('Compartments',)
        Columns to plot as categorical regardless of dtype.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    color = "#72bcd4"  # Light blue used for every panel
    forced_categorical = set(categorical_override or ())

    features = [col for col in data.columns if col != target]

    # One extra panel is reserved for the target histogram
    n_rows = int(np.ceil((len(features) + 1) / n_cols))

    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 6, n_rows * 5), squeeze=False)
    axes = axes.flatten()

    # Target distribution
    sns.histplot(data=data, x=target, kde=True, ax=axes[0], color=color)
    axes[0].set_title(f"Distribution of {target}", fontsize=14, fontweight='bold')
    axes[0].set_xlabel(target, fontsize=12)
    axes[0].set_ylabel("Count", fontsize=12)
    axes[0].grid(True, linestyle='--', alpha=0.7)

    # Feature distributions
    for idx, col in enumerate(features, start=1):
        ax = axes[idx]
        is_categorical = (
            col in forced_categorical
            or not pd.api.types.is_numeric_dtype(data[col])
            or data[col].nunique() <= 2
        )

        if is_categorical:
            value_counts = data[col].value_counts()

            # Show binary columns in a fixed "positive first" order (1, 0 / Yes, No).
            # The order must be passed to seaborn explicitly, otherwise numeric
            # categories are re-sorted ascending and the reordering is lost.
            labels = set(value_counts.index)
            order = None
            if labels == {1, 0} or labels == {"Yes", "No"}:
                value_counts = value_counts.sort_index(ascending=False)
                order = list(value_counts.index)

            sns.barplot(x=value_counts.index, y=value_counts.values, order=order, ax=ax, color=color)

            ax.set_title(f"Distribution of {col}", fontsize=14, fontweight='bold')
            # tick_params restyles the labels without re-setting them, which avoids
            # matplotlib's fixed-locator warning from set_xticklabels(get_xticklabels())
            ax.tick_params(axis='x', labelrotation=0, labelsize=12)
            ax.set_xlim(-0.5, len(value_counts) - 0.5)
        else:
            sns.histplot(data=data, x=col, kde=True, ax=ax, color=color)
            ax.set_title(f"Distribution of {col}", fontsize=14, fontweight='bold')
            ax.grid(True, linestyle='--', alpha=0.7)

        # Apply consistent labels
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel("Count", fontsize=12)

    # Remove the unused panels at the end of the grid
    for unused_ax in axes[len(features) + 1:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Usage
plot_feature_distributions(data)


## **Correlation Heatmap**
One-hot encoding was applied to categorical features, converting them into numerical format. A correlation heatmap was then generated to analyze relationships between features. This helps identify key associations and potential multicollinearity, aiding in better feature selection and data interpretation.

In [None]:
# One-hot encode the categorical columns (keeping every level) so they can be correlated
encoded_data = pd.get_dummies(data, drop_first=False)

# Pairwise correlation between all encoded columns
correlation_data = encoded_data.corr()

# Draw the annotated correlation heatmap
heatmap_style = dict(
    cbar=True,
    square=True,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
    cmap='Blues',
)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_data, **heatmap_style)
plt.title('Correlation Heatmap', fontsize=16)
plt.show()


# Detect Outliers
Outlier detection is the process of identifying data points that differ significantly from the rest of the dataset.

In [None]:
# Names of all numeric columns
numeric_features = data.select_dtypes(include=['number']).columns

# One box plot per numeric column, each on its own subplot of a 5x6 grid
numeric_frame = data[numeric_features]
numeric_frame.plot.box(subplots=True, layout=(5, 6), figsize=(12, 10))
plt.show()

# **Statistical Summary**
A statistical summary provides insights into the distribution and characteristics of numerical and categorical data in a dataset.

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row

numerical_features = data.select_dtypes(include=[np.number])
data.describe(include=[np.number]).T

In [None]:
# Summary statistics of the categorical (object dtype) columns

# Object-dtype subset of the frame; note the describe() call below works on `data`
# directly and does not use this variable.
categorial_features = data.select_dtypes(include=object)
data.describe(include=object)

In [None]:
# One bar chart per categorical column: mean Price of each level, highest first
for category_col in categories:
    price_by_level = data.groupby(category_col)["Price"].mean()
    avg_price = price_by_level.sort_values(ascending=False)

    plt.figure(figsize=(6, 4))
    ax = avg_price.plot(kind="bar", color='skyblue')
    ax.set_title(f"Average Price by {category_col}", fontsize=14)
    plt.xticks(rotation=90)
    ax.set_xlabel(category_col, fontsize=12)
    ax.set_ylabel('Average Price', fontsize=12)
    plt.show()


In [None]:
# List of non-categorical columns
non_categorical_cols = ["Compartments", "Laptop Compartment", "Waterproof", "Weight Capacity (kg)"]

# For each non-categorical feature, visualize its relationship with Price
for col in non_categorical_cols:
    plt.figure(figsize=(8, 6))

    # The Yes/No flags were encoded as integers, so a dtype check cannot tell them
    # apart from real numeric features (it always chose the scatter plot). Use the
    # number of distinct values instead: more than two -> scatter, otherwise box plot.
    is_continuous = pd.api.types.is_numeric_dtype(data[col]) and data[col].nunique() > 2

    if is_continuous:
        # Scatter plot for numeric variables with many values
        sns.scatterplot(x=data[col], y=data['Price'], color='skyblue')
        title = f"Price vs {col}"
    else:
        # Box plot for binary/categorical features
        sns.boxplot(x=data[col], y=data['Price'], color='skyblue')
        title = f"Price distribution by {col}"

    plt.title(title, fontsize=14)
    plt.xlabel(col, fontsize=12)
    plt.ylabel("Price", fontsize=12)
    plt.show()


In [None]:
# Correlation matrix of the non-categorical features together with 'Price'
correlations = data[[*non_categorical_cols, 'Price']].corr()

# Show each column's correlation with Price, strongest positive first
price_correlation = correlations['Price'].sort_values(ascending=False)
print(price_correlation)


In [None]:
# Numeric columns to inspect, leaving out the ones listed in exclude_columns
exclude_columns = ['Compartments']
numerical_features = [
    col
    for col in data.select_dtypes(include=[np.number]).columns
    if col not in exclude_columns
]

# Skewness of each remaining numeric column, most positive first
skew_newfeatures = data[numerical_features].skew().sort_values(ascending=False)

# Columns whose absolute skewness exceeds this limit are reported
skew_limit = 0.75

# Columns with exactly two distinct values are treated as binary and skipped
binary_cols = [col for col in numerical_features if data[col].nunique() == 2]

# Drop the binary columns, then keep only the rows beyond the skewness limit
skew_cols = skew_newfeatures.drop(index=binary_cols).to_frame(name='Skew')
skew_cols = skew_cols[skew_cols['Skew'].abs() > skew_limit]

print(skew_cols)

# Encoding Categorical Columns 
One-hot encoding converts each categorical column into a set of binary indicator columns, one per category (for example, `Size` becomes `Size_Small`, `Size_Medium`, `Size_Large`, and `Size_Unknown`). Unlike ordinal encoding, which suits categories that have a natural order or ranking (e.g., "Low", "Medium", "High") even though the distance between them is not necessarily uniform or measurable, one-hot encoding does not impose any order on the categories.

In [None]:
# One-hot encode the categorical columns, emitting the indicator columns as 0/1 integers.
# Passing dtype=int to get_dummies converts only the new indicator columns; the previous
# frame-wide astype(int) also truncated the continuous "Weight Capacity (kg)" feature
# and the "Price" target to whole numbers.
data = pd.get_dummies(data, dtype=int)
data.head()


# Split the dataset into training and validation sets

In [None]:
# Define features (X) and target variable (y)
X = data.drop(columns=['Price'])  # Features (excluding Price)
y = data['Price']

# Split first (80% train, 20% validation) with a fixed random_state, so the
# validation rows stay unseen while the scaler is being fitted.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the
# validation rows. Fitting on the full data would leak validation statistics.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val_raw), columns=X.columns, index=X_val_raw.index
)

# Check the shape of the splits
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)

# **Testing Models**
Testing different models helps find the best one for your data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize models. Every estimator with a random component gets the same fixed
# seed so the comparison is reproducible across runs.
models = {
    "Linear Regression": LinearRegression(),
    # n_jobs=-1 builds the trees on all cores; with a fixed seed the result is unchanged
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
}

# Train each model on the training split and score it on the validation split
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # Validation metrics: lower RMSE/MAE is better, R² closer to 1 is better
    metrics = {
        "Model": model_name,
        "RMSE": np.sqrt(mean_squared_error(y_val, preds)),
        "MAE": mean_absolute_error(y_val, preds),
        "R² Score": r2_score(y_val, preds),
    }
    print(metrics)  # progress output while the slower models are still training

    results.append(metrics)

# Display results, best (lowest RMSE) first
results_df = pd.DataFrame(results).sort_values(by="RMSE")
display(results_df)

**Analysis of Metrics:**

* RMSE (Root Mean Squared Error) → Lower is better
* MAE (Mean Absolute Error) → Lower is better
* R² Score → Closer to 1 is better




# Gradient Boosting ✅ (Best solution)


# **Create Model**

In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Train the chosen model and persist it to disk
def model_accc():
    """Fit a GradientBoostingRegressor on the training split and save it with joblib.

    Reads ``X_train`` and ``y_train`` from the notebook namespace and writes the
    fitted model to ``gradient_boosting_model.pkl`` in the working directory.
    """
    # fit() returns the estimator itself, so construction and training can be chained
    fitted_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

    joblib.dump(fitted_model, "gradient_boosting_model.pkl")
    print("Model saved successfully!")

# Run the training and saving step
model_accc()

In [None]:

print(X_train.shape)  # (n_samples, n_features) of the training split
print(X_train.columns)  # Feature names, in the column order the model was fitted on


In [None]:
# Load the saved model
loaded_model = joblib.load("gradient_boosting_model.pkl")

# Take the feature layout from the training frame instead of a hand-copied list,
# so the sample always matches the columns the model was fitted on.
feature_names = list(X_train.columns)

# Create a DataFrame for a new sample: 3 compartments, every other feature set to 0
sample = dict.fromkeys(feature_names, 0)
sample["Compartments"] = 3
new_data_df = pd.DataFrame([sample], columns=feature_names)

# The model was trained on standardized features, so the raw sample has to go
# through the same fitted scaler before prediction.
new_data_scaled = pd.DataFrame(scaler.transform(new_data_df), columns=feature_names)

# Make a prediction
predicted_price = loaded_model.predict(new_data_scaled)
print("Predicted Backpack Price:", predicted_price[0])

# Submit

In [None]:
# Build the submission from the competition test set. The previous version wrote a
# single row holding the prediction for the hand-made sample, not one row per test id.

# Apply the same cleaning steps that were used on the training data
test_features = test_data.drop(columns=["id"])
for flag_col in ["Laptop Compartment", "Waterproof"]:
    test_features[flag_col] = test_features[flag_col].eq("Yes").astype(int)
test_features[categories] = test_features[categories].fillna("Unknown")
for numeric_col in ["Compartments", "Weight Capacity (kg)"]:
    # Fill gaps with the training median so no test-set statistic is used
    test_features[numeric_col] = test_features[numeric_col].fillna(data[numeric_col].median())

# One-hot encode, then align to the training layout: same columns, order and dtypes.
# A category level absent from the test set becomes an all-zero column.
test_features = (
    pd.get_dummies(test_features)
    .reindex(columns=X.columns, fill_value=0)
    .astype(X.dtypes.to_dict())
)

# Standardize with the scaler fitted during training, then predict every test row
test_scaled = pd.DataFrame(
    scaler.transform(test_features), columns=X.columns, index=test_features.index
)
test_predictions = loaded_model.predict(test_scaled)

submission = pd.DataFrame({'id': test_data['id'], 'Price': test_predictions})
submission.to_csv('submission.csv', index=False)
display(submission)