### Import all the required libraries

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

#### Load the processed data

In [2]:
df = pd.read_csv('../data/processed_data.csv')
print(f'Data loaded for modeling. Shape: {df.shape}')


Data loaded for modeling. Shape: (9318, 19)


#### Feature Preparation & Scaling

In [3]:
numerical_features = ['Price', 'Rating', 'Review_Count', 'Is_Available']

categorical_features = ['Brand', 'Storage_Capacity']
TOP_N_BRANDS = 15

for col in categorical_features:
    top_values = df[col].value_counts().nlargest(TOP_N_BRANDS).index
    df[f'{col}_Grouped'] = np.where(df[col].isin(top_values), df[col], 'Other')

features_to_encode = [f'{col}_Grouped' for col in categorical_features]
df_final = pd.get_dummies(df, columns=features_to_encode, drop_first=True)

# Identify all features now in the model
all_model_features = numerical_features + [col for col in df_final.columns if 'Grouped_' in col]
X = df_final[all_model_features].copy()

# DIAGNOSIS CODE
nan_check = X.isnull().sum()
nan_check = nan_check[nan_check > 0]
print("\n--- Columns with NaN values (before Imputation) ---")
print(nan_check)

# Fix the nans by imputing with the mean of the column
for col in ['Rating', 'Review_Count']:
    if col in X.columns:
        # calculate median only
        median_val = X[col].median()
        # Impute checking value
        X[col] = X[col].fillna(median_val)
        print(f"Imputed NaN values in {col} with median: {median_val:.2f}")

# --- Final Check (should show 0 missing values) ---    
print("\n--- Final NaN Check ---")
print(X.isnull().sum().sum())

# Scale the Features (StandardScaler)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Save the scaler object
joblib.dump(scaler, '../model/scaler.joblib')
print("\nScaler saved successfully")



--- Columns with NaN values (before Imputation) ---
Series([], dtype: int64)
Imputed NaN values in Rating with median: 4.10
Imputed NaN values in Review_Count with median: 1112.00

--- Final NaN Check ---
0

Scaler saved successfully


#### Model A:- K-Means clustering

In [4]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)

# fit k-menas
df_final['Cluster_ID'] = kmeans.fit_predict(X_scaled)

print("\n--- K-Menas Clustering Distribution (K=5)---")
print(df_final['Cluster_ID'].value_counts())

# Save train model on the dataset
joblib.dump(kmeans, '../model/clustering_model.joblib')
print("\nClustering model saved successfully")


--- K-Menas Clustering Distribution (K=5)---
Cluster_ID
0    6858
1    1842
4     412
2     111
3      95
Name: count, dtype: int64

Clustering model saved successfully


#### Model B: Random Forest Classifier

In [5]:
# target var
y = df_final['Is_High_Performer']
X_classify = X_scaled_df #Same scaled
# split data
X_tarin, X_test, y_train, y_test = train_test_split(
    X_classify, y, test_size=0.3, random_state=42, stratify=y
    ) 

# Initalize and train classifier
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=44, class_weight='balanced')
rf_classifier.fit(X_tarin, y_train)

# Evaluate the model
y_pred = rf_classifier.predict(X_test)

print("\n--- Classification Model Evaluation ---")
print(f"Accuarcy: {accuracy_score(y_test, y_pred):.2f}")

# Save the trained Classification model
joblib.dump(rf_classifier, '../model/classification_model.joblib')
print("\nClassification model saved to model folder")




--- Classification Model Evaluation ---
Accuarcy: 1.00

Classification model saved to model folder


In [6]:
df_final.to_csv('../data/final_processed_data_with_cluster.csv', index=False)