In [1]:
# !pip3 install catboost
# !pip3 install xgboost
# !pip3 install lightgbm

# Load Dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("dataset/train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


# Data Preprocessing

## Handling Null Values

In [4]:
df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

## Rename Columns

In [5]:
columns_to_rename = {
    "blue": "bluetooth", 
    "fc": "front_camera",
    "int_memory": "internal_memory",
    "m_dep": "phone_depth",
    "mobile_wt": "phone_width",
    "pc": "rear_camera_megapixel",
    "px_width": "screen_width_px",
    "px_height": "screen_height_px",
    "sc_h": "screen_width_cm",
    "sc_w": "screen_height_cm",  
}

In [6]:
numeric_features = [
    "battery_power", "clock_speed", "internal_memory", "phone_depth", "phone_width", "n_cores", "rear_camera_megapixel",
    "screen_height_px", "screen_height_cm", "talk_time"
]

In [7]:
df.rename(columns=columns_to_rename, inplace=True)

In [8]:
X = df.drop(columns="price_range")
y = df["price_range"]

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)

## Feature Selection

In [11]:
from sklearn.feature_selection import f_classif, chi2

In [12]:
numeric_df = X_train[numeric_features]
f_scores, p_values = f_classif(numeric_df, y_train)

anova_results = pd.DataFrame({"features":[], "f_scores":[], "p_values":[]})

for feat, f, p in zip(numeric_df.columns, f_scores, p_values):
    # Create a DataFrame with the current feature's information
    df_to_append = pd.DataFrame({
        "features": [feat],
        "f_scores": [f],
        "p_values": [p]
    })

    # Concatenate the new DataFrame to 'anova' and update 'anova'
    anova_results = pd.concat([anova_results, df_to_append], ignore_index=True)

anova_results

Unnamed: 0,features,f_scores,p_values
0,battery_power,28.233813,8.682835e-18
1,clock_speed,0.54288,0.6529857
2,internal_memory,3.454423,0.01593761
3,phone_depth,2.257856,0.07990612
4,phone_width,2.250818,0.08065175
5,n_cores,1.440308,0.2292598
6,rear_camera_megapixel,0.623176,0.6000317
7,screen_height_px,15.063645,1.138099e-09
8,screen_height_cm,1.329323,0.2631707
9,talk_time,1.893764,0.1286509


In [13]:
non_numeric_df = X_train.drop(numeric_features, axis=1)
chi2_values, p_values = chi2(non_numeric_df, y_train)

chi2_results = pd.DataFrame({"features":[], "chi2_values":[], "p_values":[]})

for feat, c, p in zip(non_numeric_df.columns, chi2_values, p_values):
    # Create a DataFrame with the current feature's information
    df_to_append = pd.DataFrame({
        "features": [feat],
        "chi2_values": [c],
        "p_values": [p]
    })

    # Concatenate the new DataFrame to 'anova' and update 'anova'
    chi2_results = pd.concat([chi2_results, df_to_append], ignore_index=True)

chi2_results

Unnamed: 0,features,chi2_values,p_values
0,bluetooth,0.080318,0.99409
1,dual_sim,0.207032,0.976446
2,front_camera,6.855454,0.07665
3,four_g,0.652747,0.884258
4,screen_width_px,8777.986305,0.0
5,ram,748840.816315,0.0
6,screen_width_cm,4.603148,0.203272
7,three_g,0.456417,0.928357
8,touch_screen,1.51608,0.678564
9,wifi,1.385444,0.70895


In [14]:
for fa in anova_results.features:
    p = anova_results.loc[anova_results['features'] == fa, 'p_values'].values[0]
    f = anova_results.loc[anova_results['features'] == fa, 'f_scores'].values[0]

    if p < 0.05:
        continue
        
    X_train.drop(fa, axis=1, inplace=True)
    X_test.drop(fa, axis=1, inplace=True)

for fc in chi2_results.features:
    p = chi2_results.loc[chi2_results['features'] == fc, 'p_values'].values[0]

    if p < 0.05:
        continue

    X_train.drop(fc, axis=1, inplace=True)
    X_test.drop(fc, axis=1, inplace=True)

X_train

Unnamed: 0,battery_power,internal_memory,screen_height_px,screen_width_px,ram
46,1723,42,202,1791,3587
450,712,35,558,1208,2190
1942,1279,7,560,1633,1150
922,874,8,442,1248,582
461,1512,6,205,884,2335
...,...,...,...,...,...
1345,947,51,193,887,2915
581,1512,18,1079,1897,3607
121,772,10,1242,1712,3242
1262,1433,27,1619,1651,3900


In [15]:
categorical_features = [col for col in X_train.columns if col not in numeric_features]

# Fitting Model

## Model Pipeline

In [16]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [17]:
estimators = [
    ('clf', XGBClassifier(random_state=8))
]

pipe = Pipeline(steps=estimators)
pipe

## Hyperparameter Tuning and Training

In [18]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [27]:
search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0)
}

opt = BayesSearchCV(pipe, search_space, cv=3, n_iter=12, random_state=8)

In [28]:
opt.fit(X_train, y_train)

In [29]:
opt.best_score_

0.8975049949289465

In [30]:
opt.best_estimator_

# Model Evaluation

In [31]:
 from sklearn.metrics import roc_auc_score  
 from sklearn.metrics import roc_curve,auc  
 from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [32]:
y_pred = opt.predict(X_test)

In [33]:
log_reg_accuracy = accuracy_score(y_pred, y_test)  
print(f"Model accuracy: {log_reg_accuracy * 100}%")

Model accuracy: 90.5%


In [34]:
 print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.96      0.95      0.95        94
           1       0.89      0.88      0.88        97
           2       0.85      0.88      0.86       105
           3       0.93      0.92      0.93       104

    accuracy                           0.91       400
   macro avg       0.91      0.91      0.91       400
weighted avg       0.91      0.91      0.91       400

