In [18]:
# !pip3 install catboost
!pip3 install xgboost
# !pip3 install lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import f_classif, chi2

# Load Dataset

In [2]:
# Read the training split from disk and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="dataset/train.csv")
train_data.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# NOTE(review): this reads "dataset/train.csv" again, so `test_data` is an exact
# copy of the training data and any score computed on it measures training fit,
# not generalisation. Point this at the real hold-out file (or split train_data)
# once it is confirmed that the hold-out file carries a `price_range` column.
test_data = pd.read_csv("dataset/train.csv")
test_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


# Data Preprocessing

## Handling Null Values

In [4]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
train_data.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

## Rename Columns

In [5]:
# Mapping from the terse column names in the CSV to more descriptive ones.
# Columns that are not listed here keep their original name.
columns_to_rename = {
    "blue": "bluetooth", 
    "fc": "front_camera",
    "int_memory": "internal_memory",
    "m_dep": "phone_depth",
    # NOTE(review): "mobile_wt" looks like a weight, not a width — confirm against
    # the dataset description before trusting the name "phone_width".
    "mobile_wt": "phone_width",
    "pc": "rear_camera_megapixel",
    "px_width": "screen_width_px",
    "px_height": "screen_height_px",
    # NOTE(review): sc_h -> "screen_width_cm" and sc_w -> "screen_height_cm" look
    # swapped compared with px_width / px_height above — confirm. The new names
    # are used by other cells, so rename them everywhere at once if corrected.
    "sc_h": "screen_width_cm",
    "sc_w": "screen_height_cm",  
}

In [6]:
# Columns (post-rename names) that are scored with the ANOVA F-test.
# Every feature that is NOT listed here is scored with the chi-squared test.
numeric_features = [
    "battery_power",
    "clock_speed",
    "internal_memory",
    "phone_depth",
    "phone_width",
    "n_cores",
    "rear_camera_megapixel",
    "screen_height_px",
    "screen_height_cm",
    "talk_time",
]

In [7]:
# Apply the descriptive column names to the training frame in place;
# names without an entry in the mapping are left unchanged.
train_data.columns = [columns_to_rename.get(label, label) for label in train_data.columns]

In [8]:
# Feature matrix: every column except the target. Selecting by label instead of
# by position stays correct if the column order ever changes, and `drop`
# returns a new frame, so pruning columns from X_train later cannot affect
# train_data.
X_train = train_data.drop(columns="price_range")
X_train.head()

Unnamed: 0,battery_power,bluetooth,clock_speed,dual_sim,front_camera,four_g,internal_memory,phone_depth,phone_width,n_cores,rear_camera_megapixel,screen_height_px,screen_width_px,ram,screen_width_cm,screen_height_cm,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [9]:
# Target vector, selected by name rather than by "last column" so that a
# change in column order cannot silently swap in a different column.
y_train = train_data["price_range"]
y_train.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

In [10]:
# Apply the descriptive column names to the test frame in place;
# names without an entry in the mapping are left unchanged.
test_data.columns = [columns_to_rename.get(label, label) for label in test_data.columns]

In [11]:
# Feature matrix for evaluation: every column except the target. Selecting by
# label fails loudly (KeyError) if the frame has no `price_range` column,
# instead of silently discarding whichever column happens to be last.
X_test = test_data.drop(columns="price_range")
X_test.head()

Unnamed: 0,battery_power,bluetooth,clock_speed,dual_sim,front_camera,four_g,internal_memory,phone_depth,phone_width,n_cores,rear_camera_megapixel,screen_height_px,screen_width_px,ram,screen_width_cm,screen_height_cm,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0


In [12]:
# Evaluation target, selected by name so that a frame without labels raises
# a KeyError rather than returning an unrelated last column.
y_test = test_data["price_range"]
y_test.head()

0    1
1    2
2    2
3    2
4    1
Name: price_range, dtype: int64

## Feature Selection

In [13]:
# ANOVA F-test of every numeric feature against the target class.
numeric_df = X_train[numeric_features]
f_scores, p_values = f_classif(numeric_df, y_train)

# f_classif returns one score and one p-value per input column, in column
# order, so the result table can be built in a single step instead of being
# grown row by row with pd.concat.
anova_results = pd.DataFrame({
    "features": numeric_df.columns,
    "f_scores": f_scores,
    "p_values": p_values,
})

anova_results

Unnamed: 0,features,f_scores,p_values
0,battery_power,31.598158,5.948688e-20
1,clock_speed,0.493708,0.6866752
2,internal_memory,2.922996,0.03277694
3,phone_depth,1.500682,0.2124595
4,phone_width,3.594318,0.01311739
5,n_cores,2.625415,0.04893585
6,rear_camera_megapixel,0.825446,0.4797489
7,screen_height_px,19.484842,1.886085e-12
8,screen_height_cm,1.671,0.1712146
9,talk_time,1.628811,0.1806686


In [14]:
# Chi-squared test of every feature that is not in `numeric_features`.
# NOTE(review): this group also contains wide-range columns such as `ram` and
# `screen_width_px` (they are simply absent from numeric_features); their very
# large chi2 statistics reflect their scale — confirm chi2 is the intended test.
non_numeric_df = X_train.drop(numeric_features, axis=1)
chi2_values, p_values = chi2(non_numeric_df, y_train)

# chi2 returns one statistic and one p-value per input column, in column order,
# so the result table is built in one step rather than row by row.
chi2_results = pd.DataFrame({
    "features": non_numeric_df.columns,
    "chi2_values": chi2_values,
    "p_values": p_values,
})

chi2_results

Unnamed: 0,features,chi2_values,p_values
0,bluetooth,0.723232,0.867726
1,dual_sim,0.631011,0.889298
2,front_camera,10.135166,0.017451
3,four_g,1.521572,0.6773
4,screen_width_px,9810.58675,0.0
5,ram,931267.519053,0.0
6,screen_width_cm,9.614878,0.02214
7,three_g,0.327643,0.954748
8,touch_screen,1.928429,0.587394
9,wifi,0.422091,0.935642


In [15]:
# Remove every feature whose statistical test was not significant.
ALPHA = 0.05  # significance threshold applied to both the ANOVA and chi2 p-values

# Stack the p-values from both tests into one table: feature -> p-value.
p_value_table = pd.concat(
    [
        anova_results[["features", "p_values"]],
        chi2_results[["features", "p_values"]],
    ],
    ignore_index=True,
)

# `~(p < ALPHA)` rather than `p >= ALPHA`: a NaN p-value is also treated as
# "not significant", which is what the original `if p < 0.05: continue` did.
not_significant = ~(p_value_table["p_values"] < ALPHA)
columns_to_drop = p_value_table.loc[not_significant, "features"].tolist()

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so the cell can be re-run without raising KeyError.
X_train = X_train.drop(columns=columns_to_drop, errors="ignore")
X_test = X_test.drop(columns=columns_to_drop, errors="ignore")

X_train

Unnamed: 0,battery_power,front_camera,internal_memory,phone_width,n_cores,screen_height_px,screen_width_px,ram,screen_width_cm
0,842,1,7,188,2,20,756,2549,9
1,1021,0,53,136,3,905,1988,2631,17
2,563,2,41,145,5,1263,1716,2603,11
3,615,0,10,131,6,1216,1786,2769,16
4,1821,13,44,141,2,1208,1212,1411,8
...,...,...,...,...,...,...,...,...,...
1995,794,0,2,106,6,1222,1890,668,13
1996,1965,0,39,187,4,915,1965,2032,11
1997,1911,1,36,108,8,868,1632,3057,9
1998,1512,4,46,145,5,336,670,869,18


In [16]:
# Surviving columns that were not declared numeric, in X_train column order.
_numeric_lookup = set(numeric_features)
categorical_features = [name for name in X_train.columns if name not in _numeric_lookup]

# Fitting Model

## Model Pipeline

In [23]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [24]:
# Single-step pipeline: the XGBoost classifier is registered under the step
# name 'clf', with a fixed random_state.
xgb_step = ('clf', XGBClassifier(random_state=8))
estimators = [xgb_step]

pipe = Pipeline(steps=estimators)
pipe

In [25]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [28]:
# Hyper-parameter ranges explored by the Bayesian search. Keys use the
# `<step>__<param>` convention to reach the classifier inside the pipeline.
search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    # XGBoost's column-subsampling parameters are bytree / bylevel / bynode;
    # the previous key 'colsample_byleaf' is not a parameter XGBoost recognises,
    # so that search dimension had no effect on the model.
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
}

# 'roc_auc_ovr' (one-vs-rest) is used because plain 'roc_auc' does not support
# a multiclass target; it gives the same result as 'roc_auc' for a binary one.
# NOTE(review): price_range appears to have more than two classes — confirm.
opt = BayesSearchCV(pipe, search_space, cv=3, n_iter=8, scoring='roc_auc_ovr', random_state=8)

In [29]:
# Run the hyper-parameter search on the pruned training features.
# NOTE(review): the recorded output below is an AttributeError about the removed
# `np.int` alias; no code in this notebook uses `np.int`, so it presumably
# originates in an installed dependency (likely an old scikit-optimize) — confirm
# and upgrade that package or pin NumPy accordingly.
opt.fit(X_train, y_train)

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations