In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/mobile-price-classification/train.csv
/kaggle/input/mobile-price-classification/test.csv


## Install Library

In [2]:
!pip install scikit-learn



## import library

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score,f1_score
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold



# Load Dataset

In [4]:
train_ds = pd.read_csv("/kaggle/input/mobile-price-classification/train.csv")
train_ds.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
test_ds = pd.read_csv("/kaggle/input/mobile-price-classification/test.csv")
test_ds.head(5)

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


# Pre-training Statistical Analysis

In [6]:
# Check statistics of price range
print(train_ds[["price_range"]].describe())

       price_range
count  2000.000000
mean      1.500000
std       1.118314
min       0.000000
25%       0.750000
50%       1.500000
75%       2.250000
max       3.000000


In [7]:
# Check distribution of result
print(train_ds["price_range"].value_counts())

# If the labels are imbalanced, we can use balanced_accuracy_score in sklearn_metrics

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64


## Prepare data

In [8]:
X = train_ds.loc[:,train_ds.columns != "price_range"]
y = train_ds.loc[:,train_ds.columns == "price_range"]

X_train_all, X_test,y_train_all,y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)
X_train, X_val, y_train,y_val = train_test_split(X_train_all,y_train_all, test_size = 0.2, random_state = 42)

print(f"X_train_all shape: {X_train_all.shape}")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print("")
print(f"y_train_all shape: {y_train_all.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_all shape: (1600, 20)
X_train shape: (1280, 20)
X_val shape: (320, 20)
X_test shape: (400, 20)

y_train_all shape: (1600, 1)
y_train shape: (1280, 1)
y_val shape: (320, 1)
y_test shape: (400, 1)


## scikit-learn Gradient Boosting Classifer (Without Cross-validation)

In [9]:
# Model Training and Fitting
sk_gbr = GradientBoostingClassifier()
sk_gbr.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [10]:
# Valiation
y_val_pred = sk_gbr.predict(X_val)

# Model performance evaluation
print(f"Accuracy Score (Validation Set): {accuracy_score(y_val,y_val_pred)}")
print(f"Weighted F1 Score (Validation Set): {f1_score(y_val,y_val_pred,average = 'weighted')}")

Accuracy Score (Validation Set): 0.90625
Weighted F1 Score (Validation Set): 0.9057706359678637


In [11]:
# Test with unseen data
y_test_pred = sk_gbr.predict(X_test)

# Model performance evaluation
print(f"Accuracy Score (Test Set): {accuracy_score(y_test,y_test_pred)}")
print(f"Weighted F1 Score (Test Set): {f1_score(y_test,y_test_pred,average = 'weighted')}")

Accuracy Score (Test Set): 0.89
Weighted F1 Score (Test Set): 0.8908449074074073


## scikit-learn Gradient Boosting Classifer (With Cross-validation)

In [12]:
# 5-fold cross validation
cv = RepeatedStratifiedKFold(n_splits = 5,n_repeats = 3, random_state = 42)
sk_gbr = GradientBoostingClassifier(learning_rate = 0.15)
scores = cross_val_score(sk_gbr, X_train_all, y_train_all.values.ravel(), cv=cv,scoring = "accuracy")
print(f"10-fold Accuracy: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")
print(f"Stdev Accuracy: {np.std(scores)}")

# Fit the model the all training data
sk_gbr.fit(X_train_all,y_train_all)

10-fold Accuracy: [0.909375 0.878125 0.925    0.90625  0.88125  0.896875 0.890625 0.890625
 0.915625 0.90625  0.9      0.9      0.884375 0.890625 0.903125]
Mean Accuracy: 0.8985416666666667
Stdev Accuracy: 0.012545057681103837


  y = column_or_1d(y, warn=True)


In [13]:
# Test with unseen data
y_test_pred = sk_gbr.predict(X_test)

# Model performance evaluation
print(f"Accuracy Score (Test Set): {accuracy_score(y_test,y_test_pred)}")
print(f"Weighted F1 Score (Test Set): {f1_score(y_test,y_test_pred,average = 'weighted')}")

Accuracy Score (Test Set): 0.9125
Weighted F1 Score (Test Set): 0.9126172248803829
