<a href="https://www.kaggle.com/code/anthonynam/mobile-price-prediction-sklearn-gradient-boosting?scriptVersionId=140360955" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Objective of this Jupyter Notebook
1. Apply gradient boosting on a simple dataset  
2. Then try to clean it and practice a bit on how to clean a simple dataset

# Install Library

In [1]:
!pip install scikit-learn



In [2]:
!pip install numpy==1.22.4

Collecting numpy==1.22.4
  Downloading numpy-1.22.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.6 which is incompatible.
chex 0.1.81 requires numpy>=1.25.0, but you have numpy 1.22.4 which is incompatible.
momepy 0.6.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.22.4 which is incompatible.
pymc3

# Library Installation

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score,f1_score
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,GridSearchCV



# Loading Dataset

In [4]:
ds = pd.read_csv("/kaggle/input/mobile-price-classification/train.csv")
ds.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


# Split Data into training, validation and testing set

In [5]:
X = ds.loc[:,ds.columns != "price_range"]
y = ds.loc[:,ds.columns == "price_range"]

X_train_all, X_test,y_train_all,y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)
X_train, X_val, y_train,y_val = train_test_split(X_train_all,y_train_all, test_size = 0.2, random_state = 42)

print(f"X_train_all shape: {X_train_all.shape}")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print("")
print(f"y_train_all shape: {y_train_all.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_all shape: (1600, 20)
X_train shape: (1280, 20)
X_val shape: (320, 20)
X_test shape: (400, 20)

y_train_all shape: (1600, 1)
y_train shape: (1280, 1)
y_val shape: (320, 1)
y_test shape: (400, 1)


# Apply scikit-learn Gradient Boosting Classifier (Without Cross-validation)

In [6]:
# Model Training and Fitting
sk_gbc = GradientBoostingClassifier()
sk_gbc.fit(X_train,y_train.values.ravel())

In [7]:
# Valiation
y_val_pred = sk_gbc.predict(X_val)

# Model performance evaluation
print(f"Accuracy Score (Validation Set): {accuracy_score(y_val,y_val_pred)}")
print(f"Weighted F1 Score (Validation Set): {f1_score(y_val,y_val_pred,average = 'weighted')}")

Accuracy Score (Validation Set): 0.90625
Weighted F1 Score (Validation Set): 0.9057706359678637


In [8]:
# Test with unseen data
y_test_pred = sk_gbc.predict(X_test)

# Model performance evaluation
print(f"Accuracy Score (Test Set): {accuracy_score(y_test,y_test_pred)}")
print(f"Weighted F1 Score (Test Set): {f1_score(y_test,y_test_pred,average = 'weighted')}")

Accuracy Score (Test Set): 0.89
Weighted F1 Score (Test Set): 0.8908449074074073


# Apply scikit-learn Gradient Boosting Classifier (With Cross-validation)

In [9]:
# 5-fold cross validation
cv = RepeatedStratifiedKFold(n_splits = 5,n_repeats = 3, random_state = 42)
sk_gbc = GradientBoostingClassifier(learning_rate = 0.15)
scores = cross_val_score(sk_gbc, X_train_all, y_train_all.values.ravel(), cv=cv,scoring = "accuracy")
print(f"10-fold Accuracy: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")
print(f"Stdev Accuracy: {np.std(scores)}")

# Fit the model the all training data
sk_gbc.fit(X_train_all,y_train_all.values.ravel())

10-fold Accuracy: [0.909375 0.878125 0.925    0.903125 0.88125  0.89375  0.890625 0.890625
 0.915625 0.90625  0.9      0.9      0.884375 0.89375  0.90625 ]
Mean Accuracy: 0.8985416666666668
Stdev Accuracy: 0.012493053625470794


In [10]:
# Test with unseen data
y_test_pred = sk_gbc.predict(X_test)

# Model performance evaluation
print(f"Accuracy Score (Test Set): {accuracy_score(y_test,y_test_pred)}")
print(f"Weighted F1 Score (Test Set): {f1_score(y_test,y_test_pred,average = 'weighted')}")

Accuracy Score (Test Set): 0.9125
Weighted F1 Score (Test Set): 0.9126172248803829


# Apply scikit-learn Gradient Boosting Classifier (With Grid Search + Cross Validation)

In [11]:
# parameters 
parameters = {'n_estimators':[10,20,50,100,150,200], 'learning_rate':[0.01,0.02,0.5,0.1,0.2]}

# 5-fold cross validation
sk_gbc = GradientBoostingClassifier()
sk_gbc = GridSearchCV(estimator = sk_gbc,param_grid = parameters,cv = 10,verbose = 1,n_jobs = -1)

# Fit the model the all training data
sk_gbc.fit(X_train_all,y_train_all.values.ravel())

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [12]:
sk_gbc.cv_results_

{'mean_fit_time': array([0.32561264, 0.64453959, 1.69156122, 3.19923525, 4.99381893,
        6.38003163, 0.32358923, 0.6517658 , 1.59159343, 3.41453457,
        4.78895161, 6.44618526, 0.32278435, 0.77516761, 1.60216277,
        3.19639001, 4.8242173 , 6.56721406, 0.32589643, 0.64065893,
        1.61655126, 3.21342161, 5.01902506, 6.37441664, 0.32168984,
        0.65113189, 1.58903453, 3.3791012 , 4.74757113, 6.12363615]),
 'std_fit_time': array([0.00550575, 0.00703472, 0.08792968, 0.02796037, 0.17553204,
        0.02595027, 0.00257632, 0.02698492, 0.00967089, 0.23339721,
        0.03025674, 0.12163097, 0.00207786, 0.09554295, 0.03581871,
        0.02987205, 0.03046494, 0.24095859, 0.00202248, 0.00502493,
        0.03078551, 0.03675885, 0.24712564, 0.04368782, 0.00243679,
        0.01221161, 0.00668817, 0.28434571, 0.03444188, 0.35457076]),
 'mean_score_time': array([0.00427625, 0.00439594, 0.00490437, 0.00594091, 0.0080245 ,
        0.00811381, 0.00421588, 0.00442777, 0.00501819, 0.00

In [13]:
print(sk_gbc.best_estimator_)
print(sk_gbc.best_score_)
print(sk_gbc.best_params_)

GradientBoostingClassifier(learning_rate=0.5, n_estimators=150)
0.905
{'learning_rate': 0.5, 'n_estimators': 150}


In [14]:
# Test with unseen data
y_test_pred = sk_gbc.predict(X_test)

# Model performance evaluation
print(f"Accuracy Score (Test Set): {accuracy_score(y_test,y_test_pred)}")
print(f"Weighted F1 Score (Test Set): {f1_score(y_test,y_test_pred,average = 'weighted')}")


Accuracy Score (Test Set): 0.9075
Weighted F1 Score (Test Set): 0.9076593373731532


In [15]:
y_test_proba = sk_gbc.predict_log_proba(X_test)
# print(y_test_proba)

  return np.log(proba)


In [16]:
### Conclusion, performance around 90%

# Try to Clean Data and obtain higher accuracy

In [17]:
clean_ds = pd.read_csv("/kaggle/input/mobile-price-classification/train.csv")
clean_ds.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
