In [37]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
sns.set_style("dark")
sns.set(rc={'figure.figsize':(12,8)})
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn import linear_model, naive_bayes, neighbors, svm, tree, ensemble
from xgboost import XGBClassifier
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [16]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [17]:
# Collapse the target into two classes: price_range values 1, 2 and 3 are relabelled as 4,
# while 0 is left unchanged. The column is overwritten in `df`, so the original labels
# are no longer available from this frame afterwards.
df.loc[df['price_range'].isin([1, 2, 3]), 'price_range'] = 4
df['price_range'].unique()

array([4, 0], dtype=int64)

In [4]:
def convert_types(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    '''
    Converts categorical variables to their appropriate type
    Arguments:
        df is a n-by-d pandas data frame
        columns is an optional iterable of column names to convert; when it is
            omitted the six binary flag columns (blue, dual_sim, four_g,
            three_g, touch_screen, wifi) are converted
    Returns:
        a deep copy of df in which the selected columns have dtype 'category';
        the input frame is left untouched
    Raises:
        KeyError if one of the selected columns is not present in df
    '''

    if columns is None:
        columns = ('blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi')

    df_copy = df.copy(deep=True)
    for column in columns:
        # Bracket access rather than attribute access (df.blue): it works for any
        # column name, including ones that collide with DataFrame attributes.
        df_copy[column] = df[column].astype('category')
    return df_copy

In [76]:
# NOTE(review): this cell's execution count (76) is later than that of the cells further down
# that create and transform X_train, so on a fresh top-to-bottom run X_train is not defined here yet.
X_train.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'ram', 'talk_time',
       'three_g', 'touch_screen', 'wifi', 'fc_sqrt', 'px_whole_sqrt',
       'sc_whole_sqrt'],
      dtype='object')

In [6]:
# Pixel area and screen area for every row of the full `df`, computed once at notebook level.
px_whole = df['px_width'] * df['px_height']
sc_whole = df['sc_w'] * df['sc_h']

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Extract features from the original dataset
    Arguments:
        df is a n-by-d pandas data frame containing the columns
        px_width, px_height, sc_w and sc_h
    Returns:
        a new dataframe in which px_width/px_height are replaced by their
        product px_whole and sc_w/sc_h by their product sc_whole; the input
        frame is left untouched
    Raises:
        KeyError if one of the four source columns is missing
    '''

    df_copy = df.copy(deep=True)
    # Derive the products from the frame that was passed in rather than from
    # notebook-level Series: the result then depends only on the argument and
    # stays correct when the frame's index differs from the full dataset's.
    df_copy['px_whole'] = df['px_width'] * df['px_height']
    df_copy['sc_whole'] = df['sc_w'] * df['sc_h']
    return df_copy.drop(columns=['px_width', 'px_height', 'sc_h', 'sc_w'])

In [7]:
def handle_skewed_distributions(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Replace the skewed variables by their square roots
    Arguments:
        df is a n-by-d pandas data frame holding the columns fc, px_whole
        and sc_whole
    Returns:
        a copy of df where each of those columns is removed and a
        "<name>_sqrt" column with its square root is appended instead
    '''

    skewed = ['fc', 'px_whole', 'sc_whole']

    transformed = df.copy(deep=True)
    # Append the transformed columns first (fc, px_whole, sc_whole order) ...
    for name in skewed:
        transformed[name + '_sqrt'] = np.sqrt(df[name])
    # ... then remove the untransformed originals.
    transformed.drop(skewed, inplace=True, axis=1)
    return transformed

In [18]:
# Separate the predictors from the target, then hold out 20% of the rows as a test set.
# stratify=df_y splits each class in the same proportion; random_state=42 fixes the shuffle.
df_X = df.drop('price_range', axis=1)
df_y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y,test_size=0.2, stratify=df_y, random_state=42)
print("Train size: ", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

Train size:  (1600, 20) (1600,)
Test size:  (400, 20) (400,)


In [19]:
# Build the combined pixel/screen features on both splits.
# Not re-runnable on its own output: the four source columns are gone after the first run,
# so a second run raises a KeyError.
X_train = add_features(X_train)
X_test = add_features(X_test)

In [20]:
# Fit the min-max scaler on the training split only and reuse the fitted scaler for the
# test split, so the test rows do not influence the scaling.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# The scaler returns plain arrays; hand the original row index back to the new frames so that
# X_train / X_test keep the same row labels as y_train / y_test instead of being renumbered from 0.
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [21]:
# Cast the binary flag columns of both splits to the pandas 'category' dtype.
X_train = convert_types(X_train)
X_test = convert_types(X_test)

In [22]:
# Square-root transform fc, px_whole and sc_whole on both splits (applied after the min-max scaling above).
# Not re-runnable on its own output: the source columns are dropped by the first run.
X_train = handle_skewed_distributions(X_train)
X_test = handle_skewed_distributions(X_test)

In [27]:
# Candidate classifiers, all with library defaults except SVC(probability=True).
models = [
    linear_model.LogisticRegressionCV(),
    linear_model.SGDClassifier(),
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    neighbors.KNeighborsClassifier(),
    svm.SVC(probability=True),
    svm.LinearSVC(),
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
]

model_result_columns = ['Model Name', 'Parameters', 'Train Accuracy Mean', 'Test Accuracy Mean', 'Time']

# Ten random splits of the training data: 60% used for fitting and 30% for validation in each.
cv_split = ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)

# Collect one record per model and build the frame once at the end, instead of enlarging an
# empty DataFrame cell by cell with .loc (which also leaves every column with object dtype).
model_records = []
for model in models:
    # Micro-averaged F1 coincides with accuracy for single-label classification,
    # which is why the result columns are labelled "Accuracy".
    cv_result = cross_validate(model, X_train, y_train, cv=cv_split, n_jobs=-1,
                               return_train_score=True, scoring='f1_micro')
    model_records.append({
        'Model Name': model.__class__.__name__,
        'Parameters': str(model.get_params()),
        'Train Accuracy Mean': cv_result['train_score'].mean(),
        'Test Accuracy Mean': cv_result['test_score'].mean(),
        'Time': cv_result['fit_time'].mean(),
    })

model_result = pd.DataFrame(model_records, columns=model_result_columns)
model_result = model_result.sort_values(by=['Test Accuracy Mean'], ascending=False)
model_result

Unnamed: 0,Model Name,Parameters,Train Accuracy Mean,Test Accuracy Mean,Time
0,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1.0, 'l1_ratios': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'refit': True, 'scoring': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0}",0.991771,0.979792,0.504643
6,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}",0.988438,0.976875,0.011628
9,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': None}",1.0,0.972917,0.149313
12,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}",1.0,0.968125,0.321219
10,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",0.9975,0.965,0.047733
13,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",1.0,0.957083,0.257192
1,SGDClassifier,"{'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}",0.958958,0.953333,0.007721
3,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.955313,0.950833,0.005764
7,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}",1.0,0.950208,0.00923
11,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",1.0,0.948958,0.202897


In [51]:
def best_params_lgreg(X, y, n_iter=50, cv=5, n_jobs=4, random_state=None):
    '''
    Randomized hyper-parameter search for a logistic regression classifier
    Arguments:
        X is the feature matrix used for the search
        y is the target vector matching the rows of X
        n_iter is the number of parameter settings that are sampled
        cv is the number of cross-validation folds (or a CV splitter)
        n_jobs is the number of parallel jobs used by the search
        random_state seeds the sampling of parameter settings; the default
            None leaves the search unseeded, pass an int for repeatable runs
    Returns:
        None; the best parameters and the mean train / validation f1_micro
        scores (in percent) of that setting are printed
    '''
    c_grid = np.logspace(-4, 4, 20)

    # One dictionary per family of compatible solver/penalty combinations.
    param_grid_lgreg = [
        {
        'penalty': ['l1', 'l2'],
        'C': c_grid,
        'solver': ['liblinear', 'saga'],
        },

        {
        'penalty': ['l2'],
        'C': c_grid,
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        },

        {
        'penalty': ['elasticnet'],
        'C': c_grid,
        'solver': ['saga'],
        'l1_ratio': np.arange(0, 1, 0.01)
        }
    ]

    tune_model_lgreg = RandomizedSearchCV(linear_model.LogisticRegression(), param_distributions=param_grid_lgreg,
                                        scoring='f1_micro', cv=cv, n_jobs=n_jobs, return_train_score=True,
                                        n_iter=n_iter, random_state=random_state)
    tune_model_lgreg.fit(X, y)

    best = tune_model_lgreg.best_index_
    print("best parameters: ", tune_model_lgreg.best_params_)
    print("Mean train score: ", tune_model_lgreg.cv_results_['mean_train_score'][best] * 100)
    print("Mean test Score: ", tune_model_lgreg.cv_results_['mean_test_score'][best] * 100)

In [54]:
# Search logistic-regression hyper-parameters on the original (imbalanced) training split.
best_params_lgreg(X_train, y_train)

best parameters:  {'solver': 'saga', 'penalty': 'elasticnet', 'l1_ratio': 0.5, 'C': 3792.690190732246}
Mean train score:  98.90625
Mean test Score:  98.4375


In [32]:
# Baseline: elastic-net logistic regression with C and l1_ratio chosen by LogisticRegressionCV's
# internal cross-validation over the grids below, fitted on the imbalanced training split.
# NOTE(review): the C grid (500-650) does not bracket the C of about 3792.69 printed by the
# randomized search output above; that output has a later execution count than this cell - confirm
# which search run the grid was taken from.
model = linear_model.LogisticRegressionCV(Cs=np.linspace(500, 650, 20), solver='saga', penalty='elasticnet', l1_ratios=np.arange(0.3, 0.7, 0.05))
model.fit(X_train, y_train)
y_hat = model.predict(X_train)

# Confusion matrix and per-class metrics on the data the model was fitted on.
print("--- Train ---")
print(confusion_matrix(y_train, y_hat))
print(classification_report(y_train, y_hat))

# Same report on the held-out test split.
print("--- TEST ---")
y_hat_test = model.predict(X_test)
print(confusion_matrix(y_test, y_hat_test))
print(classification_report(y_test, y_hat_test))

--- Train ---
[[ 392    8]
 [   9 1191]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       400
           4       0.99      0.99      0.99      1200

    accuracy                           0.99      1600
   macro avg       0.99      0.99      0.99      1600
weighted avg       0.99      0.99      0.99      1600

--- TEST ---
[[ 99   1]
 [  6 294]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       100
           4       1.00      0.98      0.99       300

    accuracy                           0.98       400
   macro avg       0.97      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



### Handle imbalanced data

#### Undersampling

In [38]:
from imblearn.under_sampling import NearMiss

# Undersample the training split with NearMiss (default settings); the test split is left as is.
ns = NearMiss()
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)
# Class counts before and after resampling.
print('Number of classes before fit: {}'.format(Counter(y_train)))
print('Number of classes after fit: {}'.format(Counter(y_train_ns)))

Number of classes before fit: Counter({4: 1200, 0: 400})
Number of classes after fit: Counter({0: 400, 4: 400})


In [55]:
# Repeat the hyper-parameter search on the undersampled training data.
best_params_lgreg(X_train_ns, y_train_ns)

best parameters:  {'solver': 'saga', 'penalty': 'elasticnet', 'l1_ratio': 0.26, 'C': 206.913808111479}
Mean train score:  98.75
Mean test Score:  97.24999999999999


In [80]:
# Elastic-net logistic regression fitted on the undersampled training data; the C and l1_ratio
# grids bracket the values printed by the randomized search above (C about 206.9, l1_ratio 0.26).
model_under = linear_model.LogisticRegressionCV(Cs=np.linspace(100, 300, 20), solver='saga', penalty='elasticnet', l1_ratios=np.arange(0.2, 0.4, 0.05))
model_under.fit(X_train_ns, y_train_ns)
y_hat_ns = model_under.predict(X_train_ns)

# Report on the (undersampled) data the model was fitted on.
print("--- Train ---")
print(confusion_matrix(y_train_ns, y_hat_ns))
print(classification_report(y_train_ns, y_hat_ns))

# Report on the untouched, still imbalanced test split.
print("--- TEST ---")
y_hat_test_ns = model_under.predict(X_test)
print(confusion_matrix(y_test, y_hat_test_ns))
print(classification_report(y_test, y_hat_test_ns))

--- Train ---
[[397   3]
 [  7 393]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       400
           4       0.99      0.98      0.99       400

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800

--- TEST ---
[[ 98   2]
 [  8 292]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       100
           4       0.99      0.97      0.98       300

    accuracy                           0.97       400
   macro avg       0.96      0.98      0.97       400
weighted avg       0.98      0.97      0.98       400



#### Oversampling

In [40]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training split so that both classes end up with the same number of rows.
# random_state is fixed (same seed as the train/test split) so the resampled set is reproducible,
# and the sampler is named `ros` rather than `os` to avoid shadowing the standard-library module name.
ros = RandomOverSampler(random_state=42)
X_train_os, y_train_os = ros.fit_resample(X_train, y_train)
# Class counts before and after resampling.
print('Number of classes before fit: {}'.format(Counter(y_train)))
print('Number of classes after fit: {}'.format(Counter(y_train_os)))

Number of classes before fit: Counter({4: 1200, 0: 400})
Number of classes after fit: Counter({4: 1200, 0: 1200})


In [56]:
# Repeat the hyper-parameter search on the randomly oversampled training data.
best_params_lgreg(X_train_os, y_train_os)

best parameters:  {'solver': 'sag', 'penalty': 'l2', 'C': 3792.690190732246}
Mean train score:  99.125
Mean test Score:  98.83333333333333


In [81]:
# L2-penalised logistic regression fitted on the oversampled training data; the C grid brackets
# the value printed by the randomized search above (C about 3792.69 with the 'sag' solver).
model_over = linear_model.LogisticRegressionCV(Cs=np.linspace(3600, 3900, 20), solver='sag', penalty='l2')
model_over.fit(X_train_os, y_train_os)
y_hat_os = model_over.predict(X_train_os)

# Report on the (oversampled) data the model was fitted on.
print("--- Train ---")
print(confusion_matrix(y_train_os, y_hat_os))
print(classification_report(y_train_os, y_hat_os))

# Report on the untouched, still imbalanced test split.
print("--- TEST ---")
y_hat_test_os = model_over.predict(X_test)
print(confusion_matrix(y_test, y_hat_test_os))
print(classification_report(y_test, y_hat_test_os))

--- Train ---
[[1200    0]
 [  18 1182]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1200
           4       1.00      0.98      0.99      1200

    accuracy                           0.99      2400
   macro avg       0.99      0.99      0.99      2400
weighted avg       0.99      0.99      0.99      2400

--- TEST ---
[[100   0]
 [  9 291]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       100
           4       1.00      0.97      0.98       300

    accuracy                           0.98       400
   macro avg       0.96      0.98      0.97       400
weighted avg       0.98      0.98      0.98       400



#### SMOTENC

In [77]:
from imblearn.over_sampling import SMOTENC

# SMOTENC needs to know which columns are categorical. Look the positions up by column name
# instead of hard-coding them, so the list stays correct if the column order of X_train changes.
categorical_columns = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
categorical_positions = [X_train.columns.get_loc(name) for name in categorical_columns]

# random_state is fixed (same seed as the train/test split) so the synthetic samples are reproducible.
sm = SMOTENC(categorical_features=categorical_positions, random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
# Class counts before and after resampling.
print('Number of classes before fit: {}'.format(Counter(y_train)))
print('Number of classes after fit: {}'.format(Counter(y_train_sm)))

Number of classes before fit: Counter({4: 1200, 0: 400})
Number of classes after fit: Counter({4: 1200, 0: 1200})


In [78]:
# Repeat the hyper-parameter search on the SMOTENC-resampled training data.
best_params_lgreg(X_train_sm, y_train_sm)

best parameters:  {'solver': 'saga', 'penalty': 'elasticnet', 'l1_ratio': 0.92, 'C': 1.623776739188721}
Mean train score:  98.89583333333334
Mean test Score:  98.70833333333333


In [85]:
# Elastic-net logistic regression fitted on the SMOTENC-resampled training data. Cs=20 lets
# LogisticRegressionCV build its own 20-value C grid; the l1_ratio grid brackets the value
# printed by the randomized search above (l1_ratio 0.92).
model_sm = linear_model.LogisticRegressionCV(Cs=20, solver='saga', penalty='elasticnet', l1_ratios=np.arange(0.8, 1, 0.05))
model_sm.fit(X_train_sm, y_train_sm)
y_hat_sm = model_sm.predict(X_train_sm)

# Report on the (resampled) data the model was fitted on.
print("--- Train ---")
print(confusion_matrix(y_train_sm, y_hat_sm))
print(classification_report(y_train_sm, y_hat_sm))

# Report on the untouched, still imbalanced test split.
print("--- TEST ---")
y_hat_test_sm = model_sm.predict(X_test)
print(confusion_matrix(y_test, y_hat_test_sm))
print(classification_report(y_test, y_hat_test_sm))

--- Train ---
[[1199    1]
 [  20 1180]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1200
           4       1.00      0.98      0.99      1200

    accuracy                           0.99      2400
   macro avg       0.99      0.99      0.99      2400
weighted avg       0.99      0.99      0.99      2400

--- TEST ---
[[100   0]
 [  8 292]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       100
           4       1.00      0.97      0.99       300

    accuracy                           0.98       400
   macro avg       0.96      0.99      0.97       400
weighted avg       0.98      0.98      0.98       400



#### Handle imbalanced data using class weights

In [86]:
# Same estimator settings as the unweighted baseline cell, plus class_weight='balanced',
# fitted on the original (not resampled) training split.
# NOTE: this reuses the names model / y_hat / y_hat_test and therefore replaces the
# objects created by the baseline cell.
model = linear_model.LogisticRegressionCV(Cs=np.linspace(500, 650, 20), solver='saga',
                                          penalty='elasticnet', l1_ratios=np.arange(0.3, 0.7, 0.05),
                                          class_weight='balanced')
model.fit(X_train, y_train)
y_hat = model.predict(X_train)

# Report on the data the model was fitted on.
print("--- Train ---")
print(confusion_matrix(y_train, y_hat))
print(classification_report(y_train, y_hat))

# Report on the held-out test split.
print("--- TEST ---")
y_hat_test = model.predict(X_test)
print(confusion_matrix(y_test, y_hat_test))
print(classification_report(y_test, y_hat_test))

--- Train ---
[[ 400    0]
 [  18 1182]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       400
           4       1.00      0.98      0.99      1200

    accuracy                           0.99      1600
   macro avg       0.98      0.99      0.99      1600
weighted avg       0.99      0.99      0.99      1600

--- TEST ---
[[100   0]
 [  9 291]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       100
           4       1.00      0.97      0.98       300

    accuracy                           0.98       400
   macro avg       0.96      0.98      0.97       400
weighted avg       0.98      0.98      0.98       400

