In [1]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
sns.set_style("dark")
sns.set(rc={'figure.figsize':(12,8)})
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler 
from sklearn import linear_model
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
def convert_types(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Converts categorical variables to their appropriate type
    Arguments:
        df is a n-by-d pandas data frame
    Returns:
        the final dataframe with corrected categorical data types
    '''

    df_copy = df.copy(deep=True)
    df_copy['blue'] = df.blue.astype('category')
    df_copy['dual_sim'] = df.dual_sim.astype('category')
    df_copy['four_g'] = df.four_g.astype('category')
    df_copy['three_g'] = df.three_g.astype('category')
    df_copy['touch_screen'] = df.touch_screen.astype('category')
    df_copy['wifi'] = df.wifi.astype('category')
    return df_copy

In [4]:
px_whole = df['px_width'] * df['px_height']
sc_whole = df['sc_w'] * df['sc_h']

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Extract features form the original dataset
    Arguments:
        df is a n-by-d pandas data frame
    Returns:
        the final dataframe with extracted features
    '''

    df_copy = df.copy(deep=True)
    df_copy['px_whole'] = px_whole
    df_copy['sc_whole'] = sc_whole
    df_copy.drop(['px_width', 'px_height', 'sc_h', 'sc_w'], axis=1, inplace=True)
    return df_copy

In [7]:
df_X = df.drop('price_range', axis=1)
df_y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y,test_size=0.2, stratify=df_y, random_state=42)
print("Train size: ", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)

Train size:  (1600, 20) (1600,)
Test size:  (400, 20) (400,)


In [8]:
X_train = add_features(X_train)
X_test = add_features(X_test)

Making different dataset using different scailing techniques

In [11]:
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
max_abs_scaler = MaxAbsScaler()
robust_scaler = RobustScaler()

In [12]:
def scale(scaler, train, test):
    '''
    Creates train and test set using given scaler
    Arguments:
        scaler is an object of a specific scailing technique
        train is pandas dataframe which specify train set
        test is pandas dataframe which specify test set
    Returns:
        scaled train and test sets
    '''

    res_train = scaler.fit_transform(train)
    res_test = scaler.transform(test)
    return res_train, res_test

In [13]:
X_train_min_max, X_test_min_max = scale(min_max_scaler, X_train, X_test)
X_train_standard, X_test_standard = scale(standard_scaler, X_train, X_test)
X_train_max_abs, X_test_max_abs = scale(max_abs_scaler, X_train, X_test)
X_train_robust, X_test_robust = scale(robust_scaler, X_train, X_test)

In [26]:
def calc_perfomance(X_train, X_test, y_train, y_test, tech):
    '''
    Calculates the performacne of the final model on the training and test set using given technique.
    Arguments:
        X_train is pandas dataframe which specifies the input train set
        X_test is pandas dataframe which specifies the input test set
        y_train is pandas series which specifies the target train set
        y_test is pandas series which specifies the target test set
        tech is a string which specifies the technique which was used
    '''
    
    model = linear_model.LogisticRegressionCV(Cs=np.linspace(3600, 3900, 20), solver='lbfgs', penalty='l2')
    model.fit(X_train, y_train)
    y_hat = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    print("technique: ", tech, end='\n\n')
    print("--- Train ---")
    print(confusion_matrix(y_train, y_hat))
    print(classification_report(y_train, y_hat))
    print("--- TEST ---")
    print(confusion_matrix(y_test, y_hat_test))
    print(classification_report(y_test, y_hat_test))

In [27]:
calc_perfomance(X_train, X_test, y_train, y_test, "None")

technique:  None

--- Train ---
[[287  89  24   0]
 [ 87 160 101  52]
 [  4 105  82 209]
 [  0  17  18 365]]
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       400
           1       0.43      0.40      0.42       400
           2       0.36      0.20      0.26       400
           3       0.58      0.91      0.71       400

    accuracy                           0.56      1600
   macro avg       0.53      0.56      0.53      1600
weighted avg       0.53      0.56      0.53      1600

--- TEST ---
[[77 14  9  0]
 [16 43 25 16]
 [ 1 28 23 48]
 [ 0  4  6 90]]
              precision    recall  f1-score   support

           0       0.82      0.77      0.79       100
           1       0.48      0.43      0.46       100
           2       0.37      0.23      0.28       100
           3       0.58      0.90      0.71       100

    accuracy                           0.58       400
   macro avg       0.56      0.58      0.56       400
we

In [28]:
calc_perfomance(X_train_max_abs, X_test_max_abs, y_train, y_test, "max_abs")

technique:  max_abs

--- Train ---
[[394   6   0   0]
 [ 12 374  14   0]
 [  0  15 368  17]
 [  0   0  15 385]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       400
           1       0.95      0.94      0.94       400
           2       0.93      0.92      0.92       400
           3       0.96      0.96      0.96       400

    accuracy                           0.95      1600
   macro avg       0.95      0.95      0.95      1600
weighted avg       0.95      0.95      0.95      1600

--- TEST ---
[[98  2  0  0]
 [ 3 91  6  0]
 [ 0  4 91  5]
 [ 0  0  5 95]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       100
           1       0.94      0.91      0.92       100
           2       0.89      0.91      0.90       100
           3       0.95      0.95      0.95       100

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400

In [30]:
calc_perfomance(X_train_min_max, X_test_min_max, y_train, y_test, "min_max")

technique:  min_max

--- Train ---
[[394   6   0   0]
 [ 12 374  14   0]
 [  0  15 368  17]
 [  0   0  15 385]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       400
           1       0.95      0.94      0.94       400
           2       0.93      0.92      0.92       400
           3       0.96      0.96      0.96       400

    accuracy                           0.95      1600
   macro avg       0.95      0.95      0.95      1600
weighted avg       0.95      0.95      0.95      1600

--- TEST ---
[[98  2  0  0]
 [ 3 91  6  0]
 [ 0  4 91  5]
 [ 0  0  5 95]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       100
           1       0.94      0.91      0.92       100
           2       0.89      0.91      0.90       100
           3       0.95      0.95      0.95       100

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400

In [31]:
calc_perfomance(X_train_robust, X_test_robust, y_train, y_test, "robust")

technique:  robust

--- Train ---
[[393   7   0   0]
 [ 12 374  14   0]
 [  0  15 368  17]
 [  0   0  15 385]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       400
           1       0.94      0.94      0.94       400
           2       0.93      0.92      0.92       400
           3       0.96      0.96      0.96       400

    accuracy                           0.95      1600
   macro avg       0.95      0.95      0.95      1600
weighted avg       0.95      0.95      0.95      1600

--- TEST ---
[[98  2  0  0]
 [ 3 91  6  0]
 [ 0  5 90  5]
 [ 0  0  5 95]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       100
           1       0.93      0.91      0.92       100
           2       0.89      0.90      0.90       100
           3       0.95      0.95      0.95       100

    accuracy                           0.94       400
   macro avg       0.93      0.94      0.93       400


In [32]:
calc_perfomance(X_train_standard, X_test_standard, y_train, y_test, "standard")

technique:  standard

--- Train ---
[[393   7   0   0]
 [ 12 374  14   0]
 [  0  15 368  17]
 [  0   0  15 385]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       400
           1       0.94      0.94      0.94       400
           2       0.93      0.92      0.92       400
           3       0.96      0.96      0.96       400

    accuracy                           0.95      1600
   macro avg       0.95      0.95      0.95      1600
weighted avg       0.95      0.95      0.95      1600

--- TEST ---
[[98  2  0  0]
 [ 3 91  6  0]
 [ 0  5 90  5]
 [ 0  0  5 95]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       100
           1       0.93      0.91      0.92       100
           2       0.89      0.90      0.90       100
           3       0.95      0.95      0.95       100

    accuracy                           0.94       400
   macro avg       0.93      0.94      0.93       40

### Influence analysis of pca on model performance

In [37]:
def calc_perfomance_pca(X_train, X_test, y_train, y_test, n_components):
    '''
    Calculates the performacne of the final model on the training and test set using given pov of PCA.
    Arguments:
        X_train is pandas dataframe which specifies the input train set
        X_test is pandas dataframe which specifies the input test set
        y_train is pandas series which specifies the target train set
        y_test is pandas series which specifies the target test set
        n_components is a float which specifies Number of components to keep in PCA
    '''
    
    pca = PCA(n_components=n_components)
    train = pca.fit_transform(X_train)
    test = pca.transform(X_test)

    model = linear_model.LogisticRegressionCV(solver='lbfgs', penalty='l2')
    model.fit(train, y_train)
    y_hat = model.predict(train)
    y_hat_test = model.predict(test)
    print("n_components: ", n_components, end='\n\n')
    print("--- Train ---")
    print(confusion_matrix(y_train, y_hat))
    print(classification_report(y_train, y_hat))
    print("--- TEST ---")
    print(confusion_matrix(y_test, y_hat_test))
    print(classification_report(y_test, y_hat_test))

In [38]:
n_components_list = [0.99, 0.95, 0.9, 0.8, 0.75, 0.2]
for n_component in n_components_list:
    calc_perfomance(X_train, X_test, y_train, y_test, n_component)

technique:  0.99

--- Train ---
[[287  89  24   0]
 [ 87 160 101  52]
 [  4 105  82 209]
 [  0  17  18 365]]
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       400
           1       0.43      0.40      0.42       400
           2       0.36      0.20      0.26       400
           3       0.58      0.91      0.71       400

    accuracy                           0.56      1600
   macro avg       0.53      0.56      0.53      1600
weighted avg       0.53      0.56      0.53      1600

--- TEST ---
[[77 14  9  0]
 [16 43 25 16]
 [ 1 28 23 48]
 [ 0  4  6 90]]
              precision    recall  f1-score   support

           0       0.82      0.77      0.79       100
           1       0.48      0.43      0.46       100
           2       0.37      0.23      0.28       100
           3       0.58      0.90      0.71       100

    accuracy                           0.58       400
   macro avg       0.56      0.58      0.56       400
we