In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

Task: To predict whether the user likes the mobile phone or not. <br>
Assumption: If the average rating of mobile >= threshold, then the user likes it, otherwise not.

<b>Missing values:</b><br>
'Also Known As'(459),'Applications'(421),'Audio Features'(437),'Bezel-less display'(266),'Browser'(449),'Build Material'(338),'Co-Processor'(451),'Display Colour'(457),'Mobile High-Definition Link(MHL)'(472),'Music'(447)
'Email','Fingerprint Sensor Position'(174),'Games'(446),'HDMI'(454),'Heart Rate Monitor'(467),'IRIS Scanner'(467),
'Optical Image Stabilisation'(219),'Other Facilities'(444),'Phone Book'(444),'Physical Aperture'(87),'Quick Charging'(122),'Ring Tone'(444),'Ruggedness'(430),SAR Value(315),'SIM 3'(472),'SMS'(470)', 'Screen Protection'(229),'Screen to Body Ratio (claimed by the brand)'(428),'Sensor'(242),'Software Based Aperture'(473),
'Special Features'(459),'Standby time'(334),'Stylus'(473),'TalkTime'(259), 'USB Type-C'(374),'Video Player'(456),
'Video Recording Features'(458),'Waterproof'(398),'Wireless Charging','USB OTG Support'(159), 'Video ,'Recording'(113),'Java'(471),'Browser'(448)

<b>Very low variance:</b><br>
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'


<b>Multivalued:</b><br>
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

<b>Not important:</b><br>
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

<b>Doubtful:</b><br>
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing),
'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'
    
<b>To check:</b><br>
'Display Type','Expandable Memory','FM Radio'

<b>High Correlation with other features</b><br>
'SIM Slot(s)' high correlation with SIM1
'Weight' has high high correlation with capacity , screen-to-body ratio
'Height' - screen size is also there
    
<b>Given a mobile, we can't directly get these features</b><br>
'Rating Count', 'Review Count'

<b>Keeping:</b><br>
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',


In [2]:
# read data from file
# train = pd.read_csv("../input/train.csv") 
# test = pd.read_csv("../input/test.csv")
train = pd.read_csv("train.csv") 
test = pd.read_csv("test.csv")


# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98


In [3]:
def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Browser'
                         'Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features

In [4]:
train = data_clean(train)
test = data_clean(test)

removing all those data points in which more than 15 features are missing 

In [5]:
train = train[(train.isnull().sum(axis=1) <= 15)]
# You shouldn't remove data points from test set
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [6]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 47
Number of data points in test: 119
Number of features in test: 46


# Filling Missing values

In [7]:
def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass


In [8]:
def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

In [9]:
train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 51
Number of data points in test: 119
Number of features in test: 50


Not very important feature

In [10]:
def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data

In [11]:
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17


In [12]:
# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

(460, 17)
Index(['PhoneId', 'Capacity', 'SIM Slot(s)', 'os_name', 'Height',
       'Pixel Density', 'SIM 2', 'Brand', 'Resolution', 'RAM',
       'Screen to Body Ratio (calculated)', 'Weight', 'Num_cores',
       'Processor_frequency', 'Internal Memory', 'Sim1', 'Screen Size'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Capacity', 'Height', 'Pixel Density', 'Resolution', 'RAM',
       'Screen to Body Ratio (calculated)', 'Weight', 'Processor_frequency',
       'Internal Memory', 'Screen Size', 'SIM Slot(s)_Dual SIM, GSM+CDMA',
       'SIM Slot(s)_Dual SIM, GSM+GSM',
       'SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE',
       'SIM Slot(s)_Single SIM, GSM', 'os_name_Android', 'os_name_Blackberry',
       'os_name_KAI', 'os_name_Nokia', 'os_name_Other', 'os_name_Tizen',
       'os_name_iOS', 'SIM 2_2G', 'SIM 2_3G', 'SIM 2_4G', 'SIM 2_Other',
       'Brand_10.or', 'Brand_Apple', 'Brand_Asus', 'Brand_Billion',
       'Brand_Blackberry', 'Brand_Comio', 'Brand_Coolpad', 'Brand_Do',
   

In [13]:
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [14]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 341
Number of features in train: 88
Number of data points in test: 119
Number of features in test: 87


In [15]:
train_new.head()

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G,Rating
0,0,4000,157.9,403,20,4,80.68,182,1.8,64,...,0,0,1,0,0,0,0,0,1,4.5
1,1,4230,156.2,271,8,3,80.85,168,1.8,32,...,0,0,1,0,0,0,0,0,1,4.5
2,2,3500,157.0,409,25,3,83.68,168,2.1,32,...,0,0,1,0,0,0,0,0,1,4.4
3,4,3300,159.8,411,24,4,74.78,169,2.2,64,...,0,0,1,0,0,0,0,0,1,4.3
4,5,3750,160.4,396,16,4,84.23,175,2.2,64,...,0,0,1,0,0,0,0,0,1,4.4


In [16]:
test_new.head()

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G
0,3,4230,156.2,271,5,2,80.85,168,1.8,16,...,0,0,0,1,0,0,0,0,0,1
1,11,5000,156.0,402,12,4,81.6,205,1.8,64,...,0,0,0,1,0,0,0,0,0,1
2,13,3500,156.7,409,25,6,83.84,169,2.0,64,...,0,0,0,1,0,0,0,0,0,1
3,16,3500,156.7,409,16,4,83.84,169,2.0,64,...,0,0,0,1,0,0,0,0,0,1
4,19,4000,158.6,403,20,4,77.43,181,1.8,64,...,0,0,0,1,0,0,0,0,0,1


## Dummy Solution

In [17]:
# submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class':[1]*test_new.shape[0]})
# submission = submission[['PhoneId', 'Class']]
# submission.head()

# submission.to_csv("submission.csv", index=False)

In [18]:
train_new.head()

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G,Rating
0,0,4000,157.9,403,20,4,80.68,182,1.8,64,...,0,0,1,0,0,0,0,0,1,4.5
1,1,4230,156.2,271,8,3,80.85,168,1.8,32,...,0,0,1,0,0,0,0,0,1,4.5
2,2,3500,157.0,409,25,3,83.68,168,2.1,32,...,0,0,1,0,0,0,0,0,1,4.4
3,4,3300,159.8,411,24,4,74.78,169,2.2,64,...,0,0,1,0,0,0,0,0,1,4.3
4,5,3750,160.4,396,16,4,84.23,175,2.2,64,...,0,0,1,0,0,0,0,0,1,4.4


In [19]:
# Last column contains the ratings i.e. the label
print(train_new.shape, test_new.shape)

(341, 88) (119, 87)


In [20]:
# Labels have not been binarized
train_new['Rating'][25:40]

25    4.3
26    4.3
27    4.6
28    4.4
29    4.5
30    3.9
31    4.2
32    4.3
33    4.2
34    4.3
35    4.3
36    4.2
37    4.4
38    4.1
39    3.8
Name: Rating, dtype: float64

In [21]:
# Binarize the labels
train_new['Rating'] = train_new['Rating'].apply(lambda x: 1 if x>=4 else 0)

In [22]:
train_new['Rating'][25:40]

25    1
26    1
27    1
28    1
29    1
30    0
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    0
Name: Rating, dtype: int64

In [23]:
print(train_new.shape)
train_new_modi = train_new[train_new.RAM < 16]
print(train_new_modi.shape)

(341, 88)
(333, 88)


In [24]:
train_new.Weight.dtype

dtype('int32')

In [25]:
# Split train_new into X and y.
y = train_new_modi['Rating']
X = train_new_modi.drop('Rating', axis = 1)  # Not inplace, train_new is not affected

In [26]:
print(X.shape, y.shape)

(333, 87) (333,)


In [27]:
# Verifying that y contains the labels. Also data imbalance is observed hence stratified split is necessary in next step
y.value_counts()

1    235
0     98
Name: Rating, dtype: int64

In [28]:
# Split the training data into train and val set

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42, stratify = y, test_size = 0.1)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
print(y_train.mean(), y_val.mean())

(299, 87) (34, 87) (299,) (34,)
0.705685618729097 0.7058823529411765


In [29]:
X_train = X_train.astype('int64')
X_val = X_val.astype('int64')
print(X_train.dtypes,X_val.dtypes)


PhoneId                                      int64
Capacity                                     int64
Height                                       int64
Pixel Density                                int64
Resolution                                   int64
RAM                                          int64
Screen to Body Ratio (calculated)            int64
Weight                                       int64
Processor_frequency                          int64
Internal Memory                              int64
Screen Size                                  int64
SIM Slot(s)_Dual SIM, GSM+CDMA               int64
SIM Slot(s)_Dual SIM, GSM+GSM                int64
SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE    int64
SIM Slot(s)_Single SIM, GSM                  int64
os_name_Android                              int64
os_name_Blackberry                           int64
os_name_KAI                                  int64
os_name_Nokia                                int64
os_name_Other                  

## Normalizing and Standardizing the data

In [30]:
# Normalization/Standardization should not be done for one hot encoded columns
not_onehot = list()
onehot = list()
for i in X_train.columns:
    if X_train[i].dtype != 'uint8':
        not_onehot.append(i)
    else:
        onehot.append(i)

In [31]:
# Normalizing
# X_train[not_onehot]=(X_train[not_onehot]-X_train[not_onehot].min())/(X_train[not_onehot].max()-X_train[not_onehot].min())
# X_val[not_onehot]=(X_val[not_onehot]-X_val[not_onehot].min())/(X_val[not_onehot].max()-X_val[not_onehot].min())


# Standardizing
X_train[not_onehot]=(X_train[not_onehot]-X_train[not_onehot].median())
X_val[not_onehot]=(X_val[not_onehot]-X_val[not_onehot].median())

# Z-score
# X_train[not_onehot]=(X_train[not_onehot]-X_train[not_onehot].mean())/X_train[not_onehot].std()
# X_val[not_onehot]=(X_val[not_onehot]-X_val[not_onehot].mean())/X_val[not_onehot].std()

## Binarization of data

In [32]:
# # Custom binarization for standardized data. 
X_train = X_train.applymap(lambda x: 1 if x>0 else 0)
X_val = X_val.applymap(lambda x: 1 if x>0 else 0)


## Custom binarization for normalized data. 
# X_train = X_train.applymap(lambda x: 1 if x>0.5 else 0)
# X_val = X_val.applymap(lambda x: 1 if x>0.5 else 0)

## Using pd.cut
X_train[not_onehot] = X_train[not_onehot].apply(pd.cut, bins=2, labels=[0, 1])
X_val[not_onehot] = X_val[not_onehot].apply(pd.cut, bins=2, labels=[0, 1])


In [33]:
X_train.dtypes

PhoneId                                      category
Capacity                                     category
Height                                       category
Pixel Density                                category
Resolution                                   category
RAM                                          category
Screen to Body Ratio (calculated)            category
Weight                                       category
Processor_frequency                          category
Internal Memory                              category
Screen Size                                  category
SIM Slot(s)_Dual SIM, GSM+CDMA               category
SIM Slot(s)_Dual SIM, GSM+GSM                category
SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE    category
SIM Slot(s)_Single SIM, GSM                  category
os_name_Android                              category
os_name_Blackberry                           category
os_name_KAI                                  category
os_name_Nokia               

In [34]:
X_train.head(10)

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G
163,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
262,1,0,1,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
19,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
318,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
97,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
186,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
194,1,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
332,1,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40,0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
train_new.iloc[[163, 262, 19, 241, 318, 97, 186, 194, 332, 40], :]

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G,Rating
163,221,4000,148.0,295,5,3,73.05,161,1.5,32,...,0,0,0,0,1,0,0,0,1,0
262,364,3080,153.0,267,16,4,71.36,153,1.4,64,...,0,0,1,0,0,0,0,0,1,0
19,28,4000,158.4,269,8,3,80.64,160,1.8,32,...,0,0,1,0,0,0,0,0,1,1
241,330,2600,142.4,294,5,2,67.91,138,1.5,16,...,0,0,0,0,1,0,0,0,1,1
318,442,2000,132.7,218,5,1,63.45,138,1.1,8,...,0,0,0,0,1,0,0,0,1,0
97,129,4000,148.0,295,5,2,73.05,161,1.5,16,...,0,0,0,0,1,0,0,0,1,1
186,251,3020,145.2,294,13,2,65.95,156,1.4,16,...,0,0,0,0,1,0,0,0,1,0
194,267,3800,158.1,402,8,8,78.45,185,2.35,128,...,0,0,1,0,0,0,0,0,1,0
332,458,3000,154.3,401,13,3,70.39,160,1.4,32,...,0,0,1,0,0,0,0,0,1,1
40,57,3500,157.0,409,25,4,83.68,168,2.1,64,...,0,0,1,0,0,0,0,0,1,1


## MP Neuron Class

In [36]:
from sklearn.metrics import accuracy_score
class MPNeuron:
  
    def __init__(self):
        self.b = None
    
    def model(self, x):
        return int(sum(x) >= self.b)
  
    def predict(self, X):
        Y = []
        for x in X:
            result = self.model(x)
            Y.append(result)
        return np.array(Y)
  
    def fit(self, X, Y):
        accuracy = {}

        for b in range(X.shape[1] + 1):
            self.b = b
            Y_pred = self.predict(X)
            accuracy[b] = accuracy_score(Y_pred, Y)
#             print(b, accuracy[b])
        best_b = max(accuracy, key = accuracy.get)
        self.b = best_b
        
        return best_b, accuracy[best_b]

## Remove uncorrelated columns

In [37]:
absCorrWithTar = list()
target = y_train
for col in X_train.columns:
    feature  = X_train[col]
    absCorrWithTar.append((col, abs(target.corr(feature))))

absCorrWithTar.sort(key = lambda x: x[1])

NUM_COLUMNS_DROP = 15
remove_cols = [i[0] for i in absCorrWithTar[:NUM_COLUMNS_DROP]]
print('Removed columns:\n', remove_cols)

print(X_train.shape, X_val.shape)


columns_to_retain = list(set(X_train.columns)-set(remove_cols))


X_train = X_train[columns_to_retain]
X_val = X_val[columns_to_retain]


print(X_train.shape, X_val.shape)

Removed columns:
 ['Weight', 'Brand_Infinix', 'os_name_Other', 'Brand_Itel', 'Brand_Gionee', 'SIM Slot(s)_Dual SIM, GSM+CDMA', 'Brand_Comio', 'os_name_Nokia', 'Brand_Asus', 'Pixel Density', 'SIM 2_3G', 'Brand_Google', 'SIM Slot(s)_Single SIM, GSM', 'SIM 2_Other', 'Brand_Huawei']
(299, 87) (34, 87)
(299, 72) (34, 72)


In [38]:
X_train = X_train.astype('int64')
X_val = X_val.astype('int64')

### Grid search on percentage

In [39]:
percentage = np.linspace(0, 0.1, num = 100)

val_accs = list()
for perc in percentage:
    
    # Reassign the non edited dataframes after each loop
    X_train_temp = X_train.copy()
    X_val_temp = X_val.copy()


    # Find which columns to drop based on number of positive cases. If very few positive cases(percentage) for the feature. Drop it.
    one_hot_drop = list()
    for i in X_train.columns:
        if X_train[i].shape[0]*perc > X_train[i].sum(axis=0):
            one_hot_drop.append(i)

    # Drop the columns
    X_train_temp.drop(one_hot_drop, axis = 1, inplace=True)
    X_val_temp.drop(one_hot_drop, axis = 1, inplace=True)
    
    X_train_temp.drop('PhoneId', axis = 1, inplace=True)
    X_val_temp.drop('PhoneId', axis = 1, inplace=True)

    # Instantiate new model
    mp_neuron = MPNeuron()
    b, train_acc = mp_neuron.fit(X_train_temp.values, y_train.values)
    
    print('Percentage         :', perc)
    print('No.of features     :', X_train_temp.shape[1])
    print('Optimal value for b:', b)
    print('Training accuracy  :', train_acc)
    
    Y_val_pred = mp_neuron.predict(X_val_temp.values)
    val_acc = accuracy_score(Y_val_pred, y_val)

    print('Validation accuracy:',val_acc)
    print('-'*50)
    
    val_accs.append((train_acc, val_acc, perc, b))

    
max_val = 0
Final_config = None
for i in val_accs:
    if i[1]>max_val:
        Final_config = i
        max_val = i[1]

print(Final_config)

Percentage         : 0.0
No.of features     : 71
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.00101010101010101
No.of features     : 59
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.00202020202020202
No.of features     : 59
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.0030303030303030303
No.of features     : 59
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.00404040404040404
No.of features     : 48
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accu

Percentage         : 0.04242424242424243
No.of features     : 20
Optimal value for b: 2
Training accuracy  : 0.7090301003344481
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.043434343434343436
No.of features     : 20
Optimal value for b: 2
Training accuracy  : 0.7090301003344481
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.044444444444444446
No.of features     : 19
Optimal value for b: 2
Training accuracy  : 0.7090301003344481
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.045454545454545456
No.of features     : 19
Optimal value for b: 2
Training accuracy  : 0.7090301003344481
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.046464646464646465
No.of features     : 19
Optimal value for b: 2
Training accuracy  : 0.709030100

Percentage         : 0.08686868686868687
No.of features     : 12
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.08787878787878788
No.of features     : 12
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.08888888888888889
No.of features     : 12
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.0898989898989899
No.of features     : 12
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Validation accuracy: 0.7058823529411765
--------------------------------------------------
Percentage         : 0.09090909090909091
No.of features     : 12
Optimal value for b: 0
Training accuracy  : 0.705685618729097
Va

In [40]:
X_train.head(10)

Unnamed: 0,os_name_Tizen,Num_cores_Dual,Brand_Lyf,Brand_Spice,os_name_KAI,Sim1_2G,Brand_Nubia,Brand_10.or,Brand_OnePlus,Capacity,...,Sim1_4G,Brand_Razer,Brand_Xiaomi Poco,Resolution,Brand_iVooMi,Brand_Blackberry,Brand_Billion,Brand_Apple,Brand_Do,Brand_HTC
163,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
194,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
40,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [41]:
# train_new.iloc[['39', '41', '26', '254', '295'], :]
# train_new.iloc[[39, 41, 26, 254, 295, 117, 1, 135, 58, 122], :]
# train_new.iloc[[290, 15, 64, 252, 20, 152, 328, 334, 178, 258], :]
train_new.iloc[[163, 262, 19, 241, 318, 97, 186, 194, 332, 40]]

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G,Rating
163,221,4000,148.0,295,5,3,73.05,161,1.5,32,...,0,0,0,0,1,0,0,0,1,0
262,364,3080,153.0,267,16,4,71.36,153,1.4,64,...,0,0,1,0,0,0,0,0,1,0
19,28,4000,158.4,269,8,3,80.64,160,1.8,32,...,0,0,1,0,0,0,0,0,1,1
241,330,2600,142.4,294,5,2,67.91,138,1.5,16,...,0,0,0,0,1,0,0,0,1,1
318,442,2000,132.7,218,5,1,63.45,138,1.1,8,...,0,0,0,0,1,0,0,0,1,0
97,129,4000,148.0,295,5,2,73.05,161,1.5,16,...,0,0,0,0,1,0,0,0,1,1
186,251,3020,145.2,294,13,2,65.95,156,1.4,16,...,0,0,0,0,1,0,0,0,1,0
194,267,3800,158.1,402,8,8,78.45,185,2.35,128,...,0,0,1,0,0,0,0,0,1,0
332,458,3000,154.3,401,13,3,70.39,160,1.4,32,...,0,0,1,0,0,0,0,0,1,1
40,57,3500,157.0,409,25,4,83.68,168,2.1,64,...,0,0,1,0,0,0,0,0,1,1


## Final Model

In [42]:
# (0.7324414715719063, 0.7647058823529411, 0.0101010101010101, 6)  RS = 42 split = 0.1; Standardized median; all 1s
# (0.7123745819397993, 0.7647058823529411, 0.0, 7) RS = 42 split = 0.1; pd.cuts; all 1s
# (0.7123745819397993, 0.7647058823529411, 0.0, 7) RS = 42 split= 0.1; min-max; all 1s
# (0.7123745819397993, 0.7647058823529411, 0.0, 7) RS = 42 split = 0.1; norm-cut; all 1s
# (0.7293233082706767, 0.746268656716418, 0.012121212121212121, 6) RS = 42 split = 0.2; std median
# (0.7357859531772575, 0.7941176470588235, 0.0101010101010101, 6)  dropped weight column, RS = 42, split = 0.1, std median



perc = 0.0101010101010101

mp_neuron = MPNeuron()

X_train_temp = X_train.copy()
X_val_temp = X_val.copy()


# Find which columns to drop
one_hot_drop = list()
for i in onehot:
    if X_train[i].shape[0]*perc > X_train[i].sum(axis=0):
        one_hot_drop.append(i)

# Drop the columns
X_train_temp.drop(one_hot_drop, axis = 1, inplace=True)
X_val_temp.drop(one_hot_drop, axis = 1, inplace=True)
X_train_temp.drop('PhoneId', axis = 1, inplace=True)
X_val_temp.drop('PhoneId', axis = 1, inplace=True)


mp_neuron = MPNeuron()
mp_neuron.fit(X_train_temp.values, y_train.values)

Y_val_pred = mp_neuron.predict(X_val_temp.values)
val_acc = accuracy_score(Y_val_pred, y_val)
print('Max Validation accuracy:', val_acc)

Max Validation accuracy: 0.7058823529411765


## Submission

In [43]:
# temp = test_new['PhoneId']

# # Custom binarization for standardized data. 
# test_new[not_onehot]=(test_new[not_onehot]-test_new[not_onehot].median())
# test_new = test_new.applymap(lambda x: 1 if x>0 else 0)
# print(test_new.shape)
# test_new.drop(one_hot_drop, axis = 1, inplace=True)
# test_new.drop('PhoneId',axis = 1, inplace=True)
# print(test_new.shape)

## Custom binarization for normalized data. 
# test_new[not_onehot]=(test_new[not_onehot]-test_new[not_onehot].min())/(test_new[not_onehot].max()-test_new[not_onehot].min())
# test_new = test_new.applymap(lambda x: 1 if x>0.5 else 0)
# test_new.drop('PhoneId',axis = 1, inplace=True)

## Using pd.cut
# print(test_new.shape)
# test_new[not_onehot] = test_new[not_onehot].apply(pd.cut, bins=2, labels=[0, 1])
# test_new.drop(one_hot_drop, axis = 1, inplace=True)
# test_new.drop('PhoneId',axis = 1, inplace=True)
# print(test_new.shape)


## Using z-score and pd.cut
# test_new[not_onehot]=(test_new[not_onehot]-test_new[not_onehot].mean())/test_new[not_onehot].std()
# test_new[not_onehot] = test_new[not_onehot].apply(pd.cut, bins=2, labels=[0, 1])
# test_new.drop('PhoneId',axis = 1, inplace=True)
# test_new.drop('Weight', axis = 1) 


# Predict using model
# test_new_preds = mp_neuron.predict(test_new.values)

# # Create dataframe
# submission = pd.DataFrame({'PhoneId':temp, 'Class':test_new_preds})
# submission = submission[['PhoneId', 'Class']]
# submission.head()

# # Write to csv file
# submission.to_csv("./mp_neuron/submission_median_new_0.1_dropintest.csv", index=False)


In [44]:
test_new.head()

Unnamed: 0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G
0,3,4230,156.2,271,5,2,80.85,168,1.8,16,...,0,0,0,1,0,0,0,0,0,1
1,11,5000,156.0,402,12,4,81.6,205,1.8,64,...,0,0,0,1,0,0,0,0,0,1
2,13,3500,156.7,409,25,6,83.84,169,2.0,64,...,0,0,0,1,0,0,0,0,0,1
3,16,3500,156.7,409,16,4,83.84,169,2.0,64,...,0,0,0,1,0,0,0,0,0,1
4,19,4000,158.6,403,20,4,77.43,181,1.8,64,...,0,0,0,1,0,0,0,0,0,1


In [45]:
X_train_temp.head()

Unnamed: 0,os_name_Tizen,Num_cores_Dual,Brand_Lyf,Brand_Spice,os_name_KAI,Sim1_2G,Brand_Nubia,Brand_10.or,Brand_OnePlus,Capacity,...,Sim1_4G,Brand_Razer,Brand_Xiaomi Poco,Resolution,Brand_iVooMi,Brand_Blackberry,Brand_Billion,Brand_Apple,Brand_Do,Brand_HTC
163,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
X_train_temp.shape

(299, 71)

In [47]:
train_new.groupby('Rating').mean()

Unnamed: 0_level_0,PhoneId,Capacity,Height,Pixel Density,Resolution,RAM,Screen to Body Ratio (calculated),Weight,Processor_frequency,Internal Memory,...,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Sim1_2G,Sim1_3G,Sim1_4G
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,299.485437,2984.805825,146.980823,327.116505,7.601942,27.543689,69.088544,158.84466,1.568447,30.330097,...,0.019417,0.038835,0.009709,0.436893,0.009709,0.485437,0.0,0.009709,0.048544,0.941748
1,204.184874,3280.676471,151.229813,351.609244,10.453782,4.012605,73.766261,162.42437,1.889538,53.310924,...,0.0,0.02521,0.046218,0.642857,0.021008,0.256303,0.004202,0.02521,0.008403,0.966387


In [48]:
absCorrWithTar = list()
target = y_train
for col in X_train_temp.columns:
    feature  = X_train_temp[col]
    absCorrWithTar.append((col, abs(target.corr(feature))))
# print(absCorrWithTar)
absCorrWithTar.sort(key = lambda x: x[1])
# print(absCorrWithTar[:15])
rem_cols = [i[0] for i in absCorrWithTar[:15]]
print(rem_cols)


['os_name_Tizen', 'Num_cores_Dual', 'Brand_Lyf', 'Brand_Spice', 'os_name_KAI', 'Sim1_2G', 'Brand_Lava', 'Brand_Lenovo', 'Brand_Moto', 'Brand_Oppo', 'Brand_Ulefone', 'Num_cores_Tru-Octa', 'Brand_Motorola', 'Brand_Nokia', 'Brand_Panasonic']


In [49]:
print(absCorrWithTar[-10:])

[('Internal Memory', 0.1953408948734712), ('Num_cores_Quad', 0.1955004105888857), ('Screen to Body Ratio (calculated)', 0.2026945606608697), ('SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE', 0.21025257233834413), ('Screen Size', 0.2590262178823988), ('os_name_Android', nan), ('Brand_Reliance', nan), ('Sim1_4G', nan), ('Brand_Billion', nan), ('Brand_Do', nan)]
