In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

# read data from file
# train = pd.read_csv("../input/train.csv") 
# test = pd.read_csv("../input/test.csv")
train = pd.read_csv("train.csv") 
test = pd.read_csv("test.csv")


# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Browser'
                         'Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features
train = data_clean(train)
test = data_clean(test)

train = train[(train.isnull().sum(axis=1) <= 15)]

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98
Number of data points in train: 341
Number of features in train: 47
Number of data points in test: 119
Number of features in test: 46
Number of data points in train: 341
Number of features in train: 51
Number of data points in test: 119
Number of features in test: 50
Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17
(460, 17)
Index(['PhoneId', 'Capacity', 'Num_cores', 'Screen to Body Ratio (calculated)',
       'os_name', 'SIM Slot(s)', 'Brand', 'Internal Memory', 'Resolution',
       'Pixel Density', 'SIM 2', 'RAM', 'Height', 'Screen Size', 'Weight',
       'Processor_frequency', 'Sim1'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Capacity', 'Screen to Body Ratio (calculated)',
       'Internal Memory', 'Resolution', 'Pixel Density', 'RAM', 'Height',
       'Scre

## Solution

In [2]:
print(train_new.shape, test_new.shape)

(341, 88) (119, 87)


In [3]:
train_new['Rating'][25:40]

25    4.3
26    4.3
27    4.6
28    4.4
29    4.5
30    3.9
31    4.2
32    4.3
33    4.2
34    4.3
35    4.3
36    4.2
37    4.4
38    4.1
39    3.8
Name: Rating, dtype: float64

### Binarize the target labels

In [4]:
train_new['Rating'] = train_new['Rating'].apply(lambda x: 1 if x>=4 else 0)
train_new['Rating'][25:40]

25    1
26    1
27    1
28    1
29    1
30    0
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    0
Name: Rating, dtype: int64

In [5]:
print(train_new.shape)
train_new_modi = train_new[train_new.RAM < 16]
print(train_new_modi.shape)

(341, 88)
(333, 88)


### Split train_new into X and y.

In [6]:
y = train_new_modi['Rating']
X = train_new_modi.drop('Rating', axis = 1)  # Not inplace, train_new is not affected

In [7]:
print(X.shape, y.shape)
print(y.value_counts())

(333, 87) (333,)
1    235
0     98
Name: Rating, dtype: int64


### Split the training data into training and validation set

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42, stratify = y, test_size = 0.1)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
print(y_train.mean(), y_val.mean())

(299, 87) (34, 87) (299,) (34,)
0.705685618729097 0.7058823529411765


### Standardizing the data

Standardization should not be done for one hot encoded columns

In [9]:
# Find all columns which are one-hot encoded
not_onehot = list()
onehot = list()
for i in X_train.columns:
    if X_train[i].dtype != 'uint8':
        not_onehot.append(i)
    else:
        onehot.append(i)

Standardizing by median instead of mean, makes it robust to outliers. We do not divide by standard deviation due to the way we binarize the data below. Dividing/Not dividing by standard deviation will not have any impact on the final binarized data. Hence, it was not scaled by the standard deviation.

In [10]:
X_train[not_onehot]=(X_train[not_onehot]-X_train[not_onehot].median())
X_val[not_onehot]=(X_val[not_onehot]-X_val[not_onehot].median())

### Binarization of data

MP neuron model only accepts binary inputs. We take all values below median as 0 and above median as 1.

In [11]:
X_train = X_train.applymap(lambda x: 1 if x>0 else 0)
X_val = X_val.applymap(lambda x: 1 if x>0 else 0)

In [12]:
X_train.shape

(299, 87)

In [13]:
X_train.head(10)

Unnamed: 0,PhoneId,Capacity,Screen to Body Ratio (calculated),Internal Memory,Resolution,Pixel Density,RAM,Height,Screen Size,Weight,...,Brand_Xiaomi Poco,Brand_Yu,Brand_iVooMi,SIM 2_2G,SIM 2_3G,SIM 2_4G,SIM 2_Other,Sim1_2G,Sim1_3G,Sim1_4G
163,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
262,1,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
19,0,1,1,0,0,0,0,1,1,0,...,0,0,0,0,0,1,0,0,0,1
241,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
318,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
97,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
186,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
194,1,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
332,1,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
40,0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1


In [14]:
train_new.iloc[[163, 262, 19, 241, 318, 97, 186, 194, 332, 40], :]

Unnamed: 0,PhoneId,Capacity,Screen to Body Ratio (calculated),Internal Memory,Resolution,Pixel Density,RAM,Height,Screen Size,Weight,...,Brand_Yu,Brand_iVooMi,SIM 2_2G,SIM 2_3G,SIM 2_4G,SIM 2_Other,Sim1_2G,Sim1_3G,Sim1_4G,Rating
163,221,4000,73.05,32,5,295,3,148.0,5.45,161,...,1,0,0,0,1,0,0,0,1,0
262,364,3080,71.36,64,16,267,4,153.0,5.5,153,...,0,0,0,0,1,0,0,0,1,0
19,28,4000,80.64,32,8,269,3,158.4,6.26,160,...,0,0,0,0,1,0,0,0,1,1
241,330,2600,67.91,16,5,294,2,142.4,5.0,138,...,0,0,0,1,0,0,0,0,1,1
318,442,2000,63.45,8,5,218,1,132.7,4.5,138,...,0,0,0,0,1,0,0,0,1,0
97,129,4000,73.05,16,5,295,2,148.0,5.45,161,...,1,0,0,0,1,0,0,0,1,1
186,251,3020,65.95,16,13,294,2,145.2,5.0,156,...,0,0,0,0,1,0,0,0,1,0
194,267,3800,78.45,128,8,402,8,158.1,6.0,185,...,0,0,0,0,1,0,0,0,1,0
332,458,3000,70.39,32,13,401,3,154.3,5.5,160,...,0,0,0,0,1,0,0,0,1,1
40,57,3500,83.68,64,25,409,4,157.0,6.3,168,...,0,0,0,0,1,0,0,0,1,1


## MP Neuron Class

In [15]:
from sklearn.metrics import accuracy_score
class MPNeuron:
  
    def __init__(self):
        self.b = None
    
    def model(self, x):
        return int(sum(x) >= self.b)
  
    def predict(self, X):
        Y = []
        for x in X:
            result = self.model(x)
            Y.append(result)
        return np.array(Y)
  
    def fit(self, X, Y):
        accuracy = {}

        for b in range(X.shape[1] + 1):
            self.b = b
            Y_pred = self.predict(X)
            accuracy[b] = accuracy_score(Y_pred, Y)
        best_b = max(accuracy, key = accuracy.get)
        self.b = best_b
        
        return best_b, accuracy[best_b]

## Remove unneccessary one-hot columns

Many one-hot encoded columns contain very few positive cases i.e. value is 1.

Hypothesis: these features will not help the mp neuron model to generalize. As the drawback in mp neuron model is that all columns/features are given equal weight.

In [16]:
for col in onehot[:10]:
    temp = X[col].value_counts().values
    try:
        print('0:', temp[0], '; 1:', temp[1])
    except:
        print('0:', temp[0], '; 1: 0')

0: 333 ; 1: 0
0: 331 ; 1: 2
0: 326 ; 1: 7
0: 321 ; 1: 12
0: 198 ; 1: 135
0: 329 ; 1: 4
0: 224 ; 1: 109
0: 332 ; 1: 1
0: 307 ; 1: 26
0: 331 ; 1: 2


## Removing uncorrelated columns

In [17]:
absCorrWithTar = list()
target = y_train
for col in X_train.columns:
    feature  = X_train[col]
    absCorrWithTar.append((col, abs(target.corr(feature))))

absCorrWithTar.sort(key = lambda x: x[1])

In [26]:
absCorrWithTar

[('Weight', 0.0021237628878150454),
 ('os_name_Other', 0.008619438945241703),
 ('Num_cores_Other', 0.011322760005844552),
 ('Brand_Gionee', 0.016122478763279702),
 ('Num_cores_Tru-Octa', 0.03741036766243989),
 ('SIM Slot(s)_Dual SIM, GSM+CDMA', 0.03741036766243989),
 ('Brand_Comio', 0.03741036766244008),
 ('os_name_Nokia', 0.03741036766244023),
 ('Brand_Asus', 0.05145602725439722),
 ('Num_cores_Dual', 0.05145602725439727),
 ('Pixel Density', 0.053972280300152754),
 ('Brand_Google', 0.0650151966155363),
 ('SIM Slot(s)_Single SIM, GSM', 0.06907527555641513),
 ('Brand_Huawei', 0.07520021739730647),
 ('Num_cores_Hexa', 0.08722700633704403),
 ('Brand_Honor', 0.10839584423014782),
 ('os_name_Android', 0.1097543511049269),
 ('Height', 0.12131366415089138),
 ('Num_cores_Deca', 0.12706813763039565),
 ('Brand_10.or', 0.12706813763039565),
 ('Brand_Coolpad', 0.12706813763039573),
 ('Brand_InFocus', 0.1270681376303958),
 ('os_name_Blackberry', 0.127068137630396),
 ('Resolution', 0.1284267478223511

In [18]:
X_train.drop('PhoneId', axis = 1, inplace=True)
X_val.drop('PhoneId', axis = 1, inplace=True)

In [19]:
X_train_fixed = X_train.copy()
X_val_fixed = X_val.copy()

In [20]:
print(X_train.shape)

(299, 86)


In [21]:
# percentage = np.linspace(0, 0.2, num = 50)
# val_accs = list()

# for i in range(len(X_train.columns)//2):
    
#     NUM_COLUMNS_DROP = i
#     remove_cols = [i[0] for i in absCorrWithTar[:NUM_COLUMNS_DROP]]

#     columns_to_retain = list(set(X_train_fixed.columns)-set(remove_cols))
    

    
#     X_train = X_train_fixed.copy()
#     X_val = X_val_fixed.copy()
    
#     print(X_train.shape, X_val.shape)
#     X_train = X_train[columns_to_retain]
#     X_val = X_val[columns_to_retain]

#     print('Number of columns dropped: ', i)
#     print(X_train.shape, X_val.shape)
#     ## Grid Search on percentage

    
#     for perc in percentage:

#         # Reassign the non edited dataframes after each loop
#         X_train_temp = X_train.copy()
#         X_val_temp = X_val.copy()

        
#         # Find which columns to drop based on number of positive cases. If very few positive cases(percentage) for the feature. Drop it.
#         one_hot_drop = list()
#         for i in onehot:
#             try:
#                 if X_train[i].shape[0]*perc > X_train[i].sum(axis=0):
#                     one_hot_drop.append(i)
#             except: 
#                 pass

#         # Drop the columns
#         X_train_temp.drop(one_hot_drop, axis = 1, inplace=True)
#         X_val_temp.drop(one_hot_drop, axis = 1, inplace=True)


#         # Instantiate new model
#         mp_neuron = MPNeuron()
#         b, train_acc = mp_neuron.fit(X_train_temp.values, y_train.values)

# #         print('Percentage removed :', perc)
# #         print('No.of features     :', X_train_temp.shape[1])
# #         print('Optimal value for b:', b)
# #         print('Training accuracy  :', train_acc)

#         Y_val_pred = mp_neuron.predict(X_val_temp.values)
#         val_acc = accuracy_score(Y_val_pred, y_val)

# #         print('Validation accuracy:',val_acc)
        
#         val_accs.append((train_acc, val_acc, perc, b, NUM_COLUMNS_DROP))
#     print('-'*50)


# max_val = 0
# Final_config = None
# for i in val_accs:
#     if i[1]>max_val:
#         Final_config = i
#         max_val = i[1]
# print(Final_config)

## Final model

Using the best percentage from the grid search, we train the final model.

In [23]:
# (0.7357859531772575, 0.7941176470588235, 0.012244897959183675, 6, 1)
perc = 0.012244897959183675

mp_neuron = MPNeuron()

X_train_temp = X_train_fixed.copy()
X_val_temp = X_val_fixed.copy()


# Find which columns to drop
one_hot_drop = list()
for i in onehot:
    if X_train_fixed[i].shape[0]*perc > X_train_fixed[i].sum(axis=0):
        one_hot_drop.append(i)

# Drop the columns
X_train_temp.drop(one_hot_drop, axis = 1, inplace=True)
X_val_temp.drop(one_hot_drop, axis = 1, inplace=True)

try:
    X_train_temp.drop('Weight', axis = 1 , inplace = True)
    X_val_temp.drop('Weight', axis = 1, inplace=True)

    X_train_temp.drop('PhoneId', axis = 1, inplace=True)
    X_val_temp.drop('PhoneId', axis = 1, inplace=True)
except:
    pass


mp_neuron = MPNeuron()
_, train_acc = mp_neuron.fit(X_train_temp.values, y_train.values)

Y_val_pred = mp_neuron.predict(X_val_temp.values)
val_acc = accuracy_score(Y_val_pred, y_val)
print('Max training accuracy:', train_acc)
print('Max Validation accuracy:', val_acc)

Max training accuracy: 0.7357859531772575
Max Validation accuracy: 0.7941176470588235


## Submission of csv file

Same transformation have to be performed on test set as the ones performed on training set. This is the preprocessing step before using our model for prediction.

In [24]:
temp = test_new['PhoneId']

# Drop PhoneId in test set as well as the insignificant one hot encoded columns. Percentage to drop must be same.

test_new[not_onehot]=(test_new[not_onehot]-test_new[not_onehot].median())
test_new = test_new.applymap(lambda x: 1 if x>0 else 0)
# print(test_new.shape)
test_new.drop(one_hot_drop, axis = 1, inplace=True)
test_new.drop('PhoneId',axis = 1, inplace=True)
test_new.drop('Weight', axis = 1, inplace=True)
# print(test_new.shape)

In [25]:
# Predict using model
test_new_preds = mp_neuron.predict(test_new.values)

# Create dataframe
submission = pd.DataFrame({'PhoneId':temp, 'Class':test_new_preds})
submission = submission[['PhoneId', 'Class']]
submission.head()

# Write to csv file
submission.to_csv("./mp_neuron/submission.csv", index=False)