In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

|Task: To predict whether the user likes the mobile phone or not. <br>
Assumption: If the average rating of mobile >= threshold, then the user likes it, otherwise not.

<b>Missing values:</b><br>
'Also Known As'(459),'Applications'(421),'Audio Features'(437),'Bezel-less display'(266),'Browser'(449),'Build Material'(338),'Co-Processor'(451),'Display Colour'(457),'Mobile High-Definition Link(MHL)'(472),'Music'(447)
'Email','Fingerprint Sensor Position'(174),'Games'(446),'HDMI'(454),'Heart Rate Monitor'(467),'IRIS Scanner'(467),
'Optical Image Stabilisation'(219),'Other Facilities'(444),'Phone Book'(444),'Physical Aperture'(87),'Quick Charging'(122),'Ring Tone'(444),'Ruggedness'(430),SAR Value(315),'SIM 3'(472),'SMS'(470)', 'Screen Protection'(229),'Screen to Body Ratio (claimed by the brand)'(428),'Sensor'(242),'Software Based Aperture'(473),
'Special Features'(459),'Standby time'(334),'Stylus'(473),'TalkTime'(259), 'USB Type-C'(374),'Video Player'(456),
'Video Recording Features'(458),'Waterproof'(398),'Wireless Charging','USB OTG Support'(159), 'Video ,'Recording'(113),'Java'(471),'Browser'(448)

<b>Very low variance:</b><br>
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'


<b>Multivalued:</b><br>
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

<b>Not important:</b><br>
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

<b>Doubtful:</b><br>
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing),
'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'
    
<b>To check:</b><br>
'Display Type','Expandable Memory','FM Radio'

<b>High Correlation with other features</b><br>
'SIM Slot(s)' high correlation with SIM1
'Weight' has high high correlation with capacity , screen-to-body ratio
'Height' - screen size is also there
    
<b>Given a mobile, we can't directly get these features</b><br>
'Rating Count', 'Review Count'

<b>Keeping:</b><br>
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',


In [2]:
# read data from file
train = pd.read_csv("../input/train.csv") 
test = pd.read_csv("../input/test.csv")

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98


In [3]:
def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Browser'
                         'Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features

In [4]:
train = data_clean(train)
test = data_clean(test)

removing all those data points in which more than 15 features are missing 

In [5]:
train = train[(train.isnull().sum(axis=1) <= 15)]
# You shouldn't remove data points from test set
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [6]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 47
Number of data points in test: 119
Number of features in test: 46


# Filling Missing values

In [7]:
def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass


In [8]:
def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

In [9]:
train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 51
Number of data points in test: 119
Number of features in test: 50


Not very important feature

In [10]:
def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data

In [11]:
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17


In [12]:
# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

(460, 17)
Index(['PhoneId', 'Screen Size', 'os_name', 'Pixel Density', 'Weight', 'SIM 2',
       'Num_cores', 'Capacity', 'Processor_frequency', 'Height', 'Resolution',
       'SIM Slot(s)', 'Internal Memory', 'RAM', 'Brand',
       'Screen to Body Ratio (calculated)', 'Sim1'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Screen Size', 'Pixel Density', 'Weight', 'Capacity',
       'Processor_frequency', 'Height', 'Resolution', 'Internal Memory', 'RAM',
       'Screen to Body Ratio (calculated)', 'os_name_Android',
       'os_name_Blackberry', 'os_name_KAI', 'os_name_Nokia', 'os_name_Other',
       'os_name_Tizen', 'os_name_iOS', 'SIM 2_2G', 'SIM 2_3G', 'SIM 2_4G',
       'SIM 2_Other', 'Num_cores_312', 'Num_cores_Deca', 'Num_cores_Dual',
       'Num_cores_Hexa', 'Num_cores_Octa', 'Num_cores_Other', 'Num_cores_Quad',
       'Num_cores_Tru-Octa', 'SIM Slot(s)_Dual SIM, GSM+CDMA',
       'SIM Slot(s)_Dual SIM, GSM+GSM',
       'SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE',
       'SIM

In [13]:
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [14]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 341
Number of features in train: 88
Number of data points in test: 119
Number of features in test: 87


In [None]:
train_new.head()

In [None]:
train[train['Brand'] == 'Nokia']

In [None]:
train_new[train_new['Rating'] == 4.1].groupby('Rating').mean()

In [None]:
train_new[['Rating']].groupby('RAM').mean()

In [None]:
train_new[['Rating','Weight']].groupby('Rating').mean()

In [None]:
(train_new['Internal Memory'])

In [None]:
train_new['Internal Memory'].count()

In [None]:
train_new[train_new['Rating'] == 5]

In [None]:
X_binarised_train.iloc[240]

In [None]:
train_new[train_new['PhoneId'] == 313]

In [None]:
train_new[train_new['Pixel Density'] >= 190 ].groupby('Rating').mean()


In [None]:
train_new['Height'].describe()

In [None]:
X_binarised_train.iloc[272]

In [None]:
train_

In [None]:
X_binarised_train.iloc[182]

In [None]:
train[train['RAM']==16]

In [None]:
test_new.head()

## Dummy Solution

In [None]:
submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class':[0]*test_new.shape[0]})
submission = submission[['PhoneId', 'Class']]
submission.head()

In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
"""
WRITE YOUR MODELLING CODE HERE
"""

In [19]:
train_n = train_new.copy()

In [None]:
train_n['Weight']

In [44]:
train_new[['RAM','Rating']].groupby('RAM').mean()

Unnamed: 0_level_0,Rating
RAM,Unnamed: 1_level_1
0.15,4.2
0.3,4.1
0.45,4.1
0.5,3.44
1.0,3.824138
2.0,4.046269
3.0,3.983838
4.0,4.182979
6.0,4.358824
8.0,4.13


In [40]:
train_new['RAM']=train_n['RAM'].replace(512,0.5)
train_n = train_new.copy()

In [41]:
train_new['RAM']=train_n['RAM'].replace(16,0.15)
train_n = train_new.copy()

In [42]:
train_new['RAM']=train_n['RAM'].replace(32,0.3)
train_n = train_new.copy()

In [43]:
train_new['RAM']=train_n['RAM'].replace(64,0.45)
train_n = train_new.copy()

In [22]:
train_s = test_new

In [45]:
test_new['RAM'] = train_s['RAM'].replace(512,0.5)
train_s = test_new
test_new['RAM']=train_s['RAM'].replace(16,0.15)
train_s = test_new
test_new['RAM']=train_s['RAM'].replace(32,0.3)
train_s = test_new
test_new['RAM']=train_s['RAM'].replace(64,0.45)
train_s = test_new

In [83]:
x = train_new[train_new['Rating'] == 4.0]
x.mean()

PhoneId                             NaN
Screen Size                         NaN
Pixel Density                       NaN
Weight                              NaN
Capacity                            NaN
Processor_frequency                 NaN
Height                              NaN
Resolution                          NaN
Internal Memory                     NaN
RAM                                 NaN
Screen to Body Ratio (calculated)   NaN
os_name_Android                     NaN
os_name_Blackberry                  NaN
os_name_KAI                         NaN
os_name_Nokia                       NaN
os_name_Other                       NaN
os_name_Tizen                       NaN
os_name_iOS                         NaN
SIM 2_2G                            NaN
SIM 2_3G                            NaN
SIM 2_4G                            NaN
SIM 2_Other                         NaN
Num_cores_312                       NaN
Num_cores_Deca                      NaN
Num_cores_Dual                      NaN


In [48]:
X_binarised_3_train = train_n['Weight'].map(lambda x: 0 if x<160.105263 else 1)
train_new['Weight'] = X_binarised_3_train

In [49]:
X_binarised_3_train = train_n['Capacity'].map(lambda x: 0 if x<3176.315789 else 1)
train_new['Capacity'] = X_binarised_3_train

In [50]:
X_binarised_3_train = train_n['Processor_frequency'].map(lambda x: 0 if x<1.627632 else 1)
train_new['Processor_frequency'] = X_binarised_3_train

In [51]:
X_binarised_3_train = train_n['Screen Size'].map(lambda x: 0 if x<5.348158 else 1)
train_new['Screen Size'] = X_binarised_3_train

In [52]:
X_binarised_3_train = train_n['Internal Memory'].map(lambda x: 0 if x<34.526316 else 1)
train_new['Internal Memory'] = X_binarised_3_train

In [53]:
X_binarised_3_train = train_n['RAM'].map(lambda x: 0 if x<2.868421 else 1)
train_new['RAM'] = X_binarised_3_train

In [54]:
X_binarised_3_train = train_n['Height'].map(lambda x: 0 if x<150.006484 else 1)
train_new['Height'] = X_binarised_3_train

In [55]:
X_binarised_3_train = train_n['Screen to Body Ratio (calculated)'].map(lambda x: 0 if x<70.078421 else 1)
train_new['Screen to Body Ratio (calculated)'] = X_binarised_3_train

In [56]:
X_binarised_3_train = train_n['Resolution'].map(lambda x: 0 if x<9.394737 else 1)
train_new['Resolution'] = X_binarised_3_train

In [57]:
X_binarised_3_train = train_n['Pixel Density'].map(lambda x: 0 if x<324.263158	 else 1)
train_new['Pixel Density'] = X_binarised_3_train

In [58]:
X_binarised_3_train = train_n['Rating'].map(lambda x: 0 if x<4.0 else 1)
train_new['Rating'] = X_binarised_3_train

In [59]:
import matplotlib.pyplot as plt

In [60]:
train_new[train_new['Rating'] == 3.9].groupby('Rating').mean()

Unnamed: 0_level_0,PhoneId,Screen Size,Pixel Density,Weight,Capacity,Processor_frequency,Height,Resolution,Internal Memory,RAM,Screen to Body Ratio (calculated),os_name_Android,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,SIM 2_2G,SIM 2_3G,SIM 2_4G,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,"SIM Slot(s)_Dual SIM, GSM+CDMA","SIM Slot(s)_Dual SIM, GSM+GSM","SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE","SIM Slot(s)_Single SIM, GSM",Brand_10.or,Brand_Apple,Brand_Asus,Brand_Billion,Brand_Blackberry,Brand_Comio,...,Brand_InFocus,Brand_Infinix,Brand_Intex,Brand_Itel,Brand_Jivi,Brand_Karbonn,Brand_LG,Brand_Lava,Brand_LeEco,Brand_Lenovo,Brand_Lephone,Brand_Lyf,Brand_Meizu,Brand_Micromax,Brand_Mobiistar,Brand_Moto,Brand_Motorola,Brand_Nokia,Brand_Nubia,Brand_OPPO,Brand_OnePlus,Brand_Oppo,Brand_Panasonic,Brand_Razer,Brand_Realme,Brand_Reliance,Brand_Samsung,Brand_Sony,Brand_Spice,Brand_Tecno,Brand_Ulefone,Brand_VOTO,Brand_Vivo,Brand_Xiaomi,Brand_Xiaomi Poco,Brand_Yu,Brand_iVooMi,Sim1_2G,Sim1_3G,Sim1_4G
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1


In [61]:
X_binarised_3_train = train_s['Weight'].map(lambda x: 0 if x<160.105263 else 1)
test_new['Weight'] = X_binarised_3_train

In [62]:
X_binarised_3_train = train_s['Capacity'].map(lambda x: 0 if x<3176.315789 else 1)
test_new['Capacity'] = X_binarised_3_train

In [63]:
X_binarised_3_train = train_s['Processor_frequency'].map(lambda x: 0 if x<1.627632 else 1)
test_new['Processor_frequency'] = X_binarised_3_train

In [64]:
X_binarised_3_train = train_s['Screen Size'].map(lambda x: 0 if x<5.348158 else 1)
test_new['Screen Size'] = X_binarised_3_train

In [65]:
X_binarised_3_train = train_s['Internal Memory'].map(lambda x: 0 if x<34.526316 else 1)
test_new['Internal Memory'] = X_binarised_3_train

In [66]:
X_binarised_3_train = train_s['RAM'].map(lambda x: 0 if x<2.868421 else 1)
test_new['RAM'] = X_binarised_3_train

In [67]:
X_binarised_3_train = train_s['Height'].map(lambda x: 0 if x<150.006484 else 1)
test_new['Height'] = X_binarised_3_train

In [68]:
X_binarised_3_train = train_s['Screen to Body Ratio (calculated)'].map(lambda x: 0 if x<70.078421 else 1)
test_new['Screen to Body Ratio (calculated)'] = X_binarised_3_train

In [69]:
X_binarised_3_train = train_s['Resolution'].map(lambda x: 0 if x<9.394737 else 1)
test_new['Resolution'] = X_binarised_3_train

In [70]:
X_binarised_3_train = train_s['Pixel Density'].map(lambda x: 0 if x<324.263158 else 1)
test_new['Pixel Density'] = X_binarised_3_train

In [71]:
X_binarised_train = train_new
X_binarised_testF = test_new

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
X1 = X_binarised_train.drop('Rating',axis =1)
X = X1.drop('PhoneId',axis=1)
Y = X_binarised_train['Rating']
X_binarised_testF = X_binarised_testF.drop('PhoneId',axis = 1)
X_binarised_testF

Unnamed: 0,Screen Size,Pixel Density,Weight,Capacity,Processor_frequency,Height,Resolution,Internal Memory,RAM,Screen to Body Ratio (calculated),os_name_Android,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,SIM 2_2G,SIM 2_3G,SIM 2_4G,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,"SIM Slot(s)_Dual SIM, GSM+CDMA","SIM Slot(s)_Dual SIM, GSM+GSM","SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE","SIM Slot(s)_Single SIM, GSM",Brand_10.or,Brand_Apple,Brand_Asus,Brand_Billion,Brand_Blackberry,Brand_Comio,Brand_Coolpad,...,Brand_InFocus,Brand_Infinix,Brand_Intex,Brand_Itel,Brand_Jivi,Brand_Karbonn,Brand_LG,Brand_Lava,Brand_LeEco,Brand_Lenovo,Brand_Lephone,Brand_Lyf,Brand_Meizu,Brand_Micromax,Brand_Mobiistar,Brand_Moto,Brand_Motorola,Brand_Nokia,Brand_Nubia,Brand_OPPO,Brand_OnePlus,Brand_Oppo,Brand_Panasonic,Brand_Razer,Brand_Realme,Brand_Reliance,Brand_Samsung,Brand_Sony,Brand_Spice,Brand_Tecno,Brand_Ulefone,Brand_VOTO,Brand_Vivo,Brand_Xiaomi,Brand_Xiaomi Poco,Brand_Yu,Brand_iVooMi,Sim1_2G,Sim1_3G,Sim1_4G
0,1,0,1,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
5,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
6,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7,1,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
8,1,0,0,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
9,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [74]:
X_train, X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.1,stratify = Y , random_state = 1)

In [75]:
X_binarised_test = X_test.values
X_binarised_train = X_train.values

In [76]:
max_acc = 0
max_b = 0

    
for b in range(X_binarised_train.shape[1] +1):
    Y_pred_train = []
    accurate_row = 0

    for x,y in zip(X_binarised_train,Y_train):
        y_pred = (np.sum(x) >= b)
        Y_pred_train.append(y_pred)
        accurate_row += (y == y_pred)
    print(b, accurate_row/X_binarised_train.shape[0])
    if((accurate_row/X_binarised_train.shape[0])>max_acc):
        max_acc = accurate_row/X_binarised_train.shape[0]
        max_b = b
print(max_acc,max_b)

0 0.6993464052287581
1 0.6993464052287581
2 0.6993464052287581
3 0.6993464052287581
4 0.6993464052287581
5 0.6993464052287581
6 0.6993464052287581
7 0.7026143790849673
8 0.6633986928104575
9 0.6405228758169934
10 0.6339869281045751
11 0.6209150326797386
12 0.5882352941176471
13 0.5490196078431373
14 0.477124183006536
15 0.4444444444444444
16 0.39869281045751637
17 0.3006535947712418
18 0.3006535947712418
19 0.3006535947712418
20 0.3006535947712418
21 0.3006535947712418
22 0.3006535947712418
23 0.3006535947712418
24 0.3006535947712418
25 0.3006535947712418
26 0.3006535947712418
27 0.3006535947712418
28 0.3006535947712418
29 0.3006535947712418
30 0.3006535947712418
31 0.3006535947712418
32 0.3006535947712418
33 0.3006535947712418
34 0.3006535947712418
35 0.3006535947712418
36 0.3006535947712418
37 0.3006535947712418
38 0.3006535947712418
39 0.3006535947712418
40 0.3006535947712418
41 0.3006535947712418
42 0.3006535947712418
43 0.3006535947712418
44 0.3006535947712418
45 0.300653594771241

In [77]:
from sklearn.metrics import accuracy_score
b = 7
Y_pred_test = []
accurate_row = 0
for x in zip(X_binarised_test):
  y_pred = (np.sum(x) >= b)
  Y_pred_test.append(y_pred)
accuracy = accuracy_score(Y_pred_test,Y_test)
print(b,accuracy)

7 0.7714285714285715


In [78]:
X_final = X_binarised_testF.values

In [79]:
b = 7
Y_pred_test = []
for x in X_final:
    if (np.sum(x)>= b):
        y_pred = 1
    else:
        y_pred = 0
    Y_pred_test.append(y_pred)
print(Y_pred_test)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1]


In [80]:
os.chdir("/kaggle/working/")

In [81]:
submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class':Y_pred_test})
submission = submission[['PhoneId', 'Class']]
submission.describe()

Unnamed: 0,PhoneId,Class
count,119.0,119.0
mean,240.176471,0.890756
std,136.804614,0.313264
min,3.0,0.0
25%,130.0,1.0
50%,245.0,1.0
75%,348.5,1.0
max,473.0,1.0


In [82]:
submission.to_csv("submission.csv", index=False)