In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
mingw_path = 'C:\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from datetime import date
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
import xgboost as xgb

In [2]:
datadir = ''
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')

In [3]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

In [4]:
gatrain.head()

Unnamed: 0_level_0,gender,age,group,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8076087639492063270,M,35,M32-38,0
-2897161552818060146,M,35,M32-38,1
-8260683887967679142,M,35,M32-38,2
-4938849341048082022,M,30,M29-31,3
245133531816851882,M,30,M29-31,4


In [5]:
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
phone.head()

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [6]:
phone.shape

(187245, 3)

In [7]:
phone.duplicated().sum()

523

In [8]:
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
phone.shape

(186716, 2)

In [9]:
phone.head()

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [10]:
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'])
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [11]:
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'),
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
appevents.head()

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False
2,2,-1633887856876571208,False
3,2,-653184325010919369,True
4,2,8693964245073640147,True


In [12]:
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
applabels.head()

Unnamed: 0,app_id,label_id
0,7324884708820027918,251
1,-4494216993218550286,251
2,6058196446775239644,406
3,6058196446775239644,407
4,8694625920731541625,406


In [13]:
labelcategories = pd.read_csv(os.path.join(datadir,'label_categories.csv'))
labelcategories.head()

Unnamed: 0,label_id,category
0,1,
1,2,game-game type
2,3,game-Game themes
3,4,game-Art Style
4,5,game-Leisure time


In [14]:
print('Number of unique categores: {}, Number of unique categores in apps {}, Number of categories that match {}'.format(labelcategories.shape[0],len(applabels.label_id.unique()),labelcategories.label_id.isin(applabels.label_id).sum()))

Number of unique categores: 930, Number of unique categores in apps 507, Number of categories that match 507


# Device Brand

In [15]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.brand)))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


# Device Model

In [16]:
m = phone.phone_brand.str.cat(phone.device_model)
m.head()

device_id
-8890648629457979026               小米红米
 1277779817574759137             小米MI 2
 5137427614288105724        三星Galaxy S4
 3669464369358936369          SUGAR时尚手机
-5019277647504317457    三星Galaxy Note 2
Name: phone_brand, dtype: object

In [17]:
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [18]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
appevents.head()

Unnamed: 0,event_id,app_id,is_active,app
0,2,5927333115845830913,True,15408
1,2,-5720078949152207372,False,3384
2,2,-1633887856876571208,False,7620
3,2,-653184325010919369,True,8902
4,2,8693964245073640147,True,18686


In [19]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [20]:
latLong = pd.read_csv("latLong.csv")
latLong.head()

Unnamed: 0,longitude,latitude,region,regionLabel
0,-180.0,0.0,,
1,-155.99,19.82,1.0,USA
2,-122.32,47.64,1.0,USA
3,-122.16,37.45,1.0,USA
4,-101.76,38.28,1.0,USA


In [21]:
events = events.merge(latLong[['longitude','latitude','regionLabel']],how='left',left_on=['longitude','latitude'],right_on=['longitude','latitude'])
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,regionLabel
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,China
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,China
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,China
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,China
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,China


In [22]:
events = events.drop(['longitude','latitude'],axis=1)
events.head()

Unnamed: 0,event_id,device_id,timestamp,regionLabel
0,1,29182687948017175,2016-05-01 00:55:25,China
1,2,-6401643145415154744,2016-05-01 00:54:12,China
2,3,-4833982096941402721,2016-05-01 00:08:05,China
3,4,-6815121365017318426,2016-05-01 00:06:40,China
4,5,-5373797595892518570,2016-05-01 00:07:18,China


In [23]:
events.dtypes

event_id                int64
device_id               int64
timestamp      datetime64[ns]
regionLabel            object
dtype: object

In [24]:
events['day_of_week']=events.timestamp.dt.dayofweek
events['time_of_day']=events.timestamp.dt.hour
events['period']=pd.cut(events.time_of_day,bins=[0,5,12,17,21,24],right=False,labels=[0,1,2,3,4])
events['period'].replace(4,0,inplace=True)
# 0:Night,1:Morning,2:Afternoon,3:Evening
events =events.drop(['time_of_day','timestamp'],axis=1)
events.head()

Unnamed: 0,event_id,device_id,regionLabel,day_of_week,period
0,1,29182687948017175,China,6,0
1,2,-6401643145415154744,China,6,0
2,3,-4833982096941402721,China,6,0
3,4,-6815121365017318426,China,6,0
4,5,-5373797595892518570,China,6,0


In [25]:
appevents.head()

Unnamed: 0,event_id,app_id,is_active,app
0,2,5927333115845830913,True,15408
1,2,-5720078949152207372,False,3384
2,2,-1633887856876571208,False,7620
3,2,-653184325010919369,True,8902
4,2,8693964245073640147,True,18686


In [26]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
appevents.head()

Unnamed: 0,event_id,app_id,is_active,app
0,2,5927333115845830913,True,15408
1,2,-5720078949152207372,False,3384
2,2,-1633887856876571208,False,7620
3,2,-653184325010919369,True,8902
4,2,8693964245073640147,True,18686


In [27]:
deviceapps=appevents.merge(events,how='left',left_on=['event_id'],right_on=['event_id'])
deviceapps.head()

Unnamed: 0,event_id,app_id,is_active,app,device_id,regionLabel,day_of_week,period
0,2,5927333115845830913,True,15408,-6401643145415154744,China,6,0
1,2,-5720078949152207372,False,3384,-6401643145415154744,China,6,0
2,2,-1633887856876571208,False,7620,-6401643145415154744,China,6,0
3,2,-653184325010919369,True,8902,-6401643145415154744,China,6,0
4,2,8693964245073640147,True,18686,-6401643145415154744,China,6,0


# Apps and Activity

In [28]:
appactivity = deviceapps.groupby(['device_id','app'])['is_active'].agg(['size',np.sum,np.mean]).reset_index()
appactivity=(appactivity.merge(gatrain[['trainrow']],how='left',left_on='device_id',right_index=True)
.merge(gatest[['testrow']],how='left',left_on='device_id',right_index=True))
appactivity.head()

Unnamed: 0,device_id,app,size,sum,mean,trainrow,testrow
0,-9222956879900151005,548,18,4.0,0.222222,21594.0,
1,-9222956879900151005,1096,18,0.0,0.0,21594.0,
2,-9222956879900151005,1248,26,15.0,0.576923,21594.0,
3,-9222956879900151005,1545,12,2.0,0.166667,21594.0,
4,-9222956879900151005,1664,18,0.0,0.0,21594.0,


In [29]:
d = appactivity.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = appactivity.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [30]:
d = appactivity.dropna(subset=['trainrow'])
Xtr_app_size = csr_matrix((d['size'], (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = appactivity.dropna(subset=['testrow'])
Xte_app_size = csr_matrix((d['size'], (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app_size.shape, Xte_app_size.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [31]:
d = appactivity.dropna(subset=['trainrow'])
Xtr_app_act = csr_matrix((np.log1p(d['sum']), (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = appactivity.dropna(subset=['testrow'])
Xte_app_act = csr_matrix((np.log1p(d['sum']), (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app_act.shape, Xte_app_act.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [32]:
d = appactivity.dropna(subset=['trainrow'])
Xtr_app_act_m = csr_matrix((d['mean'], (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = appactivity.dropna(subset=['testrow'])
Xte_app_act_m = csr_matrix((d['mean'], (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app_act_m.shape, Xte_app_act_m.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


# Location and Activity

In [33]:
locactivity = deviceapps.groupby(['device_id','regionLabel'])['is_active'].agg(['size',np.sum,np.mean]).reset_index()
locactivity=(locactivity.merge(gatrain[['trainrow']],how='left',left_on='device_id',right_index=True)
.merge(gatest[['testrow']],how='left',left_on='device_id',right_index=True))
locactivity.head()

Unnamed: 0,device_id,regionLabel,size,sum,mean,trainrow,testrow
0,-9222956879900151005,China,1069,481.0,0.449953,21594.0,
1,-9221825537663503111,China,429,237.0,0.552448,,106929.0
2,-9221026417907250887,China,363,210.0,0.578512,27155.0,
3,-9220061629197656378,China,1168,38.0,0.032534,74103.0,
4,-9220053820290758471,China,34,27.0,0.794118,,97141.0


In [34]:
locEncoder = LabelEncoder().fit(locactivity['regionLabel'])
locactivity['loc'] = locEncoder.transform(locactivity['regionLabel'])
nlocs = len(locEncoder.classes_)
locactivity.head()

Unnamed: 0,device_id,regionLabel,size,sum,mean,trainrow,testrow,loc
0,-9222956879900151005,China,1069,481.0,0.449953,21594.0,,5
1,-9221825537663503111,China,429,237.0,0.552448,,106929.0,5
2,-9221026417907250887,China,363,210.0,0.578512,27155.0,,5
3,-9220061629197656378,China,1168,38.0,0.032534,74103.0,,5
4,-9220053820290758471,China,34,27.0,0.794118,,97141.0,5


In [35]:
d = locactivity.dropna(subset=['trainrow'])
Xtr_loc = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d['loc'])), 
                      shape=(gatrain.shape[0],nlocs))
d = locactivity.dropna(subset=['testrow'])
Xte_loc = csr_matrix((np.ones(d.shape[0]), (d.testrow, d['loc'])), 
                      shape=(gatest.shape[0],nlocs))
print('Apps data: train shape {}, test shape {}'.format(Xtr_loc.shape, Xte_loc.shape))

Apps data: train shape (74645, 35), test shape (112071, 35)


In [36]:
d = locactivity.dropna(subset=['trainrow'])
Xtr_loc_size = csr_matrix((d['size'], (d.trainrow, d['loc'])), 
                      shape=(gatrain.shape[0],nlocs))
d = locactivity.dropna(subset=['testrow'])
Xte_loc_size = csr_matrix((d['size'], (d.testrow, d['loc'])), 
                      shape=(gatest.shape[0],nlocs))
print('Apps data: train shape {}, test shape {}'.format(Xtr_loc_size.shape, Xte_loc_size.shape))

Apps data: train shape (74645, 35), test shape (112071, 35)


In [37]:
d = locactivity.dropna(subset=['trainrow'])
Xtr_loc_act = csr_matrix((np.log1p(d['sum']), (d.trainrow, d['loc'])), 
                      shape=(gatrain.shape[0],nlocs))
d = locactivity.dropna(subset=['testrow'])
Xte_loc_act = csr_matrix((np.log1p(d['sum']), (d.testrow, d['loc'])), 
                      shape=(gatest.shape[0],nlocs))
print('Apps data: train shape {}, test shape {}'.format(Xtr_loc_act.shape, Xte_loc_act.shape))

Apps data: train shape (74645, 35), test shape (112071, 35)


In [38]:
d = locactivity.dropna(subset=['trainrow'])
Xtr_loc_act_m = csr_matrix((d['mean'], (d.trainrow, d['loc'])), 
                      shape=(gatrain.shape[0],nlocs))
d = locactivity.dropna(subset=['testrow'])
Xte_loc_act_m = csr_matrix((d['mean'], (d.testrow, d['loc'])), 
                      shape=(gatest.shape[0],nlocs))
print('Apps data: train shape {}, test shape {}'.format(Xtr_loc_act_m.shape, Xte_loc_act_m.shape))

Apps data: train shape (74645, 35), test shape (112071, 35)


# Week day and Activity

In [39]:
weekactivity = deviceapps.groupby(['device_id','day_of_week'])['is_active'].agg(['size',np.sum,np.mean]).reset_index()
weekactivity=(weekactivity.merge(gatrain[['trainrow']],how='left',left_on='device_id',right_index=True)
.merge(gatest[['testrow']],how='left',left_on='device_id',right_index=True))
weekactivity.head()

Unnamed: 0,device_id,day_of_week,size,sum,mean,trainrow,testrow
0,-9222956879900151005,4,627,292.0,0.46571,21594.0,
1,-9222956879900151005,5,877,418.0,0.476625,21594.0,
2,-9222661944218806987,0,7,7.0,1.0,,13612.0
3,-9222661944218806987,1,11,7.0,0.636364,,13612.0
4,-9222661944218806987,2,10,7.0,0.7,,13612.0


In [40]:
d = weekactivity.dropna(subset=['trainrow'])
Xtr_week = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d['day_of_week'])), 
                      shape=(gatrain.shape[0],7))
d = weekactivity.dropna(subset=['testrow'])
Xte_week = csr_matrix((np.ones(d.shape[0]), (d.testrow, d['day_of_week'])), 
                      shape=(gatest.shape[0],7))
print('Apps data: train shape {}, test shape {}'.format(Xtr_week.shape, Xte_week.shape))

Apps data: train shape (74645, 7), test shape (112071, 7)


In [41]:
d = weekactivity.dropna(subset=['trainrow'])
Xtr_week_size = csr_matrix((d['size'], (d.trainrow, d['day_of_week'])), 
                      shape=(gatrain.shape[0],7))
d = weekactivity.dropna(subset=['testrow'])
Xte_week_size = csr_matrix((d['size'], (d.testrow, d['day_of_week'])), 
                      shape=(gatest.shape[0],7))
print('Apps data: train shape {}, test shape {}'.format(Xtr_week_size.shape, Xte_week_size.shape))

Apps data: train shape (74645, 7), test shape (112071, 7)


In [42]:
d = weekactivity.dropna(subset=['trainrow'])
Xtr_week_act = csr_matrix((np.log1p(d['sum']), (d.trainrow, d['day_of_week'])), 
                      shape=(gatrain.shape[0],7))
d = weekactivity.dropna(subset=['testrow'])
Xte_week_act = csr_matrix((np.log1p(d['sum']), (d.testrow, d['day_of_week'])), 
                      shape=(gatest.shape[0],7))
print('Apps data: train shape {}, test shape {}'.format(Xtr_week_act.shape, Xte_week_act.shape))

Apps data: train shape (74645, 7), test shape (112071, 7)


In [43]:
d = weekactivity.dropna(subset=['trainrow'])
Xtr_week_act_m = csr_matrix((d['mean'], (d.trainrow, d['day_of_week'])), 
                      shape=(gatrain.shape[0],7))
d = weekactivity.dropna(subset=['testrow'])
Xte_week_act_m = csr_matrix((d['mean'], (d.testrow, d['day_of_week'])), 
                      shape=(gatest.shape[0],7))
print('Apps data: train shape {}, test shape {}'.format(Xtr_week_act_m.shape, Xte_week_act_m.shape))

Apps data: train shape (74645, 7), test shape (112071, 7)


# Day time & Activity

In [44]:
periodactivity = deviceapps.groupby(['device_id','period'])['is_active'].agg(['size',np.sum,np.mean]).reset_index()
periodactivity=(periodactivity.merge(gatrain[['trainrow']],how='left',left_on='device_id',right_index=True)
.merge(gatest[['testrow']],how='left',left_on='device_id',right_index=True))
periodactivity.head()

Unnamed: 0,device_id,period,size,sum,mean,trainrow,testrow
0,-9222956879900151005,0,202,110.0,0.544554,21594.0,
1,-9222956879900151005,1,407,103.0,0.253071,21594.0,
2,-9222956879900151005,2,809,411.0,0.508035,21594.0,
3,-9222956879900151005,3,86,86.0,1.0,21594.0,
4,-9222661944218806987,0,27,19.0,0.703704,,13612.0


In [45]:
d = periodactivity.dropna(subset=['trainrow'])
Xtr_period = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d['period'])), 
                      shape=(gatrain.shape[0],4))
d = periodactivity.dropna(subset=['testrow'])
Xte_period = csr_matrix((np.ones(d.shape[0]), (d.testrow, d['period'])), 
                      shape=(gatest.shape[0],4))
print('Apps data: train shape {}, test shape {}'.format(Xtr_period.shape, Xte_period.shape))

Apps data: train shape (74645, 4), test shape (112071, 4)


In [46]:
d = periodactivity.dropna(subset=['trainrow'])
Xtr_period_size = csr_matrix((d['size'], (d.trainrow, d['period'])), 
                      shape=(gatrain.shape[0],4))
d = periodactivity.dropna(subset=['testrow'])
Xte_period_size = csr_matrix((d['size'], (d.testrow, d['period'])), 
                      shape=(gatest.shape[0],4))
print('Apps data: train shape {}, test shape {}'.format(Xtr_period_size.shape, Xte_period_size.shape))

Apps data: train shape (74645, 4), test shape (112071, 4)


In [47]:
d = periodactivity.dropna(subset=['trainrow'])
Xtr_period_act = csr_matrix((np.log1p(d['sum']), (d.trainrow, d['period'])), 
                      shape=(gatrain.shape[0],4))
d = periodactivity.dropna(subset=['testrow'])
Xte_period_act = csr_matrix((np.log1p(d['sum']), (d.testrow, d['period'])), 
                      shape=(gatest.shape[0],4))
print('Apps data: train shape {}, test shape {}'.format(Xtr_period_act.shape, Xte_period_act.shape))


Apps data: train shape (74645, 4), test shape (112071, 4)


In [48]:
d = periodactivity.dropna(subset=['trainrow'])
Xtr_period_act_m = csr_matrix((d['mean'], (d.trainrow, d['period'])), 
                      shape=(gatrain.shape[0],4))
d = periodactivity.dropna(subset=['testrow'])
Xte_period_act_m = csr_matrix((d['mean'], (d.testrow, d['period'])), 
                      shape=(gatest.shape[0],4))
print('Apps data: train shape {}, test shape {}'.format(Xtr_period_act_m.shape, Xte_period_act_m.shape))

Apps data: train shape (74645, 4), test shape (112071, 4)


In [49]:
del appactivity
del locactivity
del weekactivity
del periodactivity
del events

# App labels features

In [50]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

In [51]:
# del appevents
gatrain.head()

Unnamed: 0_level_0,gender,age,group,trainrow,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-8076087639492063270,M,35,M32-38,0,51,843
-2897161552818060146,M,35,M32-38,1,51,843
-8260683887967679142,M,35,M32-38,2,51,843
-4938849341048082022,M,30,M29-31,3,51,865
245133531816851882,M,30,M29-31,4,51,847


In [52]:
#devicelabels = (deviceapps[['device_id','app']]
#                .merge(applabels[['app','label']])
#                .groupby(['device_id','label'])['app'].agg(['size'])
#                .merge(gatrain[['trainrow']], how='left',  left_index=True, right_index=True)
#                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
#               .reset_index())
devicelabels = pd.read_csv(os.path.join(datadir,'devicelabels.csv'))
devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,16,21594.0,
1,-9222956879900151005,120,17,21594.0,
2,-9222956879900151005,126,33,21594.0,
3,-9222956879900151005,138,59,21594.0,
4,-9222956879900151005,147,37,21594.0,


In [53]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), 
                      shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), 
                      shape=(gatest.shape[0],nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


# Join all features

In [54]:
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_loc, Xtr_period, Xtr_loc_act_m, Xtr_period_act_m, Xtr_loc_act, Xtr_period_act), format='csr')
Xtest =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_loc, Xte_period, Xte_loc_act_m, Xte_period_act_m, Xte_loc_act, Xte_period_act), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

All features: train shape (74645, 21644), test shape (112071, 21644)


In [None]:
Xtrain_log = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_loc, Xtr_period), format='csr')
Xtest_log =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_loc, Xte_period), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

# Cross-validation

In [55]:
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)

In [56]:
params = {}
params['booster'] = 'gblinear'
params['objective'] = "multi:softprob"
params['eval_metric'] = 'mlogloss'
params['eta'] = 0.01
params['num_class'] = 12
params['lambda'] = 5
params['alpha'] = 3

In [57]:
parameters = {'C':0.02,'l1_ratio':1}

In [None]:
temp=np.random.random((10,2))
print(temp)
print(temp[[1,2,3,5],:]*2)

In [58]:
def score(parameters, random_state = 0,mix={'logit':0.475,'xgb':0.475,'sgd':0.05}):
    kf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=random_state)
    pred = np.zeros((y.shape[0],nclasses))
    pred_logistic = np.zeros((y.shape[0],nclasses))
    pred_xgb = np.zeros((y.shape[0],nclasses))
    pred_sgd = np.zeros((y.shape[0],nclasses))
    logit_val = False
    xgb_val = False
    sgd_val = False
    if mix['logit']>0:
        logit_val = True
    if mix['xgb']>0:
        xgb_val = True
    if mix['sgd']>0:
        sgd_val = True
    for itrain, itest in kf:
        Xtr, Xte = Xtrain[itrain, :], Xtrain[itest, :]
        #Xtr_log, Xte_log = Xtrain_log[itrain, :], Xtrain_log[itest, :]
        ytr, yte = y[itrain], y[itest]
        if logit_val:
            print("Starting Logistic Regression")
            # Logistic Regression
            clf1 = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
            clf1.fit(Xtr, ytr)
            pred_logistic[itest,:] = clf1.predict_proba(Xte)
        if xgb_val:
            print("Starting XGBoost")
            # XGBoost
            d_train = xgb.DMatrix(Xtr, label=ytr)
            d_valid = xgb.DMatrix(Xte, label=yte)
            watchlist = [(d_train, 'train'), (d_valid, 'eval')]
            clf2 = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=10,verbose_eval=False)
            pred_xgb[itest,:] = clf2.predict(d_valid)
        if sgd_val:
            # SGD Classifier
            print("SGD Classifier")
            clf3 = SGDClassifier(loss='log',penalty='elasticnet',l1_ratio=parameters['l1_ratio'])
            clf3.fit(Xtr, ytr)
            pred_sgd[itest,:] = clf3.predict_proba(Xte)
        # Downsize to one fold only for kernels
        # Combine predictions
        pred[itest,:] = (mix['logit']*pred_logistic[itest,:])+(mix['xgb']*pred_xgb[itest,:])+(mix['sgd']*pred_sgd[itest,:])
        #return log_loss(yte, pred[itest, :])
        print("Logistic {:.5f}, XGB {:.5f}, SGD {:.5f}, Average {:.5f}".format(log_loss(yte, pred_logistic[itest,:]),log_loss(yte, pred_xgb[itest,:]),log_loss(yte, pred_sgd[itest,:]),log_loss(yte, pred[itest,:])), end=' ')
        print('')
    print('')
    return log_loss(y, pred)

In [None]:
Cs = np.logspace(-3,0,4)
res = []
for C in Cs:
    res.append(score(LogisticRegression(C = C)))
plt.semilogx(Cs, res,'-o');

In [None]:
score(LogisticRegression(C=0.02))

In [None]:
score(parameters)

In [None]:
score(parameters,mix=0.75)

In [None]:
score(parameters)

In [59]:
score(parameters={'C':0.02,'l1_ratio':0.5},mix={'logit':0.475,'xgb':0.475,'sgd':0.05})

Starting Logistic Regression
Starting XGBoost
SGD Classifier
Logistic 2.27329, XGB 2.27374, SGD 2.37573, Average 2.27210 
Starting Logistic Regression
Starting XGBoost
SGD Classifier
Logistic 2.27080, XGB 2.27146, SGD 2.38552, Average 2.26953 
Starting Logistic Regression
Starting XGBoost
SGD Classifier
Logistic 2.27512, XGB 2.27390, SGD 2.34573, Average 2.27284 
Starting Logistic Regression
Starting XGBoost
SGD Classifier
Logistic 2.28328, XGB 2.28280, SGD 2.37032, Average 2.28088 
Starting Logistic Regression
Starting XGBoost
SGD Classifier
Logistic 2.26967, XGB 2.27043, SGD 2.38440, Average 2.26839 



2.2727463195609663

In [None]:
regularization = np.arange(0.5,0.7,0.05)
res = []
for l1_ratio in regularization:
    parameters['l1_ratio'] = l1_ratio
    res.append(score(parameters,mix={'logit':0,'xgb':0,'sgd':1}))
for l1_ratio,scores in zip(regularization,res):
    print("Regularization {:.2f}, Score {:.5f}".format(l1_ratio,scores))

In [60]:
score(parameters,mix={'logit':0.5,'xgb':0.5,'sgd':0})

Starting Logistic Regression
Starting XGBoost
Logistic 2.27329, XGB 2.27374, SGD 2.48491, Average 2.27145 
Starting Logistic Regression
Starting XGBoost
Logistic 2.27080, XGB 2.27146, SGD 2.48491, Average 2.26905 
Starting Logistic Regression
Starting XGBoost
Logistic 2.27512, XGB 2.27390, SGD 2.48491, Average 2.27243 
Starting Logistic Regression
Starting XGBoost
Logistic 2.28328, XGB 2.28280, SGD 2.48491, Average 2.28087 
Starting Logistic Regression
Starting XGBoost
Logistic 2.26967, XGB 2.27043, SGD 2.48491, Average 2.26802 



2.272362928276948

In [63]:
clf_logit = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs')
clf_logit.fit(Xtrain,y)
pred1 = clf_logit.predict_proba(Xtest)

In [68]:
d_train = xgb.DMatrix(Xtrain, label=y)
d_valid = xgb.DMatrix(Xtest)
watchlist = [(d_train, 'train')]
clf_xgb = xgb.train(params, d_train, 300, watchlist, early_stopping_rounds=10,verbose_eval=True)
pred2 = clf_xgb.predict(d_valid)

[0]	train-mlogloss:2.45478
Will train until train-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.43896
[2]	train-mlogloss:2.42769
[3]	train-mlogloss:2.41855
[4]	train-mlogloss:2.4106
[5]	train-mlogloss:2.40345
[6]	train-mlogloss:2.39691
[7]	train-mlogloss:2.39087
[8]	train-mlogloss:2.38524
[9]	train-mlogloss:2.37997
[10]	train-mlogloss:2.37503
[11]	train-mlogloss:2.37037
[12]	train-mlogloss:2.36596
[13]	train-mlogloss:2.36178
[14]	train-mlogloss:2.35781
[15]	train-mlogloss:2.35403
[16]	train-mlogloss:2.35042
[17]	train-mlogloss:2.34696
[18]	train-mlogloss:2.34365
[19]	train-mlogloss:2.34047
[20]	train-mlogloss:2.33741
[21]	train-mlogloss:2.33447
[22]	train-mlogloss:2.33163
[23]	train-mlogloss:2.32889
[24]	train-mlogloss:2.32625
[25]	train-mlogloss:2.32369
[26]	train-mlogloss:2.32121
[27]	train-mlogloss:2.31881
[28]	train-mlogloss:2.31648
[29]	train-mlogloss:2.31423
[30]	train-mlogloss:2.31203
[31]	train-mlogloss:2.3099
[32]	train-mlogloss:2.30783
[33]	train-mlogloss:2.3058

In [69]:
pred = pred1*0.5 + pred2*0.5
pred = pd.DataFrame(pred, index = gatest.index, columns=targetencoder.classes_)
pred.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.001288,0.005684,0.014889,0.013464,0.024674,0.048528,0.013842,0.03281,0.074325,0.145433,0.253815,0.371249
-1547860181818787117,0.011055,0.020915,0.035328,0.062201,0.068734,0.128098,0.006254,0.099319,0.06129,0.095822,0.211275,0.199708
7374582448058474277,0.032257,0.038824,0.041279,0.156426,0.174831,0.075753,0.014544,0.026717,0.044526,0.097892,0.18114,0.115812
-6220210354783429585,0.002907,0.033505,0.008684,0.015124,0.053723,0.145763,0.056518,0.165298,0.067444,0.100733,0.169735,0.180567
-5893464122623104785,0.045511,0.067104,0.0442,0.064296,0.055909,0.043361,0.085643,0.165302,0.100349,0.100266,0.135133,0.092926


In [70]:
pred.to_csv('logit_xgb_subm.csv',index=True)