In [5]:
import numpy as np
import pandas as pd


from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

import os
from scipy.sparse import csr_matrix, hstack

In [10]:
seed = 7
np.random.seed(seed)

datadir = '..'

In [11]:
gatrain = pd.read_csv('gender_age_train.csv',index_col='device_id')
gatest = pd.read_csv('gender_age_test.csv',index_col='device_id')
phone = pd.read_csv('phone_brand_device_model.csv')

In [13]:
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')

In [14]:
events = pd.read_csv('events.csv',  parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv('app_events.csv', usecols=['event_id','app_id','is_active'], dtype={'is_active':bool})

In [16]:
applabels = pd.read_csv('app_labels.csv')

In [20]:
gatrain.head(5)

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [21]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

In [22]:
gatrain.head(5)

Unnamed: 0_level_0,gender,age,group,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8076087639492063270,M,35,M32-38,0
-2897161552818060146,M,35,M32-38,1
-8260683887967679142,M,35,M32-38,2
-4938849341048082022,M,30,M29-31,3
245133531816851882,M,30,M29-31,4


In [23]:
brandencoder = LabelEncoder().fit(phone.phone_brand)

In [31]:
phone.head(5)

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,小米,红米
1277779817574759137,小米,MI 2
5137427614288105724,三星,Galaxy S4
3669464369358936369,SUGAR,时尚手机
-5019277647504317457,三星,Galaxy Note 2


In [32]:
phone['brand'] = brandencoder.transform(phone['phone_brand'])
phone.head(5)

Unnamed: 0_level_0,phone_brand,device_model,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15
3669464369358936369,SUGAR,时尚手机,9
-5019277647504317457,三星,Galaxy Note 2,15


In [33]:
gatrain.head(5)

Unnamed: 0_level_0,gender,age,group,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8076087639492063270,M,35,M32-38,0
-2897161552818060146,M,35,M32-38,1
-8260683887967679142,M,35,M32-38,2
-4938849341048082022,M,30,M29-31,3
245133531816851882,M,30,M29-31,4


In [34]:
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
gatrain.head(5)

Unnamed: 0_level_0,gender,age,group,trainrow,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-8076087639492063270,M,35,M32-38,0,51
-2897161552818060146,M,35,M32-38,1,51
-8260683887967679142,M,35,M32-38,2,51
-4938849341048082022,M,30,M29-31,3,51
245133531816851882,M,30,M29-31,4,51


In [26]:
y = letarget.transform(gatrain.group.values)

In [27]:
n_classes = len(letarget.classes_)
pred = np.ones((gatrain.shape[0],n_classes))/n_classes
log_loss(y, pred)

2.4849066497880012

In [35]:
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                       (gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                       (gatest.testrow, gatest.brand)))

In [43]:
print(Xtr_brand.get_shape())
print(Xte_brand.get_shape())

(74645, 131)
(112071, 131)


In [44]:
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [45]:
phone.head(5)

Unnamed: 0_level_0,phone_brand,device_model,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15
3669464369358936369,SUGAR,时尚手机,9
-5019277647504317457,三星,Galaxy Note 2,15


In [46]:
m = phone.phone_brand.str.cat(phone.device_model)
m.head(5)

device_id
-8890648629457979026               小米红米
 1277779817574759137             小米MI 2
 5137427614288105724        三星Galaxy S4
 3669464369358936369          SUGAR时尚手机
-5019277647504317457    三星Galaxy Note 2
Name: phone_brand, dtype: object

In [48]:
modelencoder = LabelEncoder().fit(m)
phone.head(5)

Unnamed: 0_level_0,phone_brand,device_model,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15
3669464369358936369,SUGAR,时尚手机,9
-5019277647504317457,三星,Galaxy Note 2,15


In [50]:
phone['model'] = modelencoder.transform(m)
phone.head(5)

Unnamed: 0_level_0,phone_brand,device_model,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8890648629457979026,小米,红米,51,858
1277779817574759137,小米,MI 2,51,843
5137427614288105724,三星,Galaxy S4,15,371
3669464369358936369,SUGAR,时尚手机,9,166
-5019277647504317457,三星,Galaxy Note 2,15,347


In [51]:
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                       (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                       (gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [52]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents.head(5)

Unnamed: 0,event_id,app_id,is_active
0,2,5927333115845830913,True
1,2,-5720078949152207372,False
2,2,-1633887856876571208,False
3,2,-653184325010919369,True
4,2,8693964245073640147,True


In [53]:
appevents['app'] = appencoder.transform(appevents.app_id)
appevents.head(5)

Unnamed: 0,event_id,app_id,is_active,app
0,2,5927333115845830913,True,15408
1,2,-5720078949152207372,False,3384
2,2,-1633887856876571208,False,7620
3,2,-653184325010919369,True,8902
4,2,8693964245073640147,True,18686


In [54]:
napps = len(appencoder.classes_)
print(napps)

19237


In [79]:
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,21594.0,
1,-9222956879900151005,1096,18,21594.0,
2,-9222956879900151005,1248,26,21594.0,
3,-9222956879900151005,1545,12,21594.0,
4,-9222956879900151005,1664,18,21594.0,


In [82]:
print("Just capturing the number of occurences")
deviceapps.device_id.isin([-9222956879900151005]).sum()

Just capturing the number of occurences


72

In [86]:
deviceapps.shape

(2369025, 5)

In [84]:
d = deviceapps.dropna(subset=['trainrow'])
d.shape

(915632, 5)

In [87]:
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
                      shape=(gatrain.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [88]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [90]:
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,21594.0,
1,-9222956879900151005,120,1,21594.0,
2,-9222956879900151005,126,1,21594.0,
3,-9222956879900151005,138,2,21594.0,
4,-9222956879900151005,147,2,21594.0,


In [91]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                      shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                      shape=(gatest.shape[0],nlabels))

In [92]:
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))


# Concatenate all features

Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
Xtest =  hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)
All features: train shape (74645, 21527), test shape (112071, 21527)


In [93]:
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)
dummy_y = np_utils.to_categorical(y)

NameError: name 'np_utils' is not defined