In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [2]:
datadir = 'input/'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

In [4]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.brand)))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [5]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), 
                       (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]), 
                       (gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [6]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)

In [56]:
events.head()

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [54]:
appevents.event_id.unique()

array([      2,       6,       7, ..., 3252937, 3252946, 3252948])

In [51]:
appevents.event_id.unique().shape

(1488096,)

In [12]:
appevents.head(2)

Unnamed: 0,event_id,app_id,is_active,app
0,2,5927333115845830913,True,15408
1,2,-5720078949152207372,False,3384


In [67]:
events.head(2)

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


In [57]:
a = appevents.merge(events[['device_id','timestamp','longitude','latitude']], how='left',left_on='event_id',right_index=True)

In [63]:
a.isnull().sum()

event_id     0
app_id       0
is_active    0
app          0
device_id    0
timestamp    0
longitude    0
latitude     0
dtype: int64

In [90]:
a.head()

Unnamed: 0,event_id,app_id,is_active,app,device_id,timestamp,longitude,latitude
0,2,5927333115845830913,True,15408,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
1,2,-5720078949152207372,False,3384,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,2,-1633887856876571208,False,7620,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,2,-653184325010919369,True,8902,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
4,2,8693964245073640147,True,18686,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


In [33]:
b = a[a.device_id==-3037377082444295812]

In [35]:
c = b.sort_values('app_id')

In [41]:
c[c.app_id==-9050100410106163077]

Unnamed: 0,event_id,app_id,is_active,app,device_id
8360639,837361,-9050100410106163077,False,183,-3037377082444295812
23223989,2322618,-9050100410106163077,False,183,-3037377082444295812
17083400,1709592,-9050100410106163077,False,183,-3037377082444295812
21635050,2167973,-9050100410106163077,False,183,-3037377082444295812
8266603,830320,-9050100410106163077,False,183,-3037377082444295812
13319965,1334197,-9050100410106163077,False,183,-3037377082444295812
2614476,261753,-9050100410106163077,False,183,-3037377082444295812
26847066,2689605,-9050100410106163077,False,183,-3037377082444295812
20045174,2005301,-9050100410106163077,False,183,-3037377082444295812
29161505,2920700,-9050100410106163077,False,183,-3037377082444295812


In [169]:
applabels[applabels.app_id==5927333115845830913]

Unnamed: 0,app_id,label_id
3152,5927333115845830913,549
8139,5927333115845830913,710
69351,5927333115845830913,704
187954,5927333115845830913,548
430991,5927333115845830913,172


In [109]:
d = applabels.groupby(['app_id'])['label_id'].unique()

In [111]:
pd.DataFrame(d)

Unnamed: 0_level_0,label_id
app_id,Unnamed: 1_level_1
-9223281467940916832,"[796, 795, 794, 405]"
-9222877069545393219,[135]
-9222785464897897681,"[812, 795, 794, 405]"
-9222198347540756780,"[810, 795, 794, 405]"
-9221970424041518544,"[714, 704, 548, 813, 795, 794, 405]"
-9221828329335715761,[971]
-9221379430160197477,"[711, 714, 548, 704, 801, 795, 794, 405]"
-9221345089971749240,"[800, 795, 794, 405]"
-9221156934682287334,[130]
-9220899153371182692,"[857, 874, 548, 854]"


In [114]:
label_cat = pd.read_csv(os.path.join(datadir,'label_categories.csv'))

In [118]:
labelsss = applabels.merge(label_cat,how='left',on='label_id')

In [178]:
labelsss[labelsss.label_id==704].head()

Unnamed: 0,app_id,label_id,category
69156,9058148894318981231,704,Property Industry 2.0
69157,-5010107147746880097,704,Property Industry 2.0
69158,3520983770777613287,704,Property Industry 2.0
69159,3317077434980644433,704,Property Industry 2.0
69160,9001857150917770908,704,Property Industry 2.0


In [None]:
#[Cards RPG, game, Tencent, Custom label]
#[796, 795, 794, 405]

In [156]:
label_cat[label_cat.label_id==714]

Unnamed: 0,label_id,category
639,714,1 free


In [127]:
id_grp = pd.DataFrame(labelsss.groupby(['app_id'])['label_id'].unique())

In [139]:
category_grp = pd.DataFrame(labelsss.groupby(['app_id'])['category'].unique())

In [165]:
id_grp.head()

Unnamed: 0_level_0,label_id,category,labels_count
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9223281467940916832,"[796, 795, 794, 405]","[Cards RPG, game, Tencent, Custom label]",4
-9222877069545393219,[135],[education outside class],1
-9222785464897897681,"[812, 795, 794, 405]","[Parkour avoid class, game, Tencent, Custom la...",4
-9222198347540756780,"[810, 795, 794, 405]","[Casual puzzle categories, game, Tencent, Cust...",4
-9221970424041518544,"[714, 704, 548, 813, 795, 794, 405]","[1 free, Property Industry 2.0, Industry tag, ...",7


In [148]:
category_grp.head()

Unnamed: 0_level_0,category
app_id,Unnamed: 1_level_1
-9223281467940916832,"[Cards RPG, game, Tencent, Custom label]"
-9222877069545393219,[education outside class]
-9222785464897897681,"[Parkour avoid class, game, Tencent, Custom la..."
-9222198347540756780,"[Casual puzzle categories, game, Tencent, Cust..."
-9221970424041518544,"[1 free, Property Industry 2.0, Industry tag, ..."


In [153]:
category_grp.ix[-9221970424041518544]

category    [1 free, Property Industry 2.0, Industry tag, ...
Name: -9221970424041518544, dtype: object

In [152]:
pd.options.display.max_colwidth = 50

In [154]:
id_grp['category'] = category_grp['category']

In [162]:
cnt = pd.DataFrame(labelsss.groupby(['app_id'])['label_id'].agg(['count']))

In [167]:
id_grp.head()

Unnamed: 0_level_0,label_id,category,labels_count
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9223281467940916832,"[796, 795, 794, 405]","[Cards RPG, game, Tencent, Custom label]",4
-9222877069545393219,[135],[education outside class],1
-9222785464897897681,"[812, 795, 794, 405]","[Parkour avoid class, game, Tencent, Custom la...",4
-9222198347540756780,"[810, 795, 794, 405]","[Casual puzzle categories, game, Tencent, Cust...",4
-9221970424041518544,"[714, 704, 548, 813, 795, 794, 405]","[1 free, Property Industry 2.0, Industry tag, ...",7


In [164]:
id_grp['labels_count'] = cnt['count']

In [166]:
a.head()

Unnamed: 0,event_id,app_id,is_active,app,device_id,timestamp,longitude,latitude
0,2,5927333115845830913,True,15408,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
1,2,-5720078949152207372,False,3384,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,2,-1633887856876571208,False,7620,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,2,-653184325010919369,True,8902,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
4,2,8693964245073640147,True,18686,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97


In [173]:
df_final = a.merge(id_grp[['label_id','category','labels_count']], how='left',left_on='app_id',right_index=True)

In [172]:
appevents.shape

(32473067, 3)

In [174]:
df_final.head()

Unnamed: 0,event_id,app_id,is_active,app,device_id,timestamp,longitude,latitude,label_id,category,labels_count
0,2,5927333115845830913,True,15408,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,"[549, 710, 704, 548, 172]","[Property Industry 1.0, Relatives 1, Property ...",5
1,2,-5720078949152207372,False,3384,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,"[721, 704, 548, 302, 303]","[Personal Effectiveness 1, Property Industry 2...",5
2,2,-1633887856876571208,False,7620,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,"[251, 263, 306, 302, 405, 730, 756, 757, 775, ...","[Finance, Debit and credit, unknown, Custom la...",12
3,2,-653184325010919369,True,8902,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,"[405, 730, 747, 749, 776, 782, 785, 255, 251, ...","[Custom label, And the Church, Insurance, Lowe...",10
4,2,8693964245073640147,True,18686,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,"[549, 710, 704, 548, 172]","[Property Industry 1.0, Relatives 1, Property ...",5


In [180]:
df_final.to_hdf('events_data.hdf5','table')

MemoryError: 