In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss



In [2]:
# Folder that holds the competition CSV files.
datadir = '../input'

# Train / test device tables, indexed by device_id.
gatrain = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'), index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'), index_col='device_id')

# Phone metadata: keep only the first row for each device_id, then index by it.
phone = (
    pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)

# Event log, indexed by event_id, with the timestamp column parsed as datetimes.
events = pd.read_csv(
    os.path.join(datadir, 'events.csv'),
    parse_dates=['timestamp'],
    index_col='event_id',
)

# App activity per event; only three columns are read and is_active is loaded as bool.
appevents = pd.read_csv(
    os.path.join(datadir, 'app_events.csv'),
    usecols=['event_id', 'app_id', 'is_active'],
    dtype={'is_active': bool},
)

# App-to-label mapping table.
applabels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))

In [3]:
# Positional row number for every device; used below as the sparse-matrix row index.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

In [4]:
# One-hot encode the phone brand as a sparse (device x brand) indicator matrix.
brandencoder = LabelEncoder().fit(phone['phone_brand'])
phone['brand'] = brandencoder.transform(phone['phone_brand'])

# Column assignment aligns on the device_id index shared by phone, gatrain and gatest.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# Pass the shape explicitly: without it csr_matrix infers the column count from
# the largest brand code present, so train and test would only have the same
# width if both happened to contain the highest-coded brand.
Xtr_brand = csr_matrix(
    (np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.brand)),
    shape=(gatrain.shape[0], len(brandencoder.classes_)),
)
Xte_brand = csr_matrix(
    (np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand)),
    shape=(gatest.shape[0], len(brandencoder.classes_)),
)
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [5]:
# Dense copy of the sparse train brand matrix, wrapped in a DataFrame for inspection.
a = pd.DataFrame(data=Xtr_brand.toarray())

In [6]:
# Preview the first five rows of the dense brand indicator frame.
a.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,121,122,123,124,125,126,127,128,129,130
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Number of distinct brand codes that occur among the train devices.
gatrain['brand'].unique().shape

(120,)

In [8]:
# Number of distinct device_ids that appear in the events table.
events['device_id'].unique().shape

(60865,)

In [9]:
# (rows, columns) of the events table; event_id is the index, so it is not counted as a column.
events.shape

(3252950, 4)

In [10]:
# Devices with the most events (top five), over the whole events table.
events['device_id'].value_counts().head(5)

 1186608308763918427    33426
 3915082290673137129    14568
-1656894751624916732     6731
-6242501228649113250     4150
-8340098378141155823     3973
Name: device_id, dtype: int64

In [11]:
# Look up the busiest device in the test table by its device_id label.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
gatest.loc[1186608308763918427]

testrow    102800
brand          51
Name: 1186608308763918427, dtype: int64

In [12]:
# Look up the second-busiest device in the test table by its device_id label.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
gatest.loc[3915082290673137129]

testrow    40536
brand         15
Name: 3915082290673137129, dtype: int64

In [13]:
# Train devices with at least one event: all train rows minus the ids absent from events.
len(gatrain) - len(set(gatrain.index).difference(events['device_id'].unique()))

23309

In [14]:
# Train device_ids that never appear in the events table.
len(set(gatrain.index).difference(events['device_id'].unique()))

51336

In [15]:
# Ratio of train devices with events to train devices without events.
# Computed from the frames rather than retyping the two counts by hand, so the
# value stays correct if the input data changes.
train_no_events = len(set(gatrain.index) - set(events.device_id.unique()))
train_with_events = gatrain.shape[0] - train_no_events
train_with_events / train_no_events

0.45404784167056256

0.45404 = ratio of train device_ids with events (23309) to train device_ids without events (51336).

In [16]:
# Test devices with at least one event: all test rows minus the ids absent from events.
len(gatest) - len(set(gatest.index).difference(events['device_id'].unique()))

35194

In [17]:
# Test device_ids that never appear in the events table.
len(set(gatest.index).difference(events['device_id'].unique()))

76877

In [18]:
# Ratio of test devices with events to test devices without events.
# Computed from the frames rather than retyping the two counts by hand, so the
# value stays correct if the input data changes.
test_no_events = len(set(gatest.index) - set(events.device_id.unique()))
test_with_events = gatest.shape[0] - test_no_events
test_with_events / test_no_events

0.4577962199357415

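0.45779 = the same ratio in test (35194 device_ids with events to 76877 without). The two ratios are almost identical, so train mirrors test: the 23309 train devices with events correspond to the 35194 test devices with events, and the 51336 train devices without events correspond to the 76877 test devices without events.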

In [19]:
# Number of distinct device_ids in the events table (same check as earlier in the notebook).
events['device_id'].unique().shape

(60865,)

In [25]:
# Size of the union of train device_ids and device_ids seen in events.
len(set(gatrain.index).union(events['device_id'].unique()))

112201

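112201 = 74645 + 35194 + 2362, i.e. all train device_ids, plus the test device_ids that have events, plus the device_ids in events.csv that are in neither train nor test.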

In [24]:
# Total number of train + test devices that have at least one event.
# Derived from the frames rather than from hand-copied counts so it cannot go stale.
event_ids = set(events.device_id.unique())
train_with_events = gatrain.shape[0] - len(set(gatrain.index) - event_ids)
test_with_events = gatest.shape[0] - len(set(gatest.index) - event_ids)
train_with_events + test_with_events

58503

58503 = number of unique device_ids in events.csv that also appear in train or test (23309 + 35194), i.e. the only ones that are useful.

In [33]:
# device_ids present in events but found in neither the train nor the test table.
unique_ids = events['device_id'].unique()
unique_ids.shape[0] - len(set(unique_ids) & (set(gatrain.index) | set(gatest.index)))

2362

2362 unique device_ids in events.csv appear in neither train nor test, so the events belonging to those device_ids can be dropped from events.csv.

In [38]:
# device_ids that occur both in the events table and in the train table.
events_tr = list(
    set(events['device_id'].unique()) & set(gatrain.index)
)

In [43]:
# Null count per column for the train devices that have events.
# `.ix` was removed in pandas 1.0; events_tr holds index labels, so `.loc` is the equivalent.
gatrain.loc[events_tr].isnull().sum()

gender      0
age         0
group       0
trainrow    0
brand       0
dtype: int64

In [50]:
# Keep only the events that belong to train devices.
events_train = events[events['device_id'].isin(events_tr)]

In [67]:
# Ten train devices with the most events.
events_train['device_id'].value_counts().head(n=10)

-6242501228649113250    4150
-8340098378141155823    3973
-3746248670824158209    3907
 5375599021847302819    3128
 4782582047729166353    2899
 1779631023439405334    2757
 5098778421671837341    2722
 3724654925765159056    2347
-6875585507485886098    2310
 6356179019102873408    2023
Name: device_id, dtype: int64

Above: number of events per device_id (top 10 shown) for the 23309 train devices that have events.

In [57]:
# device_ids that occur both in the events table and in the test table.
events_te = list(
    set(events['device_id'].unique()) & set(gatest.index)
)


In [58]:
# Null count per column for the test devices that have events.
# `.ix` was removed in pandas 1.0; events_te holds index labels, so `.loc` is the equivalent.
gatest.loc[events_te].isnull().sum()

testrow    0
brand      0
dtype: int64

In [59]:
# Keep only the events that belong to test devices.
events_test = events[events['device_id'].isin(events_te)]

In [66]:
# Ten test devices with the most events.
events_test['device_id'].value_counts().head(n=10)

 1186608308763918427    33426
 3915082290673137129    14568
 2504414082456157897     3804
-3037377082444295812     3534
-17299534936664237       3506
-7369693784883843916     3446
 8339429008953975436     3440
 5083019926611946481     3395
-5105332332397066846     3099
 2771516290634663640     2975
Name: device_id, dtype: int64

Above: number of events per device_id (top 10 shown) for the 35194 test devices that have events.

In [104]:
# 99th percentile of the per-device event count in test.
events_test['device_id'].value_counts().quantile(q=0.99)

531.06999999999971

In [105]:
# 99th percentile of the per-device event count in train.
events_train['device_id'].value_counts().quantile(q=0.99)

516.91999999999825

In [114]:
# 11th percentile of the per-device event count in train.
events_train['device_id'].value_counts().quantile(q=0.11)

2.0

Roughly 88% (0.99 - 0.11) of the train device_ids with events have between 2 and 517 events each.
The test distribution looks similar (its 99th percentile is about 531).

In [110]:
# Mean number of events per train device (among devices that have events).
events_train['device_id'].value_counts().mean()

52.15131494272599

In [111]:
# Mean number of events per test device (among devices that have events).
events_test['device_id'].value_counts().mean()

55.26004432573734