In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [2]:
# Folder that holds the input CSV files.
datadir = 'input/'

# gender_age train / test tables, both indexed by device_id.
gatrain = pd.read_csv(
    os.path.join(datadir, 'gender_age_train.csv'),
    index_col='device_id',
)
gatest = pd.read_csv(
    os.path.join(datadir, 'gender_age_test.csv'),
    index_col='device_id',
)

# Phone brand/model table: keep only the first row for each device_id,
# then use device_id as the index.
phone = pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
phone = phone.drop_duplicates(subset='device_id', keep='first')
phone = phone.set_index('device_id')

# Events indexed by event_id, with the timestamp column parsed as datetimes.
events = pd.read_csv(
    os.path.join(datadir, 'events.csv'),
    index_col='event_id',
    parse_dates=['timestamp'],
)

# Only four columns of app_events are read; the two flags are loaded as booleans.
appevents = pd.read_csv(
    os.path.join(datadir, 'app_events.csv'),
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype={'is_installed': bool, 'is_active': bool},
)

applabels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))

In [3]:
#appevents.event_id.value_counts()

In [4]:
# Number of distinct event_id values present in app_events.
appevents['event_id'].unique().shape

(1488096,)

In [5]:
# (rows, columns) of the events table.
events.shape

(3252950, 4)

In [6]:
# Ratio of rows in `events` to distinct event_id values in `appevents`.
# Computed from the frames instead of hard-coding the two counts, so the
# figure stays correct if the input files change.
events.shape[0] / float(appevents.event_id.unique().shape[0])

2.1859812807775842

In [7]:
# Number of distinct device_id values that appear in events.
events['device_id'].unique().shape

(60865,)

In [8]:
# (rows, columns) of the training table.
gatrain.shape

(74645, 3)

In [9]:
# (rows, columns) of the test table; device_id is the index, so it is not counted as a column.
gatest.shape

(112071, 0)

In [10]:
# Give every device a 0-based positional row number; these are used as the
# row coordinates when the sparse feature matrices are built.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

In [13]:
# Preview the first rows of the training table (now including `trainrow`).
gatrain.head()

Unnamed: 0_level_0,gender,age,group,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8076087639492063270,M,35,M32-38,0
-2897161552818060146,M,35,M32-38,1
-8260683887967679142,M,35,M32-38,2
-4938849341048082022,M,30,M29-31,3
245133531816851882,M,30,M29-31,4


In [25]:
# Preview the first rows of the phone table.
# NOTE: the saved output shows a `brand` column, which is only created by the
# `phone['brand'] = ...` cell placed further down; on a fresh top-to-bottom run
# this preview would not contain it.
phone.head()

Unnamed: 0_level_0,phone_brand,device_model,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15
3669464369358936369,SUGAR,时尚手机,9
-5019277647504317457,三星,Galaxy Note 2,15


In [15]:
# Fit a label encoder on the phone_brand column of the phone table.
brandencoder = LabelEncoder()
brandencoder = brandencoder.fit(phone['phone_brand'])

In [17]:
# Store the integer code of each phone_brand in a new `brand` column.
phone['brand'] = brandencoder.transform(phone.phone_brand)

In [23]:
# Total number of rows across the test and train tables, taken from the
# frames themselves rather than from hard-coded counts.
gatest.shape[0] + gatrain.shape[0]

186716

In [26]:
# Distinct phone_brand values, in order of first appearance.
phone['phone_brand'].unique()

array(['小米', '三星', 'SUGAR', '华为', 'vivo', '魅族', '酷派', '天语', 'OPPO', '中兴',
       '金立', '联想', '海信', '索尼', 'LG', 'HTC', '酷比', '康佳', '奇酷', '欧博信', '欧比',
       'TCL', '爱派尔', '努比亚', '优米', 'LOGO', '朵唯', '黑米', '锤子', '酷比魔方', '美图',
       '尼比鲁', '一加', '优购', '诺基亚', '糖葫芦', '中国移动', '语信', '基伍', '青橙', '华硕',
       '夏新', '维图', '艾优尼', '摩托罗拉', '乡米', '米奇', '大可乐', '沃普丰', '神舟', '摩乐',
       '飞秒', '米歌', '富可视', '德赛', '梦米', '乐视', '小杨树', '纽曼', '邦华', 'E派', '普耐尔',
       '易派', '欧新', '西米', '海尔', '波导', '糯米', '唯米', '酷珀', '谷歌', 'ZUK', '亿通',
       '昂达', '聆韵', '金星数码', '广信', '至尊宝', '百立丰', '诺亚信', '欧奇', '贝尔丰', 'MIL',
       '斐讯', '优语', 'Lovme', '白米', '百加', '宝捷讯', '果米', '首云', '瑞米', '瑞高',
       '台电', '丰米', '唯比', '长虹', '大Q', '鲜米', '先锋', '恒宇丰', '虾米', '凯利通', '青葱',
       '奥克斯', '蓝魔', '智镁', '飞利浦', '西门子', '惠普', '本为', '欧沃', '欧乐迪', 'PPTV',
       '赛博宇华', 'E人E本', '大显', '帷幄', '世纪星', '德卡诺', 'ZOYE', '宏碁', '戴尔', '嘉源',
       '欧乐酷', '碟米', '天宏时代', '世纪天元', '极米', '原点', '亚马逊'], dtype=object)

In [37]:
# Brand codes that occur for test devices but for no train device.
set(phone.loc[gatest.index, 'brand'].unique()).difference(
    phone.loc[gatrain.index, 'brand'].unique())

{0, 17, 37, 47, 53, 56, 70, 79, 86, 90, 113}

In [38]:
# Brand names that occur for test devices but for no train device.
set(phone.loc[gatest.index, 'phone_brand'].unique()).difference(
    phone.loc[gatrain.index, 'phone_brand'].unique())

{'E人E本', '世纪星', '嘉源', '宏碁', '帷幄', '德卡诺', '极米', '欧沃', '瑞高', '碟米', '赛博宇华'}

In [40]:
# Attach each device's integer brand code. The assignment aligns on the
# device_id index shared by gatrain/gatest and phone.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# One-hot encode the brand as sparse matrices: one row per device, one column
# per brand code, value 1 at (row, brand).
# The shape is given explicitly: without it csr_matrix infers the column count
# from the largest brand code present, so train and test could end up with
# different widths (some brands only occur in test).
n_brands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.brand)),
                       shape=(gatrain.shape[0], n_brands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.brand)),
                       shape=(gatest.shape[0], n_brands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


In [45]:
# Vector of ones, one entry per training row (the data values of the sparse matrix).
np.ones(len(gatrain))

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])