In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

In [2]:
datadir = 'input/'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

In [4]:
crosstab = load_sparse_csr('crosstab_device_appid_8.npz')

In [5]:
df = pd.DataFrame(crosstab.toarray())

In [6]:
cols = pd.read_csv('cols.csv')
rows = pd.read_csv('rows.csv')

In [7]:
cols.columns = ['index','device_id']

In [8]:
rows.columns = ['index','app_id']

In [9]:
df.index = pd.Series(rows['app_id'])

In [10]:
df.columns = pd.Series(cols['device_id'])

In [11]:
df['summ'] = df.sum(axis=1)

In [12]:
df_nonzero = df[df.summ>0]

In [13]:
df_nonzero.shape

(7825, 60823)

In [14]:
df_nonzero.drop(['summ'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
dft = df_nonzero.T

In [16]:
dft.head()

app_id,-9217104312935103667,-9216716044975227433,-9216547119863430601,-9215674982339481470,-9210372544235257540,-9208020732558900907,-9203112368544435271,-9198959992801690764,-9198654150634969665,-9192640465934207713,...,9208471355518556004,9211954271534258460,9212661076525577612,9212661076547991325,9212661076561479863,9212711158326654461,9214703919715811114,9217386935599579042,9220205176760015004,9222488106573038706
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222956879900151005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9222661944218806987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9222399302879214035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9221825537663503111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9221767098072603291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
gatrain.head()

Unnamed: 0_level_0,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38
-4938849341048082022,M,30,M29-31
245133531816851882,M,30,M29-31


In [18]:
merged_df_tr = gatrain.merge(dft,how='left',left_index=True,right_index=True)
merged_df_te = gatest.merge(dft,how='left',left_index=True,right_index=True)

In [27]:
#merged_df_te.shape[0] - merged_df_te.dropna().shape[0]

In [25]:
merged_df_tr.fillna(0,inplace=True)
merged_df_te.fillna(0,inplace=True)

In [35]:
Xtr_app = csr_matrix(merged_df_tr.ix[:,3:])
Xte_app = csr_matrix(merged_df_te)

In [39]:
Xtr_app.shape,Xte_app.shape

((74645, 7825), (112071, 7825))

In [40]:
save_sparse_csr('xtr_app',Xtr_app)
save_sparse_csr('xte_app',Xte_app)