In [259]:
import pandas as pd
import numpy as np
import xgboost as xgb
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

datadir = 'input/'

# Train and test device tables, both keyed by device_id.
gatrain = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'), index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'), index_col='device_id')

# Phone metadata: keep only the first record for each device_id, then key by it.
phone = (
    pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)

# Remember each device's position (0-based) within its own CSV file.
gatrain['row'] = np.arange(len(gatrain))
gatest['row'] = np.arange(len(gatest))

# Integer-encode the phone brand and copy the code onto train/test;
# the assignment aligns on the shared device_id index.
brandencoder = LabelEncoder()
phone['brand'] = brandencoder.fit_transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# A model is identified by brand name and model name concatenated (no separator).
m = phone['phone_brand'].str.cat(phone['device_model'])
modelencoder = LabelEncoder()
phone['model'] = modelencoder.fit_transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']

# Relative position of each device within its own file (row / number of rows).
# The divisor is taken from the frame itself rather than the hard-coded
# 74645 / 112071, so the feature stays correct if the input files change size.
gatrain['nid_order'] = gatrain['row'] / float(len(gatrain))
gatest['nid_order'] = gatest['row'] / float(len(gatest))

# Full training key: gender, age, brand code and model code joined by '.'.
gatrain['to_group'] = gatrain['gender'].astype(str).str.cat(
    [
        gatrain['age'].astype(str),
        gatrain['brand'].astype(str),
        gatrain['model'].astype(str),
    ],
    sep='.',
)

# Brand/model key, available for both train and test.
gatrain['to_group_bm'] = gatrain['brand'].astype(str).str.cat(gatrain['model'].astype(str), sep='.')
gatest['to_group_bm'] = gatest['brand'].astype(str).str.cat(gatest['model'].astype(str), sep='.')

tr_groupby_bm = gatrain.groupby('to_group_bm')
te_groupby_bm = gatest.groupby('to_group_bm')

# Only brand/model keys that occur in both train and test can be matched.
grpp_bm_unique = list(
    set(gatrain['to_group_bm'].unique()) & set(gatest['to_group_bm'].unique())
)

def consecutive(data, stepsize=1):
    """Split ``data`` into maximal runs whose successive values differ by ``stepsize``.

    Parameters
    ----------
    data : array-like or pandas Series
        One-dimensional sequence of numbers.
    stepsize : number, default 1
        Required difference between neighbouring values inside a run.

    Returns
    -------
    list
        The runs, in order. For a pandas object each run is a positional slice
        of ``data`` (so its ``.index`` is preserved); otherwise each run is a
        numpy array. Empty input yields a single empty run.
    """
    values = np.asarray(data)
    # Positions at which a new run starts (the element after each break).
    breaks = np.flatnonzero(np.diff(values) != stepsize) + 1
    if hasattr(data, 'iloc'):
        # Slice pandas objects explicitly instead of handing them to np.split:
        # callers read `.index` on every run, and this does not depend on how
        # numpy treats pandas objects internally.
        bounds = [0] + breaks.tolist() + [len(values)]
        return [data.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]
    return np.split(values, breaks)

# Half-width of the nid_order window searched around each training run.
NID_WINDOW = 0.001

# Per-test-device results; 'X' / 0 / 'Y' are the "no match found" sentinels.
final_te_gender = pd.Series('X', index=gatest.index)
final_te_age = pd.Series(0, index=gatest.index)
final_te_grp = pd.Series('Y', index=gatest.index)

for grpp_bm in grpp_bm_unique:
    # Train and test devices sharing this brand/model key.
    df_tr_grpp_bm = tr_groupby_bm.get_group(grpp_bm)
    df_te_grpp_bm = te_groupby_bm.get_group(grpp_bm)
    tr_groupby_all = df_tr_grpp_bm.groupby('to_group')
    for grpp_all in df_tr_grpp_bm.to_group.unique():
        df_tr_grpp_all = tr_groupby_all.get_group(grpp_all)
        # Training runs: two or more devices on consecutive rows with the same
        # gender/age/brand/model key.
        tr_consec = [run for run in consecutive(df_tr_grpp_all.row) if run.shape[0] > 1]
        for tr_run in tr_consec:
            # `.loc` replaces the removed `.ix`; the lookup is by device_id label.
            df_tr_smallest = df_tr_grpp_all.loc[tr_run.index]
            meann = df_tr_smallest.nid_order.mean()
            gndr = df_tr_smallest.gender.unique()[0]
            agee = df_tr_smallest.age.unique()[0]
            target_grp = df_tr_smallest.group.unique()[0]
            l_cnd = meann - NID_WINDOW
            h_cnd = meann + NID_WINDOW
            # Test devices of the same brand/model whose relative position lies
            # strictly inside the window; `sort_values` replaces the removed `sort`.
            in_window = (df_te_grpp_bm.nid_order > l_cnd) & (df_te_grpp_bm.nid_order < h_cnd)
            df_te_small = df_te_grpp_bm[in_window].sort_values('row')
            # Test runs must span at least three consecutive rows.
            te_consec = [run for run in consecutive(df_te_small.row) if run.shape[0] > 2]
            if not te_consec:
                continue
            minn = [df_te_small.loc[run.index].nid_order.mean() for run in te_consec]
            # NOTE(review): the run with the smallest mean nid_order is chosen,
            # not the one closest to `meann` -- confirm this is intended.
            indd = minn.index(min(minn))
            df_te_smallest_final = df_te_small.loc[te_consec[indd].index]
            final_te_gender.loc[df_te_smallest_final.index] = gndr
            final_te_age.loc[df_te_smallest_final.index] = agee
            final_te_grp.loc[df_te_smallest_final.index] = target_grp



In [260]:
# How many test devices ended up with a non-sentinel (positive) age.
final_te_age.gt(0).sum()

3498

In [285]:
# Existing submission file, keyed by device_id; later cells replace some of
# its rows (presumably per-class probabilities -- see the pred.head() output).
preds = pd.read_csv(os.path.join(datadir, 'sub_nn.csv'), index_col='device_id')

In [286]:
# Put the three per-device result Series side by side. Later cells address the
# resulting columns by the integer labels 0 (gender), 1 (age) and 2 (group).
final_te = pd.concat([final_te_gender, final_te_age, final_te_grp], axis=1)

In [287]:
# Encode the training target labels as integers and record the class count.
targetencoder = LabelEncoder()
y = targetencoder.fit_transform(gatrain.group)
nclasses = len(targetencoder.classes_)

In [288]:
# Show the class labels in the order assigned by the encoder.
targetencoder.classes_

array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object)

In [289]:
# Keep only the test devices whose gender (column 0) is no longer the 'X' sentinel.
calcs = final_te.loc[final_te[0] != 'X']

In [290]:
# Drop the gender (0) and age (1) columns, keeping only the group column.
# Reassigning instead of `inplace=True` avoids mutating a slice of `final_te`
# (the source of the SettingWithCopyWarning) and keeps the cell re-runnable.
calcs = calcs.drop([0, 1], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [291]:
# Give the single remaining column a descriptive name.
calcs.columns = ['te_group']

In [292]:
# Sanity check: (number of matched test devices, 1 column).
calcs.shape

(3498, 1)

In [293]:
# All-zero matrix with one row per matched device and one column per target
# class; the width comes from the encoder rather than a hard-coded 12 so it
# always matches the column labels applied in the next cell.
aaa = np.zeros((calcs.shape[0], len(targetencoder.classes_)))

In [294]:
# Wrap the zero matrix in a DataFrame: rows keyed like `calcs`, one column per class label.
abb = pd.DataFrame(aaa,index=calcs.index,columns=targetencoder.classes_)

In [295]:
# Confirm the column labels are the class names.
abb.columns

Index(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'],
      dtype='object')

In [296]:
# Class labels in column order, as a plain list (used to restore column order later).
lst = list(abb.columns)

In [297]:
# Probability assigned to the propagated class, with the remaining 0.36 spread
# evenly over the other classes so that every row sums to 1.
hit_prob = 0.64
miss_prob = 0.36 / max(len(lst) - 1, 1)

# Boolean matrix: entry (i, j) is True when row i's te_group equals class lst[j].
te_labels = np.asarray(calcs.te_group, dtype=object)
is_hit = te_labels[:, None] == np.asarray(lst, dtype=object)[None, :]

# Built in one step instead of a per-class drop/concat loop. Unlike that loop,
# classes that never occur in te_group also receive miss_prob rather than
# staying at 0.0. Rows are keyed like `calcs`, columns follow `lst`.
abb = pd.DataFrame(np.where(is_hit, hit_prob, miss_prob), index=calcs.index, columns=lst)

In [298]:
# Preview the hand-built probability rows.
abb.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-2923586054882771834,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.64,0.032727
6954091558526049914,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.64,0.032727
5872285898854951798,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.64,0.032727
-3593469878585100,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.64,0.032727
4231882985343693248,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.032727,0.64,0.032727,0.032727,0.032727


In [299]:
# Device ids present in `preds` but not in `abb`.
# NOTE: built from a set difference, so the order of `ssss` is arbitrary.
ssss = list(set(preds.index)-set(abb.index))

In [300]:
# Rows of the existing submission that are kept unchanged.
# `.loc` replaces the removed `.ix`; the lookup is by device_id label.
no_calcs = preds.loc[ssss]

In [301]:
# Stack the untouched rows on top of the hand-built rows.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
pred = pd.concat([no_calcs, abb])

In [302]:
# Preview the combined predictions (row order is not yet the test-file order).
pred.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-7249086381035094010,0.085533,0.059336,0.040886,0.057198,0.066309,0.051006,0.139732,0.142341,0.068471,0.087911,0.106545,0.094732
5421829656841551877,0.021293,0.03849,0.033813,0.04862,0.043741,0.032427,0.066179,0.195513,0.139519,0.159547,0.148544,0.072314
-5862574739713163254,0.000473,0.003572,0.006406,0.02308,0.034201,0.017633,0.002894,0.044494,0.095608,0.21937,0.429108,0.12316
3755622189044858888,0.079022,0.064634,0.047576,0.067438,0.077539,0.062125,0.109404,0.127062,0.069683,0.088174,0.10998,0.097363
-2635910126289027061,0.068133,0.064781,0.044728,0.054448,0.046258,0.032755,0.14857,0.188836,0.095844,0.103673,0.096057,0.055918


In [303]:
# Reorder rows to follow gatest's device_id index; any id missing from `pred`
# would come out as an all-NaN row.
pred_fin = pred.reindex(gatest.index)

In [304]:
# Preview the final predictions in test-file order.
pred_fin.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,8.5e-05,0.000334,0.000893,0.004595,0.025983,0.055502,0.000651,0.009696,0.012555,0.046925,0.208794,0.633987
-1547860181818787117,0.002245,0.007434,0.014363,0.043093,0.1045,0.140147,0.002643,0.024815,0.036756,0.080273,0.201748,0.341982
7374582448058474277,0.011884,0.03573,0.043454,0.118219,0.147204,0.068086,0.008886,0.043222,0.060531,0.11716,0.21957,0.126052
-6220210354783429585,0.003221,0.007278,0.007807,0.015015,0.026515,0.037776,0.033981,0.162003,0.100991,0.174032,0.233889,0.197494
-5893464122623104785,0.036345,0.051801,0.044587,0.066842,0.068415,0.053703,0.074669,0.142736,0.09717,0.125785,0.138205,0.099743


In [305]:
# Show the first test rows, e.g. to compare their device_id order with pred_fin above.
gatest.head()

Unnamed: 0_level_0,row,brand,model,nid_order,to_group_bm
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1002079943728939269,0,51,857,0.0,51.857
-1547860181818787117,1,51,860,9e-06,51.86
7374582448058474277,2,31,717,1.8e-05,31.717
-6220210354783429585,3,31,735,2.7e-05,31.735
-5893464122623104785,4,51,843,3.6e-05,51.843


In [306]:
# Write the submission; the device_id index is included (to_csv's default).
pred_fin.to_csv('sub5.csv')

In [307]:
# Compress the submission with the system `zip` tool (IPython shell escape).
!zip sub5.zip sub5.csv

  adding: sub5.csv (deflated 56%)
