In [1]:
# coding=utf8
# Based on yibo's R script

import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.decomposition import TruncatedSVD, SparsePCA
try:
    # scikit-learn >= 0.18; the old cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import log_loss

In [2]:
# Maximum number of rows read from app_events.csv and events.csv below
# (passed as `nrows=`); presumably a sampling cap to keep iteration fast.
NROWS = 10000

In [3]:
# Load at most NROWS rows of the app-event log.
# The builtin `str` replaces the `np.str` alias, which NumPy deprecated in
# 1.20 and removed in 1.24; the two were the same object, so results match.
app_ev = pd.read_csv("./input/app_events.csv",
                     dtype={'device_id': str},
                     nrows=NROWS
                    )
# Collapse every event to one space-separated string of its distinct
# "app_id:<id>" tokens; the result is a Series indexed by event_id.
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda ids: " ".join({"app_id:" + str(app) for app in ids}))

In [4]:
# Preview the first per-event token strings (a Series indexed by event_id).
app_ev.head()

event_id
2     app_id:8693964245073640147 app_id:434865995276...
6     app_id:8693964245073640147 app_id:175704400052...
7     app_id:8693964245073640147 app_id:-10140063694...
9     app_id:8693964245073640147 app_id:-51663306906...
16    app_id:628020936226491308 app_id:-737700447902...
Name: app_id, dtype: object

In [5]:
# Load at most NROWS rows of the events table, keeping device_id as text.
# The builtin `str` replaces the `np.str` alias (removed in NumPy 1.24).
events = pd.read_csv("./input/events.csv",
                    dtype={'device_id': str},
                    nrows=NROWS
                    )
# Look up each event's app-token string by event_id; events with no entry
# in app_ev get NaN here.
events["app_id"] = events["event_id"].map(app_ev)

In [6]:
# Preview events after attaching app tokens; unmatched events show NaN.
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,app_id:8693964245073640147 app_id:434865995276...
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,


In [7]:
# Keep only events that have both a device and at least one app token.
# The check is limited to the two columns used downstream, so an event is
# no longer discarded merely because timestamp/longitude/latitude is missing.
events = events.dropna(subset=["device_id", "app_id"])

In [8]:
# Preview the events that remain after dropping rows with missing values.
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,app_id:8693964245073640147 app_id:434865995276...
5,6,1476664663289716375,2016-05-01 00:27:21,0.0,0.0,app_id:8693964245073640147 app_id:175704400052...
6,7,5990807147117726237,2016-05-01 00:15:13,113.73,23.0,app_id:8693964245073640147 app_id:-10140063694...
8,9,-2073340001552902943,2016-05-01 00:15:33,0.0,0.0,app_id:8693964245073640147 app_id:-51663306906...
15,16,9070651185984875886,2016-05-01 00:06:06,0.0,0.0,app_id:628020936226491308 app_id:-737700447902...


In [9]:
# app_ev has already been mapped onto `events` and is not referenced again
# in this notebook; drop the name so its memory can be reclaimed.
del app_ev

In [10]:
# Only the device id and its app tokens are needed from here on.
events = events.loc[:, ["device_id", "app_id"]]

In [11]:
def _merge_app_tokens(token_strings):
    """Union the space-separated tokens of several strings into one string."""
    merged = " ".join(str(s) for s in token_strings)
    return " ".join(set(merged.split(" ")))


# One row per device: all of its events' app tokens, de-duplicated.
events = (events.groupby("device_id")["app_id"]
          .apply(_merge_app_tokens)
          .reset_index(name="app_id"))

In [12]:
# Preview the per-device frame: one row per device_id with its app tokens.
events.head()

Unnamed: 0,device_id,app_id
0,-1084174363886138500,app_id:8693964245073640147 app_id:-32034847784...
1,-1153910750523975031,app_id:8693964245073640147 app_id:628020936226...
2,-1340161279060514090,app_id:7971327257373737575 app_id:874818657222...
3,-1558314595648377026,app_id:8693964245073640147 app_id:-51663306906...
4,-1593861387409811850,app_id:8693964245073640147 app_id:893729596138...


In [15]:
# Expand to long format: one (app_id, device_id) row per app token.
# The pairs are collected in a single pass instead of building one Series
# per device and concatenating them; this avoids the per-row iterrows()
# overhead and also yields an empty, correctly-labelled frame when there
# are no events (pd.concat raises on an empty list).
pairs = [(token, dev)
         for dev, tokens in zip(events["device_id"], events["app_id"])
         for token in tokens.split(' ')]
events = pd.DataFrame(pairs, columns=['app_id', 'device_id'])

In [16]:
# Preview the long-format frame: one (app_id, device_id) pair per row.
events.head()

Unnamed: 0,app_id,device_id
0,app_id:8693964245073640147,-1084174363886138500
1,app_id:-3203484778499260135,-1084174363886138500
2,app_id:7971327257373737575,-1084174363886138500
3,app_id:3364032031731117644,-1084174363886138500
4,app_id:-7377004479023402858,-1084174363886138500


In [17]:
# Load the phone brand / device model table, keeping device_id as text.
# The builtin `str` replaces the `np.str` alias (removed in NumPy 1.24).
pbd = pd.read_csv("./input/phone_brand_device_model.csv",
                  dtype={'device_id': str})
# Keep the first record per device so the later left-merge on device_id
# cannot multiply rows.
pbd = pbd.drop_duplicates('device_id', keep='first')

In [18]:
# Preview the de-duplicated brand/model table.
pbd.head()

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [20]:
# Load the labelled devices, keeping device_id as text.
# The builtin `str` replaces the `np.str` alias (removed in NumPy 1.24).
train = pd.read_csv("./input/gender_age_train.csv",
                    dtype={'device_id': str})
# Only `group` is used as the target; drop the age and gender columns.
train = train.drop(["age", "gender"], axis=1)

In [21]:
# Preview the training frame (device_id and target group).
train.head()

Unnamed: 0,device_id,group
0,-8076087639492063270,M32-38
1,-2897161552818060146,M32-38
2,-8260683887967679142,M32-38
3,-4938849341048082022,M29-31
4,245133531816851882,M29-31


In [22]:
# Load the unlabelled test devices, keeping device_id as text.
# The builtin `str` replaces the `np.str` alias (removed in NumPy 1.24).
test = pd.read_csv("./input/gender_age_test.csv",
                   dtype={'device_id': str})
# Add an all-NaN `group` column so test has the same columns as train.
test["group"] = np.nan
test.head()

Unnamed: 0,device_id,group
0,1002079943728939269,
1,-1547860181818787117,
2,7374582448058474277,
3,-6220210354783429585,
4,-5893464122623104785,


In [23]:
# Number of training rows.
split_len = train.shape[0]

In [24]:
# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

In [25]:
# Stack train on top of test under a fresh 0..n-1 index.
Df = pd.concat([train, test], axis=0, ignore_index=True)

In [26]:
# Preview the stacked train+test frame.
Df.head()

Unnamed: 0,device_id,group
0,-8076087639492063270,M32-38
1,-2897161552818060146,M32-38
2,-8260683887967679142,M32-38
3,-4938849341048082022,M29-31
4,245133531816851882,M29-31


In [27]:
# Attach brand/model to every device; devices missing from pbd get NaN.
Df = Df.merge(pbd, how="left", on="device_id")

# Prefix each value with its column name so that values from different
# columns stay distinct once they share one feature vocabulary
# (NaN becomes e.g. "phone_brand:nan").
for column in ("phone_brand", "device_model"):
    prefix = column + ":"
    Df[column] = [prefix + str(value) for value in Df[column]]

In [29]:
# Preview the merged frame with the prefixed brand/model values.
Df.head()

Unnamed: 0,device_id,group,phone_brand,device_model
0,-8076087639492063270,M32-38,phone_brand:小米,device_model:MI 2
1,-2897161552818060146,M32-38,phone_brand:小米,device_model:MI 2
2,-8260683887967679142,M32-38,phone_brand:小米,device_model:MI 2
3,-4938849341048082022,M29-31,phone_brand:小米,device_model:红米note
4,245133531816851882,M29-31,phone_brand:小米,device_model:MI 3


In [31]:
# Three two-column frames of (device_id, <value>), one per feature family.
f1 = Df.loc[:, ["device_id", "phone_brand"]]     # phone_brand
f2 = Df.loc[:, ["device_id", "device_model"]]    # device_model
f3 = events.loc[:, ["device_id", "app_id"]]      # app_id

In [32]:
# The needed columns now live in f1/f2; Df is not referenced again in this
# notebook, so drop the name to let its memory be reclaimed.
del Df

In [33]:

# Give the value column of every frame the shared name "feature" so the
# three can be stacked into one long (device_id, feature) table.
# rename() is used instead of writing into `columns.values`: that wrote
# straight into the Index's backing array, which pandas treats as immutable,
# so it is not a supported way to relabel columns.
f1 = f1.rename(columns={"phone_brand": "feature"})
f2 = f2.rename(columns={"device_model": "feature"})
f3 = f3.rename(columns={"app_id": "feature"})

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)

In [34]:
# Show 10 randomly chosen (device_id, feature) rows; no random_state is
# passed, so the displayed rows differ from run to run.
FLS.sample(10)

Unnamed: 0,device_id,feature
290303,1478289665726762368,device_model:Galaxy S3
183449,-1740163140345439817,phone_brand:华为
27898,3357290046875428650,phone_brand:OPPO
252040,8534949981820422389,device_model:Galaxy S3
144834,7194221327598975046,phone_brand:三星
339883,8703457607761181981,device_model:大神X7
365041,-4552617722880659442,device_model:全魔王手机
325868,-5478550138250468036,device_model:Ascend P7
249639,-2587207702144661161,device_model:Galaxy Note 3
89730,8414830126196442859,phone_brand:小米


In [35]:
# Distinct devices and distinct feature tokens; their counts give the
# dimensions of the device x feature matrix built next.
device_ids = pd.unique(FLS["device_id"])
feature_cs = pd.unique(FLS["feature"])

In [36]:
# Encode devices as row indices and feature tokens as column indices.
# `dec` is kept because it is reused to look up train/test rows later.
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])

# One entry of 1.0 per (device, feature) row of FLS.
data = np.ones(len(FLS))
matrix_shape = (len(device_ids), len(feature_cs))
sparse_matrix = sparse.csr_matrix((data, (row, col)), shape=matrix_shape)

# Keep only the columns that contain at least one stored entry.
nonempty_cols = sparse_matrix.getnnz(0) > 0
sparse_matrix = sparse_matrix[:, nonempty_cols]

In [40]:
# Number of stored entries in each feature column.
sparse_matrix.getnnz(0)

array([    1,     1,     1, ..., 11816,     3,    86])

In [41]:
# Look up the matrix rows that belong to the train and the test devices.
train_row, test_row = (dec.transform(frame["device_id"])
                       for frame in (train, test))
train_sp, test_sp = sparse_matrix[train_row, :], sparse_matrix[test_row, :]

# Hold out 10% of the labelled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, random_state=10, train_size=.90)

In [40]:
# Boolean mask of the feature columns that contain at least one stored entry.
sparse_matrix.getnnz(0) >0

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [43]:
# Keep the top 23% of features by ANOVA F-score. The selector is fitted on
# the training split only and then applied to every matrix.
# NOTE: this cell overwrites its own inputs, so running it twice would
# select from the already-reduced matrices.
selector = SelectPercentile(f_classif, percentile=23).fit(X_train, y_train)

X_train, X_val = selector.transform(X_train), selector.transform(X_val)
train_sp, test_sp = selector.transform(train_sp), selector.transform(test_sp)



In [44]:
# Number of features (columns) left after selection.
X_train.shape[1]

863

In [46]:
# Wrap the training and validation splits, with labels, for xgboost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)

In [47]:
# Multiclass linear booster, scored by multiclass log-loss.
# - num_class is taken from the fitted label encoder rather than being
#   hard-coded, so it always matches the labels actually present.
# - "max_depth" was removed: it is a tree-booster setting and has no effect
#   when booster is "gblinear".
params = {
    "objective": "multi:softprob",
    "num_class": len(lable_group.classes_),
    "booster": "gblinear",
    "eval_metric": "mlogloss",
    "eta": 0.07,
    "silent": 1,
    "alpha": 3,
}

# Train for up to 40 rounds, reporting both splits each round; stop early if
# the last watchlist entry (eval) has not improved for 25 rounds.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist,
                early_stopping_rounds=25, verbose_eval=True)

Will train until eval error hasn't decreased in 25 rounds.
[0]	train-mlogloss:2.468892	eval-mlogloss:2.470665
[1]	train-mlogloss:2.455580	eval-mlogloss:2.459051
[2]	train-mlogloss:2.444629	eval-mlogloss:2.449542
[3]	train-mlogloss:2.435433	eval-mlogloss:2.441700
[4]	train-mlogloss:2.427757	eval-mlogloss:2.435194
[5]	train-mlogloss:2.421130	eval-mlogloss:2.429784
[6]	train-mlogloss:2.415604	eval-mlogloss:2.425247
[7]	train-mlogloss:2.410758	eval-mlogloss:2.421452
[8]	train-mlogloss:2.406650	eval-mlogloss:2.418265
[9]	train-mlogloss:2.403081	eval-mlogloss:2.415569
[10]	train-mlogloss:2.400021	eval-mlogloss:2.413291
[11]	train-mlogloss:2.397284	eval-mlogloss:2.411361
[12]	train-mlogloss:2.394866	eval-mlogloss:2.409735
[13]	train-mlogloss:2.392821	eval-mlogloss:2.408338
[14]	train-mlogloss:2.391006	eval-mlogloss:2.407150
[15]	train-mlogloss:2.389378	eval-mlogloss:2.406129
[16]	train-mlogloss:2.387954	eval-mlogloss:2.405275
[17]	train-mlogloss:2.386594	eval-mlogloss:2.404524
[18]	train-mlog

In [48]:
# Refit on all labelled rows with the same parameters and round count, then
# predict class probabilities for the test devices.
dtrain = xgb.DMatrix(train_sp, label=Y)
gbm = xgb.train(params, dtrain, 40, verbose_eval=True)
dtest = xgb.DMatrix(test_sp)
y_pre = gbm.predict(dtest)

In [49]:
# Submission table: one probability column per group name, indexed by the
# test device ids, written as a gzip-compressed CSV.
result = (pd.DataFrame(y_pre, columns=lable_group.classes_)
          .assign(device_id=device_id)
          .set_index("device_id"))
result.to_csv('fine_tune.gz', compression="gzip",
              index=True, index_label='device_id')