In [185]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import datetime
import time
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectPercentile, f_classif

In [3]:
# Phone brand
print('# Read brands')
pbd = pd.read_csv("phone_brand_device_model.csv", dtype={'device_id': np.str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)

# Read brands


In [4]:
pbd.head()

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [5]:
# App Labels
print("# Read App Labels")
app_lab = pd.read_csv("app_labels.csv", dtype={'device_id': np.str})
app_lab = app_lab.groupby("app_id")["label_id"].apply(lambda x: " ".join(str(s) for s in x))

# Read App Labels


In [6]:
app_lab.head()

app_id
-9223281467940916832                796 795 794 405
-9222877069545393219                            135
-9222785464897897681                812 795 794 405
-9222198347540756780                810 795 794 405
-9221970424041518544    714 704 548 813 795 794 405
Name: label_id, dtype: object

In [7]:
# App Events
print("# Read App Events")
app_ev = pd.read_csv("app_events.csv", dtype={'device_id': np.str})
app_ev["app_lab"] = app_ev["app_id"].map(app_lab)
app_ev = app_ev.groupby("event_id")["app_lab"].apply(lambda x: " ".join(str(s) for s in x))

# Read App Events


In [8]:
app_ev.head()

event_id
2     549 710 704 548 172 721 704 548 302 303 251 26...
6     549 721 704 302 303 548 183 713 704 548 549 71...
7     549 721 704 548 548 549 186 721 704 548 303 30...
9     549 721 704 302 303 548 183 549 721 704 548 40...
16    549 721 704 302 303 548 183 549 713 704 405 54...
Name: app_lab, dtype: object

In [140]:
# Events
parser = lambda timestamp: pd.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
print("# Read Events")
events = pd.read_csv("events.csv", dtype={'device_id': np.str}, parse_dates=[2], date_parser=parser)
events['timestamp'] = events['timestamp'].apply(lambda x: time.mktime(x.timetuple()))
events.head()

# Read Events


Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,1462060525,121.38,31.24
1,2,-6401643145415154744,1462060452,103.65,30.97
2,3,-4833982096941402721,1462057685,106.6,29.7
3,4,-6815121365017318426,1462057600,104.27,23.28
4,5,-5373797595892518570,1462057638,115.88,28.66


In [141]:
events["app_lab"] = events["event_id"].map(app_ev)
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_lab
0,1,29182687948017175,1462060525,121.38,31.24,
1,2,-6401643145415154744,1462060452,103.65,30.97,549 710 704 548 172 721 704 548 302 303 251 26...
2,3,-4833982096941402721,1462057685,106.6,29.7,
3,4,-6815121365017318426,1462057600,104.27,23.28,
4,5,-5373797595892518570,1462057638,115.88,28.66,


In [142]:
#events[events['device_id'] == '-1000667340060427374']

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_lab
7868,7869,-1000667340060427374,1462522258,116.46,40.00,
22949,22950,-1000667340060427374,1462131814,116.69,39.87,
36493,36494,-1000667340060427374,1462260147,116.69,39.87,
60224,60225,-1000667340060427374,1462651315,0.00,0.00,713 704 548 163 158 549 721 704 302 303 917 91...
104194,104195,-1000667340060427374,1462131534,116.69,39.87,713 704 548 163 158 549 721 704 302 303 917 91...
104393,104394,-1000667340060427374,1462131880,116.69,39.87,
106517,106518,-1000667340060427374,1462474876,0.00,0.00,713 704 548 163 158 1007 128 128 130 756 761 7...
119876,119877,-1000667340060427374,1462659907,116.69,39.87,
177122,177123,-1000667340060427374,1462622469,0.00,0.00,713 704 548 163 158 549 721 704 302 303 917 91...
185009,185010,-1000667340060427374,1462131904,116.69,39.87,


In [143]:
put_together = lambda x: " ".join(str(s) for s in x)
events = events.groupby("device_id", as_index=False).agg({
    "app_lab": put_together, "timestamp": put_together, "longitude": put_together, 'latitude': put_together})

In [144]:
events.head()

Unnamed: 0,device_id,latitude,timestamp,longitude,app_lab
0,-100015673884079572,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....,1462434639.0 1462489551.0 1462604498.0 1462426...,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....,302 303 1012 130 548 549 405 730 756 761 777 7...
1,-1000458529741848912,0.0,1462410318.0,0.0,151 549 713 724 302 303 548 704 548 549 302 30...
2,-1000667340060427374,40.0 39.87 39.87 0.0 39.87 39.87 0.0 39.87 0.0...,1462522258.0 1462131814.0 1462260147.0 1462651...,116.46 116.69 116.69 0.0 116.69 116.69 0.0 116...,nan nan nan 713 704 548 163 158 549 721 704 30...
3,-100098646088222553,23.56 23.56 23.56 23.56,1462350732.0 1462350539.0 1462350579.0 1462350...,103.52 103.52 103.52 103.52,nan nan nan 128 847 128 847 130 549 718 704 54...
4,-100101996136889832,23.07 23.07 23.07 23.07 23.07 23.07 23.07 23.0...,1462657912.0 1462657805.0 1462657976.0 1462658...,114.4 114.4 114.4 114.4 114.4 114.4 114.4 114....,nan 959 960 548 1007 548 549 251 262 549 721 7...


In [188]:
# Test and Train
print("# Generate Train and Test")
train = pd.read_csv("gender_age_train.csv", dtype={'device_id': np.str})
train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
train = pd.merge(train, events, how='left', on='device_id', left_index=True)
train.head()

# Generate Train and Test


Unnamed: 0,device_id,gender,age,group,phone_brand,device_model,latitude,timestamp,longitude,app_lab
60864,-8076087639492063270,M,35,M32-38,小米,MI 2,,,,
60864,-2897161552818060146,M,35,M32-38,小米,MI 2,,,,
26625,-8260683887967679142,M,35,M32-38,小米,MI 2,0.0,1462109017.0,0.0,713 704 548 713 704 548 163 158 551 552 555 55...
60864,-4938849341048082022,M,30,M29-31,小米,红米note,,,,
60864,245133531816851882,M,30,M29-31,小米,MI 3,,,,


In [189]:
test = pd.read_csv("gender_age_test.csv", dtype={'device_id': np.str})
test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
test = pd.merge(test, events, how='left', on='device_id', left_index=True)
test.head()

Unnamed: 0,device_id,phone_brand,device_model,latitude,timestamp,longitude,app_lab
30492,1002079943728939269,小米,小米note,0.0 0.0 0.0 0.0 0.0 0.0 0.0,1462305989.0 1462482916.0 1462173602.0 1462116...,0.0 0.0 0.0 0.0 0.0 0.0 0.0,549 710 704 548 172 179 1017 562 564 251 691 1...
1962,-1547860181818787117,小米,红米2,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0,1462303266.0 1462106230.0 1462105900.0 1462210...,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0,713 704 548 252 691 713 704 548 751 755 775 78...
53772,7374582448058474277,华为,Y523-L176,0.0 0.0 0.0 0.0 0.0,1462654687.0 1462203934.0 1462384855.0 1462203...,0.0 0.0 0.0 0.0 0.0,711 714 548 704 813 795 794 405 27 549 710 704...
19086,-6220210354783429585,华为,荣耀6,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0,1462573824.0 1462058073.0 1462621519.0 1462430...,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0,549 713 704 405 548 730 731 732 774 778 783 10...
60864,-5893464122623104785,小米,MI 2,,,,


In [190]:
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)

device_id = test["device_id"].values

In [191]:
df = pd.concat((train, test), axis=0, ignore_index=True)
split_len = len(train)

# TF-IDF Feature
# tfv = TfidfVectorizer(min_df=1)
tfv = CountVectorizer(min_df=1, binary=0)
df = df[["phone_brand", "device_model", "app_lab"]].astype(np.str).apply(
    lambda x: " ".join(s for s in x), axis=1).fillna("Missing")
df_tfv = tfv.fit_transform(df)

train = df_tfv[:split_len, :]
test = df_tfv[split_len:, :]

In [194]:
train = normalize(train, copy=False)
test = normalize(test, copy=False)
print train

  (0, 1854)	0.57735026919
  (0, 1166)	0.57735026919
  (0, 1205)	0.57735026919
  (1, 1854)	0.57735026919
  (1, 1166)	0.57735026919
  (1, 1205)	0.57735026919
  (2, 1854)	0.0181101183238
  (2, 1166)	0.0181101183238
  (2, 353)	0.199211301561
  (2, 338)	0.488973194741
  (2, 266)	0.633854141331
  (2, 68)	0.0181101183238
  (2, 62)	0.0181101183238
  (2, 269)	0.0181101183238
  (2, 270)	0.0181101183238
  (2, 272)	0.0181101183238
  (2, 274)	0.0181101183238
  (2, 207)	0.14488094659
  (2, 203)	0.235431538209
  (2, 507)	0.0181101183238
  (2, 506)	0.0181101183238
  (2, 439)	0.0362202366475
  (2, 227)	0.0905505916188
  (2, 188)	0.0181101183238
  (2, 344)	0.0181101183238
  :	:
  (74644, 426)	0.0602891163612
  (74644, 430)	0.0301445581806
  (74644, 76)	0.0904336745418
  (74644, 5)	0.0602891163612
  (74644, 173)	0.0964625861779
  (74644, 404)	0.0301445581806
  (74644, 416)	0.0301445581806
  (74644, 424)	0.0301445581806
  (74644, 9)	0.0301445581806
  (74644, 431)	0.0301445581806
  (74644, 359)	0.030144558



In [195]:
##################
#   Feature Sel
##################
print("# Feature Selection")

X_train, X_val, y_train, y_val = train_test_split(train, Y, train_size=.80)

print("# Num of Features: ", X_train.shape[1])

selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)
test = selector.transform(test)

print("# Num of Features: ", X_train.shape[1])

# Feature Selection
('# Num of Features: ', 2045)
('# Num of Features: ', 470)


In [197]:
##################
#     XGBoost
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 6,
    "eval_metric": "mlogloss",
    "eta": 0.1,
    "silent": 1,
    "alpha": 3,
}
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist, verbose_eval=True)

[0]	train-mlogloss:2.43568	eval-mlogloss:2.43534
[1]	train-mlogloss:2.41452	eval-mlogloss:2.41466
[2]	train-mlogloss:2.40098	eval-mlogloss:2.40147
[3]	train-mlogloss:2.39132	eval-mlogloss:2.39202
[4]	train-mlogloss:2.38403	eval-mlogloss:2.38487
[5]	train-mlogloss:2.37834	eval-mlogloss:2.37929
[6]	train-mlogloss:2.3738	eval-mlogloss:2.37484
[7]	train-mlogloss:2.37011	eval-mlogloss:2.37123
[8]	train-mlogloss:2.36706	eval-mlogloss:2.36825
[9]	train-mlogloss:2.36451	eval-mlogloss:2.36576
[10]	train-mlogloss:2.36233	eval-mlogloss:2.36366
[11]	train-mlogloss:2.36046	eval-mlogloss:2.36186
[12]	train-mlogloss:2.35884	eval-mlogloss:2.36031
[13]	train-mlogloss:2.35742	eval-mlogloss:2.35897
[14]	train-mlogloss:2.35616	eval-mlogloss:2.35778
[15]	train-mlogloss:2.35504	eval-mlogloss:2.35674
[16]	train-mlogloss:2.35404	eval-mlogloss:2.35582
[17]	train-mlogloss:2.35314	eval-mlogloss:2.35499
[18]	train-mlogloss:2.35233	eval-mlogloss:2.35425
[19]	train-mlogloss:2.35159	eval-mlogloss:2.35357
[20]	train-

In [157]:
y_pre = gbm.predict(xgb.DMatrix(test))
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('test_Result.csv', index=True, index_label='device_id')