In [48]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss

random.seed(11111)

def run_xgb(train, test, features, target, random_state=0):
    eta = 0.01
    max_depth = 5
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 10000
    early_stopping_rounds = 50
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score)+ '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
    total = 0
    test_val = test['device_id'].values
    for i in range(len(test_val)):
        str1 = str(test_val[i])
        for j in range(12):
            str1 += ',' + str(prediction[i][j])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()


def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table


def read_train_test():
    # Events
    print('Read events...')
    events = pd.read_csv("events.csv", dtype={'device_id': np.str})
    events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
    events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')

    # Phone brand
    print('Read brands...')
    brands = pd.read_csv("phone_brand_device_model.csv", dtype={'device_id': np.str})
    brands.drop_duplicates('device_id', keep='first', inplace=True)
    brands = pd.get_dummies(brands, columns = ['phone_brand','device_model'])

    
    # Train
    print('Read train...')
    train = pd.read_csv("gender_age_train.csv", dtype={'device_id': np.str})
    train = map_column(train,'group')
    train = train.drop(['age'], axis=1)
    train = train.drop(['gender'], axis=1)
    train = pd.merge(train, brands, how='left', on='device_id', left_index=True)
    train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)
    train.fillna(-1, inplace=True)

    # Test
    print('Read test...')
    test = pd.read_csv("gender_age_test.csv", dtype={'device_id': np.str})
    test = pd.merge(test, brands, how='left', on='device_id', left_index=True)
    test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)
    test.fillna(-1, inplace=True)

    # Features
    features = list(test.columns.values)
    features.remove('device_id')

    return train, test, features


train, test, features = read_train_test()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
test_prediction, score = run_xgb(train, test, features, 'group')
print("LS: {}".format(round(score, 5)))
create_submission(score, test, test_prediction)

Read events...
Read brands...
Read train...
Read test...
('Length of train: ', 74645)
('Length of test: ', 112071)
Features [1731]: ['counts', 'device_model_1100', 'device_model_1105', 'device_model_1107', 'device_model_2', 'device_model_201', 'device_model_2016\xe7\x89\x88 Galaxy A5', 'device_model_2016\xe7\x89\x88 Galaxy A7', 'device_model_2016\xe7\x89\x88 Galaxy A9', 'device_model_2016\xe7\x89\x88 Galaxy J7', 'device_model_210R', 'device_model_2C', 'device_model_3', 'device_model_3000', 'device_model_3005', 'device_model_3007', 'device_model_302U', 'device_model_3S', 'device_model_5200', 'device_model_5200S', 'device_model_5216D', 'device_model_5216s', 'device_model_5217', 'device_model_5218S', 'device_model_5219', 'device_model_5261', 'device_model_5263', 'device_model_5263S', 'device_model_5310', 'device_model_5311', 'device_model_5313S', 'device_model_5315', 'device_model_5316', 'device_model_5360', 'device_model_5832', 'device_model_5860A', 'device_model_5860S', 'device_model_58

Will train until eval error hasn't decreased in 50 rounds.
[0]	train-mlogloss:2.483959	eval-mlogloss:2.483933
[1]	train-mlogloss:2.482941	eval-mlogloss:2.482965
[2]	train-mlogloss:2.482006	eval-mlogloss:2.482021
[3]	train-mlogloss:2.481030	eval-mlogloss:2.481082
[4]	train-mlogloss:2.480077	eval-mlogloss:2.480118
[5]	train-mlogloss:2.479101	eval-mlogloss:2.479171
[6]	train-mlogloss:2.478215	eval-mlogloss:2.478292
[7]	train-mlogloss:2.477354	eval-mlogloss:2.477415
[8]	train-mlogloss:2.476462	eval-mlogloss:2.476551
[9]	train-mlogloss:2.475543	eval-mlogloss:2.475633
[10]	train-mlogloss:2.474656	eval-mlogloss:2.474789
[11]	train-mlogloss:2.473844	eval-mlogloss:2.473966
[12]	train-mlogloss:2.472955	eval-mlogloss:2.473082
[13]	train-mlogloss:2.472066	eval-mlogloss:2.472216
[14]	train-mlogloss:2.471196	eval-mlogloss:2.471374
[15]	train-mlogloss:2.470318	eval-mlogloss:2.470519
[16]	train-mlogloss:2.469468	eval-mlogloss:2.469676
[17]	train-mlogloss:2.468637	eval-mlogloss:2.468848
[18]	train-mlog

Validating...
Predict test set...
Training time: 8.71 minutes
LS: 2.37824
('Writing submission: ', 'submission_2.37824264176.csv')


In [34]:
print('hi')


hi


In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import log_loss

# Create bag-of-apps in character string format
# first by event
# then merge to generate larger bags by device

##################
#   App Events
##################
print("# Read App Events")
app_ev = pd.read_csv("app_events.csv", dtype={'device_id': np.str})
# remove duplicates(app_id)
app_ev = app_ev.groupby("event_id")["app_id"].apply(
    lambda x: " ".join(set("app_id:" + str(s) for s in x)))

##################
#     Events
##################
print("# Read Events")
events = pd.read_csv("events.csv", dtype={'device_id': np.str})
events["app_id"] = events["event_id"].map(app_ev)

events = events.dropna()

del app_ev

events = events[["device_id", "app_id"]]

# remove duplicates(app_id)
events = events.groupby("device_id")["app_id"].apply(
    lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" "))))
events = events.reset_index(name="app_id")

# expand to multiple rows
events = pd.concat([pd.Series(row['device_id'], row['app_id'].split(' '))
                    for _, row in events.iterrows()]).reset_index()
events.columns = ['app_id', 'device_id']

##################
#   Phone Brand
##################
print("# Read Phone Brand")
pbd = pd.read_csv("phone_brand_device_model.csv",
                  dtype={'device_id': np.str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)


# Read App Events
# Read Events
# Read Phone Brand
# Generate Train and Test
# User-Item-Feature
# Feature Selection
('# Num of Features: ', 3145)


Will train until eval error hasn't decreased in 100 rounds.
[0]	train-mlogloss:2.425803	eval-mlogloss:2.433603
[1]	train-mlogloss:2.396579	eval-mlogloss:2.408893
[2]	train-mlogloss:2.375306	eval-mlogloss:2.391424
[3]	train-mlogloss:2.358278	eval-mlogloss:2.377790
[4]	train-mlogloss:2.344011	eval-mlogloss:2.366587
[5]	train-mlogloss:2.331691	eval-mlogloss:2.357116
[6]	train-mlogloss:2.320893	eval-mlogloss:2.348967
[7]	train-mlogloss:2.311298	eval-mlogloss:2.341870
[8]	train-mlogloss:2.302695	eval-mlogloss:2.335638
[9]	train-mlogloss:2.294933	eval-mlogloss:2.330126
[10]	train-mlogloss:2.287875	eval-mlogloss:2.325228
[11]	train-mlogloss:2.281454	eval-mlogloss:2.320856
[12]	train-mlogloss:2.275564	eval-mlogloss:2.316942
[13]	train-mlogloss:2.270142	eval-mlogloss:2.313422
[14]	train-mlogloss:2.265153	eval-mlogloss:2.310246
[15]	train-mlogloss:2.260525	eval-mlogloss:2.307378
[16]	train-mlogloss:2.256242	eval-mlogloss:2.304780
[17]	train-mlogloss:2.252247	eval-mlogloss:2.302422
[18]	train-mlo

# Train


In [22]:


##################
#  Train and Test
##################
print("# Generate Train and Test")

train = pd.read_csv("gender_age_train.csv",
                    dtype={'device_id': np.str})
train.drop(["age", "gender"], axis=1, inplace=True)

test = pd.read_csv("gender_age_test.csv",
                   dtype={'device_id': np.str})
test["group"] = np.nan


split_len = len(train)

# Group Labels
Y = train["group"]
lable_group = LabelEncoder()
Y = lable_group.fit_transform(Y)
device_id = test["device_id"]

# Concat
Df = pd.concat((train, test), axis=0, ignore_index=True)

Df = pd.merge(Df, pbd, how="left", on="device_id")
Df["phone_brand"] = Df["phone_brand"].apply(lambda x: "phone_brand:" + str(x))
Df["device_model"] = Df["device_model"].apply(
    lambda x: "device_model:" + str(x))


###################
#  Concat Feature
###################

f1 = Df[["device_id", "phone_brand"]]   # phone_brand
f2 = Df[["device_id", "device_model"]]  # device_model
f3 = events[["device_id", "app_id"]]    # app_id

del Df

f1.columns.values[1] = "feature"
f2.columns.values[1] = "feature"
f3.columns.values[1] = "feature"

FLS = pd.concat((f1, f2, f3), axis=0, ignore_index=True)


###################
# User-Item Feature
###################
print("# User-Item-Feature")

device_ids = FLS["device_id"].unique()
feature_cs = FLS["feature"].unique()

data = np.ones(len(FLS))
dec = LabelEncoder().fit(FLS["device_id"])
row = dec.transform(FLS["device_id"])
col = LabelEncoder().fit_transform(FLS["feature"])
sparse_matrix = sparse.csr_matrix(
    (data, (row, col)), shape=(len(device_ids), len(feature_cs)))

sparse_matrix = sparse_matrix[:, sparse_matrix.getnnz(0) > 0]

##################
#      Data
##################

train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

X_train, X_val, y_train, y_val = train_test_split(
    train_sp, Y, train_size=.90, random_state=10)


##################
#   Feature Sel
##################
print("# Feature Selection")
selector = SelectPercentile(f_classif, percentile=23)

selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)

params = {
    "objective": "multi:softprob",
    "num_class": 12,
    "booster": "gblinear",
    "max_depth": 4,
    "eval_metric": "mlogloss",
    "eta": 0.1,
    "silent": 1,
    "alpha": 3,
}

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 1000, evals=watchlist,
                early_stopping_rounds=50, verbose_eval=True)

print("# Train")
dtrain = xgb.DMatrix(train_sp, Y)
gbm = xgb.train(params, dtrain, 1000, verbose_eval=True)
y_pre = gbm.predict(xgb.DMatrix(test_sp))

# Write results
result = pd.DataFrame(y_pre, columns=lable_group.classes_)
result["device_id"] = device_id
result = result.set_index("device_id")
result.to_csv('new_test.csv', index=True,
              index_label='device_id')


# Generate Train and Test
# User-Item-Feature
# Feature Selection
('# Num of Features: ', 4823)


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-mlogloss:2.397776	eval-mlogloss:2.410163
[1]	train-mlogloss:2.358255	eval-mlogloss:2.378126
[2]	train-mlogloss:2.330738	eval-mlogloss:2.356951
[3]	train-mlogloss:2.309720	eval-mlogloss:2.341510
[4]	train-mlogloss:2.292903	eval-mlogloss:2.329682
[5]	train-mlogloss:2.279073	eval-mlogloss:2.320390
[6]	train-mlogloss:2.267483	eval-mlogloss:2.312913
[7]	train-mlogloss:2.257625	eval-mlogloss:2.306854
[8]	train-mlogloss:2.249143	eval-mlogloss:2.301884
[9]	train-mlogloss:2.241797	eval-mlogloss:2.297830
[10]	train-mlogloss:2.235373	eval-mlogloss:2.294460
[11]	train-mlogloss:2.229717	eval-mlogloss:2.291671
[12]	train-mlogloss:2.224712	eval-mlogloss:2.289370
[13]	train-mlogloss:2.220256	eval-mlogloss:2.287459
[14]	train-mlogloss:2.216272	eval-mlogloss:2.285873
[15]	train-mlogloss:2.212683	eval-mlogloss:2.284560
[16]	train-mlogloss:2.209453	eval-mlogloss:2.283468
[17]	train-mlogloss:2.206527	eval-mlogloss:2.282582
[18]	train-mlog

# Train
