<a href="https://www.kaggle.com/c/talkingdata-mobile-user-demographics/data">Talking Data on Kaggle</a>
<img src="domain.png" width="600"/>


In [2]:
%matplotlib inline
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation, ensemble, preprocessing, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, confusion_matrix
submission = False

# Submission

In [3]:

def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table

def handleTimestamp(df):
    dateTimeStr = df.timestamp
    dateTimeStr = dateTimeStr.fillna("2016-05-03 05:04:58")
    datetime = pd.to_datetime(dateTimeStr)
    # create some new features
    df['year'] = datetime.dt.year.astype(int)
    df['month'] = datetime.dt.month.astype(int)
    df['dayofweek'] = datetime.dt.dayofweek.astype(int)
    df['day'] = datetime.dt.day.astype(int)
    df['hour'] = datetime.dt.hour.astype(int)
    df['minute'] = datetime.dt.minute.astype(int)
    
    df = df.drop(['timestamp'], axis=1)
    return df

# Plots the frequency count, figSize=(5,4)
def barPlot(df, col=None, N=None, figSize=None):
    if N is None:
        N = df[col].unique().shape[0]

    freqs = df[col].value_counts()[:N]
    index = np.arange(0,N)
    width=0.8
    fig, ax = plt.subplots(figsize=figSize)
    rects1 = ax.bar(index, freqs, width, color='red', alpha=0.5)
    plt.ylabel('Frequency')
    plt.title(col)
    plt.xticks(index + width/2., freqs.index)
    plt.xticks(rotation=60)

    plt.show()

In [4]:

print "Read Labels..."
# Merge the labels
appLabelsDF = pd.read_csv("data/app_labels.csv", low_memory=False, 
                  dtype= {'app_id': np.str, 'label_id': np.str, 
                          'is_installed': np.int, 'is_active':np.int})
lblCatsDF = pd.read_csv("data/label_categories.csv", low_memory=False, 
                  dtype= {'label_id': np.str})
lblCatsDF.category.fillna('unknown')
lblCatsDF.category = lblCatsDF.category.map(lambda x: str(x).lower())

appLabelsDF = appLabelsDF.merge(lblCatsDF, how="left", on="label_id")      
appLabelsDF = appLabelsDF.drop('label_id', axis=1)

# topCatCols = ['industry tag', 'property industry 2.0', 'property industry 1.0',
# 'custom label', 'services 1', 'and the church',
# 'internet banking', 'finance', 'p2p', 'low risk', 'p2p net loan',
# 'liquid medium', 'relatives 1', 'pay', 'high risk', '1 free',
# 'wealth management', 'personal effectiveness 1', 'im',
# 'higher income', 'financial', 'mobile bank', 'low income',
# 'debit and credit', 'online malls', 'low liquidity', 'video',
# 'pursue', 'financial services', 'low profitability',
# 'moderate profitability', 'high flow', 'fashion',
# 'third party payment', 'bank financing', 'direct banking',
# 'total cost 1', 'direct bank', 'science and technology', 'cozy 1',
# 'tencent', 'music', 'fixed income', 'smart shopping 1',
# 'securities', 'consumer finance', 'ds_p2p net loan', 'imf',
# 'air travel']

# catCols = ['Cat_'+x for x in topCatCols[:40]]

# print('Dummifying category labels...')
# appLabelsDF = pd.get_dummies(appLabelsDF, prefix='Cat', columns=['category'] )
# appLabelsDF = appLabelsDF[['app_id'] + catCols]
# appLabelsDF[catCols] = appLabelsDF[catCols].astype(int)

# catCols = [x for x in appLabelsDF.columns if "cat_" in x]
catCols = {'game':['game', 'gaming'], 
           'industry':['industry'], 
           'property':['property'],
           'service': ['service'], 
           'church':['church'], 
           'p2p':['p2p'], 
           'finance':['financ', 'financial', 'debit', 'credit', 'account', 'business'],
           'investment': ['fund', 'stock', 'loan', 'securities', 'insurance', 
                          'profit', 'risk', 'liquidity', 'futures'], 
           'im':['im'], 
           'relative':['relative'], 
           'news':['news'],
           'bank':['bank'], 
           'pursue':['pursue'],  
           'risk':['risk'], 
           'income':['income', 'pay'], 
           'fashion':['fashion', 'trend', 'cool' ], 
           'shop':['shop', 'price', 'groupon', 'free', 'mall'], 
           'science':['science', 'techno'], 
           'video':['video', 'film', 'movie', 'show'], 
           'music':['music', 'radio'], 
           'wealth':['wealth'], 
           'travel':['travel', 'tour', 'taxi', 'map', 'car', 'navigat', 'rail', 
                     'flight', 'bus', 'hotel', 'transport'],
           'education': ['educat', 'literature', 'class', 'exam', 'college' ],
           'health': ['health', 'sports', 'gym', 'exercise', 'vitality'],
           'photo':['photo', 'picture', ],
           'readers': ['read', 'blog', 'novel', 'book', 'comic'],
           'baby': ['baby', 'pregnan'],
           'medical':['medical']}

for catCol in catCols.keys():
    print "Processing: ", catCol
    for catKey in catCols[catCol]:
        appLabelsDF[catCol] = [1 if catKey in x else None for x in appLabelsDF.category]

appLabelsDF = appLabelsDF.groupby('app_id')[catCols.keys()].sum()
appLabelsDF.reset_index(inplace=True)

print('Merging with events...')
appEventsDF = pd.read_csv("data/app_events.csv", low_memory=False, 
                  dtype= {'app_id': np.str, 'event_id': np.str})
#appEventsDF = appEventsDF[appEventsDF.is_active==1]
appEventsDF = appEventsDF.merge(appLabelsDF, how="left", on="app_id")
appEventsDF = appEventsDF.groupby('event_id')[catCols.keys()].sum()
appEventsDF.reset_index(inplace=True)
print "App Events DF loaded..."

# Events
print('Read events...')
events = pd.read_csv("./data/events.csv", dtype={'device_id': np.str, 'event_id': np.str})
events = pd.merge(events, appEventsDF, how="left", on="event_id", left_index=True)

print "Aggregating cat cols..."
events_acc = events.groupby('device_id')[catCols.keys()].sum()
events_acc.reset_index(inplace=True)

events.longitude = [int(lo) if lo > 0 else None for lo in events.longitude]    
events.latitude = [int(lo) if lo > 0 else None for lo in events.latitude]
events.longitude = np.digitize(events.longitude, np.linspace(0,max(events.longitude),20))
events.latitude = np.digitize(events.latitude, np.linspace(0,max(events.latitude),20))    

events = handleTimestamp(events)

events_llt = events.groupby('device_id')[['longitude', 'latitude', 'dayofweek', 'hour']].median()
events_llt.reset_index(inplace=True)

events_acc = pd.merge(events_acc, events_llt, how="left", on="device_id", left_index=True)
print('Writing to file...')
events_acc.to_csv("eventApps.csv", index=False)
print('Done!')

Read Labels...
Processing:  shop
Processing:  fashion
Processing:  photo
Processing:  relative
Processing:  video
Processing:  church
Processing:  education
Processing:  investment
Processing:  risk
Processing:  wealth
Processing:  service
Processing:  readers
Processing:  travel
Processing:  music
Processing:  income
Processing:  health
Processing:  finance
Processing:  game
Processing:  p2p
Processing:  baby
Processing:  news
Processing:  bank
Processing:  pursue
Processing:  science
Processing:  industry
Processing:  im
Processing:  property
Processing:  medical
Merging with events...
App Events DF loaded...
Read events...
Aggregating cat cols...
Writing to file...
Done!


In [5]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn import cross_validation, metrics
import re

random.seed(2016)
    
def run_xgb(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 5
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(
            eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500
    early_stopping_rounds = 50
    test_size = 0.3

    #X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
                        train, train.group, test_size=0.3, random_state=0)
    
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
#     y_train = X_train[target]
#     y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, 
                    early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
    total = 0
    test_val = test['device_id'].values
    for i in range(len(test_val)):
        str1 = str(test_val[i])
        for j in range(12):
            str1 += ',' + str(prediction[i][j])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()



def read_train_test():

    events_acc = pd.read_csv("eventApps.csv", low_memory=False, 
                                      dtype= {'device_id': np.str})

    # Phone brand
    print('Read brands...')
    pbd = pd.read_csv("./data/phone_brand_device_model.csv", dtype={'device_id': np.str})
    pbd.drop_duplicates('device_id', keep='first', inplace=True)
    pbd = map_column(pbd, 'phone_brand')
    pbd = map_column(pbd, 'device_model')
    
    #Timestamp
    

    # Train
    print('Read train...')
    train = pd.read_csv("./data/gender_age_train.csv", dtype={'device_id': np.str})
    train = map_column(train, 'group')
    train = train.drop(['age'], axis=1)
    train = train.drop(['gender'], axis=1)
    train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
    train = pd.merge(train, events_acc, how='left', on='device_id', left_index=True)
    train.fillna(-1, inplace=True)
    
    # Test
    print('Read test...')
    test = pd.read_csv("./data/gender_age_test.csv", dtype={'device_id': np.str})
    test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
    test = pd.merge(test, events_acc, how='left', on='device_id', left_index=True)
    test.fillna(-1, inplace=True)
    
    
    # Features
    features = list(test.columns.values)
    #features = ['phone_brand','device_model']
    features.remove('device_id')
    #features.remove('event_id')
    #features.remove('timestamp')

    return train, test, features


train, test, features = read_train_test()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
test_prediction, score = run_xgb(train, test, features, 'group')

#create_submission(score, test, test_prediction)

Read brands...
Read train...
Read test...
('Length of train: ', 74645)
('Length of test: ', 112071)
Features [34]: ['baby', 'bank', 'church', 'dayofweek', 'device_model', 'education', 'fashion', 'finance', 'game', 'health', 'hour', 'im', 'income', 'industry', 'investment', 'latitude', 'longitude', 'medical', 'music', 'news', 'p2p', 'phone_brand', 'photo', 'property', 'pursue', 'readers', 'relative', 'risk', 'science', 'service', 'shop', 'travel', 'video', 'wealth']
XGBoost params. ETA: 0.1, MAX_DEPTH: 5, SUBSAMPLE: 0.7, COLSAMPLE_BY_TREE: 0.7
('Length train:', 52251)
('Length valid:', 22394)


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-mlogloss:2.470716	eval-mlogloss:2.473666
[1]	train-mlogloss:2.457588	eval-mlogloss:2.462728
[2]	train-mlogloss:2.445729	eval-mlogloss:2.453182
[3]	train-mlogloss:2.434854	eval-mlogloss:2.444403
[4]	train-mlogloss:2.425210	eval-mlogloss:2.436622
[5]	train-mlogloss:2.416497	eval-mlogloss:2.430084
[6]	train-mlogloss:2.407847	eval-mlogloss:2.423815
[7]	train-mlogloss:2.400530	eval-mlogloss:2.418463
[8]	train-mlogloss:2.393397	eval-mlogloss:2.413527
[9]	train-mlogloss:2.386638	eval-mlogloss:2.408726
[10]	train-mlogloss:2.380856	eval-mlogloss:2.404810
[11]	train-mlogloss:2.375119	eval-mlogloss:2.400925
[12]	train-mlogloss:2.369834	eval-mlogloss:2.397434
[13]	train-mlogloss:2.364846	eval-mlogloss:2.394450
[14]	train-mlogloss:2.360113	eval-mlogloss:2.391608
[15]	train-mlogloss:2.355510	eval-mlogloss:2.388810
[16]	train-mlogloss:2.351429	eval-mlogloss:2.386533
[17]	train-mlogloss:2.347353	eval-mlogloss:2.384270
[18]	train-mlog

Validating...
Predict test set...
Training time: 2.9 minutes


In [6]:
%qtconsole --ConsoleWidget.font_family="Menlo" --ConsoleWidget.font_size=14