In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from time import time
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

%matplotlib inline

In [2]:
# Seed NumPy's global (legacy) random state so that anything drawing from it
# after this cell produces the same sequence on every fresh run.
np.random.seed(seed=42)

In [3]:
# Read the raw click tables from disk.
# Only the test file's `id` column is forced to string dtype; the train file
# is read with pandas' default dtype inference.
test_clicks = pd.read_csv('raw_data/test.csv', dtype=dict(id='str'))
train_clicks = pd.read_csv('raw_data/train.csv')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,Lasso,LogisticRegressionCV,RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss,roc_auc_score
from lightgbm import LGBMClassifier

# Split boundaries on the `hour` column. The values look like YYMMDDHH
# timestamps (i.e. the dev window would be one calendar day) -- TODO confirm.
DEV_START_HOUR = 14103000  # first `hour` value assigned to the dev set (inclusive)
DEV_END_HOUR = 14103100    # end of the dev window (exclusive)

# Hold out the rows inside the dev window; everything earlier stays in train.
# Rows at or after DEV_END_HOUR end up in neither set.
dev_clicks = train_clicks[(train_clicks.hour >= DEV_START_HOUR) & (train_clicks.hour < DEV_END_HOUR)]
if dev_clicks.empty:
    # `train_clicks` is narrowed in place on the next statement, so running
    # this cell a second time (or loading data without the dev window) would
    # leave nothing to validate on. Fail here instead of after model fitting.
    raise ValueError(
        'dev split is empty: no rows with %d <= hour < %d in train_clicks '
        '(was this cell already run?)' % (DEV_START_HOUR, DEV_END_HOUR)
    )
train_clicks = train_clicks[train_clicks.hour < DEV_START_HOUR]

# Separate the target column from the features; `id` is not used as a feature.
y_train = train_clicks.click
X_train = train_clicks.drop(['id', 'click'], axis=1)

y_dev = dev_clicks.click
X_dev = dev_clicks.drop(['id', 'click'], axis=1)

X_test = test_clicks.drop(['id'], axis=1)

# Remember the split sizes before the frames are stacked and reassigned, so
# the positional slices below do not depend on variables being overwritten.
n_train, n_dev = len(X_train), len(X_dev)

# Encode train, dev and test together so that a given raw value maps to the
# same integer code in all three splits.
X_total = pd.concat([X_train, X_dev, X_test])

for c in X_total.columns:
    col_dtype = X_total[c].dtype
    # Text columns are `object` in older pandas but may be inferred as a
    # dedicated string dtype in newer releases; treat both as categorical text.
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        # factorize() returns (codes, uniques); only the integer codes are kept.
        X_total[c] = X_total[c].factorize()[0]

# Cut the stacked frame back into the three splits by position
# (concat keeps the original row labels, hence iloc rather than loc).
X_train = X_total.iloc[:n_train]
X_dev = X_total.iloc[n_train:n_train + n_dev]
X_test = X_total.iloc[n_train + n_dev:]


In [5]:
# Fit each candidate classifier, report train/dev log-loss and feature
# importances, and write the predicted test-set probabilities to
# output/<classifier type>.csv.
import os  # local import: only needed here to create the output folder

# Create the destination folder up front so a missing directory cannot make
# `to_csv` fail only after a model has already been fitted.
os.makedirs('output', exist_ok=True)

clf_types = ['rf', 'lgb']
for ct in clf_types:
    t = time()
    if ct == 'rf':
        clf = RandomForestClassifier(n_jobs=4)
        clf.fit(X_train, y_train)
    elif ct == 'lgb':
        clf = LGBMClassifier(n_jobs=4)
        # Every column of X_train is declared categorical here.
        # NOTE(review): that includes columns that were numeric in the raw
        # data (e.g. `hour`) -- confirm this is intended.
        clf.fit(X_train, y_train, categorical_feature=list(X_train.columns))
    else:
        # Guard against silently reusing a classifier from a previous iteration.
        raise ValueError('unknown classifier type: %r' % ct)

    print('fitting time cost %ds' % int(time() - t))

    # Column 1 of predict_proba is the second class in clf.classes_
    # (presumably click == 1 -- verify against the label encoding).
    train_loss = log_loss(y_train, clf.predict_proba(X_train)[:, 1])
    dev_loss = log_loss(y_dev, clf.predict_proba(X_dev)[:, 1])
    print('train score %.4f, dev score %.4f' % (train_loss, dev_loss))

    # Features ranked from most to least important according to the fitted model.
    print(sorted(zip(X_train.columns, clf.feature_importances_), key=lambda x: x[1], reverse=True))

    y_pred = clf.predict_proba(X_test)[:, 1]
    results = pd.DataFrame({'id': test_clicks.id, 'click': y_pred})
    results[['id', 'click']].to_csv('output/%s.csv' % ct, index=False)
    

fitting time cost 598s
train score 0.1459, dev score 1.0691
[('device_ip', 0.42035866330989363), ('hour', 0.16858141526249387), ('device_model', 0.15868151769375941), ('C14', 0.045293103320574936), ('device_id', 0.041472948230650529), ('site_id', 0.02922273138580575), ('site_domain', 0.0246481143817379), ('C20', 0.020446196144617682), ('C16', 0.015096023252558216), ('app_id', 0.01299426415418356), ('C21', 0.011171039600509467), ('C17', 0.0090214140536730401), ('C19', 0.0068732624717930801), ('site_category', 0.0059812673452500189), ('C18', 0.0053644521208280303), ('app_domain', 0.0050323904564475658), ('C15', 0.0044467952851785803), ('app_category', 0.0044278651541555377), ('device_conn_type', 0.0042969577606632786), ('banner_pos', 0.0033595773694107037), ('C1', 0.0016513824690226233), ('device_type', 0.0015786187767925768)]




fitting time cost 494s
train score 0.3939, dev score 0.4002
[('site_id', 483), ('app_id', 477), ('C17', 411), ('device_model', 398), ('site_domain', 316), ('C14', 228), ('device_ip', 196), ('hour', 131), ('app_domain', 82), ('C20', 76), ('device_id', 56), ('C21', 31), ('site_category', 30), ('app_category', 24), ('banner_pos', 16), ('C19', 13), ('C1', 12), ('device_conn_type', 12), ('C16', 6), ('C15', 2), ('device_type', 0), ('C18', 0)]
