In [2]:
import keras
from keras.layers.core import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils.np_utils import to_categorical
import util
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pylab as plt
import h5py
%matplotlib inline
from sklearn.externals import joblib

seed = 25
random.seed(seed)
np.random.seed(seed)

In [3]:
def print_metrics(qid,preds,truth):

    print("ndcg10: {}".format(util.get_ndcg(qid,preds,truth,k=10)))
    print("ndcg5: {}".format(util.get_ndcg(qid,preds,truth,k=5)))
    print("map: {}".format(util.get_mapk(qid, preds, truth)))
    print("f1-micro: {}".format(metrics.f1_score(y_true=truth,y_pred=preds,average="micro")))
    print("err@10: {}".format(util.get_err(qid,preds,truth,k=10)))
    print("err@5: {}".format(util.get_err(qid,preds,truth,k=5)))

## Data Preparation

In [4]:
#load data

#small
train_small_df = pd.read_csv("../train_set_small_cleaned.csv")
test_small_df = pd.read_csv("../test_set_small_cleaned.csv")

#large
train_large_df = pd.read_csv("../train_set_large_cleaned.csv")
test_large_df = pd.read_csv("../test_set_large_cleaned.csv")

#all
train_all_df= pd.read_csv("../train_cleaned.csv")
test_all_df= pd.read_csv("../test_cleaned.csv")

train_small_df.drop("Unnamed: 0",axis=1,inplace=True)
train_large_df.drop("Unnamed: 0",axis=1,inplace=True)
train_all_df.drop("Unnamed: 0",axis=1,inplace=True)
test_small_df.drop("Unnamed: 0",axis=1,inplace=True)
test_large_df.drop("Unnamed: 0",axis=1,inplace=True)
test_all_df.drop("Unnamed: 0",axis=1,inplace=True)


#Prepare data for training
#define features
features = [x for x in train_small_df.columns if x not in ["rel","qid"]]
target =["rel"]

#scale features
train_small_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(train_small_df[features])
test_small_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(test_small_df[features])

train_large_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(train_large_df[features])
test_large_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(test_large_df[features])

train_all_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(train_all_df[features])
test_all_df[features] = StandardScaler(with_mean=0,with_std=1).fit_transform(test_all_df[features])
#reshape
X_train_small = np.array(train_small_df[features])
y_train_small = np.array(train_small_df[target])

X_train_large = np.array(train_large_df[features])
y_train_large = np.array(train_large_df[target])

X_train_all = np.array(train_all_df[features])
y_train_all = np.array(train_all_df[target])

X_test_small = np.array(test_small_df[features])
y_test_small = np.array(test_small_df[target])

X_test_large = np.array(test_large_df[features])
y_test_large = np.array(test_large_df[target])

X_test_all = np.array(test_all_df[features])
y_test_all = np.array(test_all_df[target])



c, r = y_train_small.shape
y_train_small = y_train_small.reshape(c,)

c, r = y_test_small.shape
y_test_small = y_test_small.reshape(c,)

c, r = y_train_large.shape
y_train_large = y_train_large.reshape(c,)

c, r = y_test_large.shape
y_test_large = y_test_large.reshape(c,)

c, r = y_train_all.shape
y_train_all = y_train_all.reshape(c,)

c, r = y_test_all.shape
y_test_all = y_test_all.reshape(c,)

## xgboost

In [5]:
#===========Find best xgb model=============
action = "use_tuned_params"   #possible actions: "use_tuned_params", "tune_params"


#tune the hyperparameters | if false the already tuned parameters will be used
if action == "tune_params":
    n_estimators_range = [100,200,400]
    learning_rate_range = [0.0001,0.01,0.1,1]
    max_depth_range = [3,4,5,6,7]#list(range(3,10,2))
    min_child_weight_range = [1,2,3,4,5,6]#[1,2,3,4,5,6]
    scale_pos_weight_range = [0, 0.5, 1]
    subsample_range = [0.5, 0.6 ,0.7]
    colsample_bytree_range = [0.5, 0.6, 0.7]
    reg_alpha_range = [105, 108,110,112, 115]


    tuned_parameters = [{'max_depth': max_depth_range,"learning_rate":learning_rate_range,"n_estimators":n_estimators_range,
                         "gamma":gamma_range,"min_child_weight":min_child_weight_range,
                        "subsample":subsample_range,"colsample_bytree": colsample_bytree_range,
                         "scale_pos_weight": scale_pos_weight_range,"reg_alpha":reg_alpha_range}]

    scores=["f1_micro"]
    for score in scores:
        xgb_model = GridSearchCV(XGBClassifier(objective="multi:softmax",nthread=-1),param_grid=tuned_parameters,scoring=score,cv=3)
        xgb_model.fit(X_train_small, y_train_small)

        print("parameters selected: ",xgb_model.best_params_)
        print(" ")
        print("Train set:")
        print(score+":",xgb_model.best_score_)
        print(" ")
    #------------------------------------------------------------

#use tuned parameters    
elif action ==  "use_tuned_params":    
    
    n_estimators_range = [200]
    learning_rate_range = [0.1]
    max_depth_range = [3]
    min_child_weight_range = [5]
    gamma_range = [0]
    scale_pos_weight_range = [0]
    subsample_range = [0.6]
    colsample_bytree_range = [0.7]
    reg_alpha_range = [110]

#define best found model tuned on the small dataset
    xgb_model = XGBClassifier( objective="multi:softmax",

                        n_estimators=n_estimators_range[0], 
                         learning_rate=learning_rate_range[0],
                        reg_alpha=reg_alpha_range[0],
                        gamma= gamma_range[0],
                        subsample= subsample_range[0],
                        min_child_weight=min_child_weight_range[0],
                        colsample_bytree=colsample_bytree_range[0],
                        max_depth=max_depth_range[0],
                        scale_pos_weight= gamma_range[0])

### XGBoost model - trained on small dataset

In [6]:
load_pretrained_model = True

if load_pretrained_model:
    
    filename_small = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_small.joblib.pkl"
    xgboost_small = joblib.load(filename_small)
    print_metrics(test_small_df.qid,xgboost_small.predict(X_test_small),test_small_df.rel)
    
else:  
    xgb_model.fit(X_train_small,y_train_small)
    filename_small = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_small.joblib.pkl"
    _ = joblib.dump(xgb_model, filename_small, compress=9)
    
    xgboost_small = joblib.load(filename_small)
    print_metrics(test_small_df.qid,xgboost_small.predict(X_test_small),test_small_df.rel)

ndcg10: 0.8246824680688686
ndcg5: 0.8235922419976897
map: 0.6576216627533964
f1-micro: 0.58
err@10: 0.32991239686000884
err@5: 0.3078691262737157


### XGBoost model - trained on large dataset

In [7]:
load_pretrained_model = True

if load_pretrained_model:
    filename_large = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_large.joblib.pkl"
    xgboost_large = joblib.load(filename_large)
    print_metrics(test_large_df.qid,xgboost_large.predict(X_test_large),test_large_df.rel)

else:
    xgb_model.fit(X_train_large,y_train_large)
    filename_large = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_large.joblib.pkl"
    _ = joblib.dump(xgb_model, filename_large, compress=9)

    xgboost_large = joblib.load(filename_large)
    print_metrics(test_large_df.qid,xgboost_large.predict(X_test_large),test_large_df.rel)

ndcg10: 0.8750850708775263
ndcg5: 0.8805959601528669
map: 0.6854773756434743
f1-micro: 0.57875
err@10: 0.4449925150616616
err@5: 0.42641241170365807


### XGBoost model - trained on all data

In [8]:
load_pretrained_model = True

if load_pretrained_model:
    filename_all = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_all.joblib.pkl"
    xgboost_all = joblib.load(filename_all)
    print_metrics(test_all_df.qid,xgboost_all.predict(X_test_all),test_all_df.rel)

else:
    xgb.fit(X_train_all,y_train_all)
    filename_all = "/home/andreas/Desktop/irdm/models/xgboost-trained_on_all.joblib.pkl"
    _ = joblib.dump(xgb, filename_all, compress=9)

    xgboost_all = joblib.load(filename_all)
    print_metrics(test_all_df.qid,xgboost_all.predict(X_test_all),test_all_df.rel)

ndcg10: 0.7433620383539687
ndcg5: 0.7095928239953582
map: 0.6885408057821056
f1-micro: 0.5612141387291374
err@10: 0.4601015817360638
err@5: 0.44891153947187246


## Neural Network

In [9]:
#prepare data for neural network
y_train_small = to_categorical(y_train_small, num_classes=None)
y_test_small = to_categorical(y_test_small, num_classes=None)

y_train_large = to_categorical(y_train_large, num_classes=None)
y_test_large = to_categorical(y_test_large, num_classes=None)

y_train_all = to_categorical(y_train_all, num_classes=None)
y_test_all = to_categorical(y_test_all, num_classes=None)

In [10]:
#Model
epochs = 20

model=keras.models.Sequential()
model.add(Dense(activation="relu",input_dim=136,units=700, kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.1))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=600,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=450,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=300,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.25))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=250,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.25))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=100,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.2))
model.add(BatchNormalization())
model.add(Dense(activation="relu",units=45,kernel_initializer="TruncatedNormal"))
model.add(Dropout(rate=0.1))
model.add(BatchNormalization())
model.add(Dense(activation="softmax",units=5,kernel_initializer="TruncatedNormal"))

    
adam = keras.optimizers.SGD(nesterov=True)
#adam = keras.optimizers.adam()
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics= ["accuracy"])

reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=1, min_lr=0.001)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',patience=4,verbose=1)

### NN model - trained on small dataset

In [11]:
load_pretrained_model = True

if load_pretrained_model:
    filename_small_nn = "/home/andreas/Desktop/irdm/models/nn-small.h5"
    nn_small = keras.models.load_model(filepath=filename_small_nn)
    print_metrics(test_small_df.qid,nn_small.predict_classes(X_test_small,verbose=0),test_small_df.rel)

else:

    model.fit(X_train_small,y_train_small,batch_size=128,epochs=epochs,\
              validation_split=0,shuffle=True,validation_data=(X_test_small,y_test_small),\
              verbose=2,callbacks=[early_stopping])    

    filename_small_nn = "/home/andreas/Desktop/irdm/models/nn-small.h5"
    model.save(filepath=filename_small_nn)

    nn_small = keras.models.load_model(filepath=filename_small_nn)
    print_metrics(test_small_df.qid,nn_small.predict_classes(X_test_small,verbose=0),test_small_df.rel)

ndcg10: 0.7705191639381013
ndcg5: 0.7523945903295363
map: 0.6516980250409213
f1-micro: 0.58625
err@10: 0.2554860929443916
err@5: 0.24656176957935208


### NN model - trained on large dataset

In [12]:
load_pretrained_model = True

if load_pretrained_model:
    filename_large_nn = "/home/andreas/Desktop/irdm/models/nn-large.h5"
    nn_large = keras.models.load_model(filepath=filename_large_nn)
    print_metrics(test_large_df.qid,nn_large.predict_classes(X_test_large,verbose=0),test_large_df.rel)

else:

    model.fit(X_train_large,y_train_large,batch_size=128,epochs=epochs,\
              validation_split=0,shuffle=True,validation_data=(X_test_large,y_test_large),\
              verbose=2,callbacks=[early_stopping])    

    filename_large_nn = "/home/andreas/Desktop/irdm/models/nn-large.h5"
    model.save(filepath=filename_large_nn)

    nn_large = keras.models.load_model(filepath=filename_large_nn)
    print_metrics(test_large_df.qid,nn_large.predict_classes(X_test_large,verbose=0),test_large_df.rel)

ndcg10: 0.8404642908974927
ndcg5: 0.8429107121986785
map: 0.6924643190012074
f1-micro: 0.59125
err@10: 0.43649615504114475
err@5: 0.4218199224697093


### NN model - trained on all data

In [13]:
load_pretrained_model = True

if load_pretrained_model:
    filename_all_nn = "/home/andreas/Desktop/irdm/models/nn-all.h5"
    nn_all = keras.models.load_model(filepath=filename_all_nn)
    print_metrics(test_all_df.qid,nn_all.predict_classes(X_test_all,verbose=0),test_all_df.rel)

else:

    model.fit(X_train_all,y_train_all,batch_size=128,epochs=epochs,\
              validation_split=0,shuffle=True,validation_data=(X_test_all,y_test_all),\
              verbose=0,callbacks=[early_stopping])    

    filename_all_nn = "/home/andreas/Desktop/irdm/models/nn-all.h5"
    model.save(filepath=filename_all_nn)

    nn_all = keras.models.load_model(filepath=filename_all_nn)
    print_metrics(test_all_df.qid,nn_all.predict_classes(X_test_all,verbose=0),test_all_df.rel)

ndcg10: 0.8085064464336663
ndcg5: 0.7833719857168615
map: 0.6912292367507209
f1-micro: 0.5560303244852415
err@10: 0.43087259492275753
err@5: 0.41675862303028366
