In [1]:
import gc
import pandas as pd
import numpy as np
from contextlib import contextmanager
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
from sklearn.preprocessing import MinMaxScaler, Binarizer
import xlearn as xl

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Render matplotlib figures inside the notebook.
%matplotlib inline
# NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib
# (see this cell's output). Nothing visible in this notebook appears to rely on those
# star-imported names — confirm before removing.
%pylab inline

# Show up to 80 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 80)

def downcast_dtypes(df):
    '''
        Shrinks 64-bit numeric columns of the dataframe in place and returns it:
                `float64` type to `float32`
                `int64`   type to `int32` (only when every value fits)

        An `int64` column that holds a value outside the `int32` range is left
        as `int64`: casting it would overflow (wrap around or raise, depending
        on the numpy/pandas version) and corrupt the data.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to shrink. It is modified in place.

        Returns
        -------
        pandas.DataFrame
            The same `df` object that was passed in.
    '''

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64"]

    # Downcast floats (precision loss is the accepted trade-off here)
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)

    # Downcast integers, skipping columns whose values do not fit into int32
    if int_cols:
        limits = np.iinfo(np.int32)
        if len(df):
            lows = df[int_cols].min()
            highs = df[int_cols].max()
            int_cols = [c for c in int_cols
                        if lows[c] >= limits.min and highs[c] <= limits.max]
        # With zero rows there is nothing that could overflow, so all columns qualify.
        if int_cols:
            df[int_cols] = df[int_cols].astype(np.int32)

    return df


# Random seed; passed as `random_state` to train_test_split further down.
SEED = 17

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Descriptive names for some of the generic `fieldN` columns.
cols_to_rename = dict(
    field0='days_from_last',
    field2='month_buy',
    field3='month_depart',
    field4='order_of_buy',
    field9='ticket_child_1y',
    field11='hour_depart',
    field15='total_tickets',
    field16='days_before_depart',
    field18='dow_buy',
    field20='dow_depart',
    field21='year',
    field24='ticket_adult',
    field28='ticket_child_4y',
    field29='quarter',
)

# Read the training set and shrink its 64-bit numeric columns.
df = downcast_dtypes(pd.read_csv('onetwotrip_challenge_train.csv'))
gc.collect()

# Apply the descriptive column names.
df = df.rename(columns=cols_to_rename)

In [3]:
# Hold-out split stratified on `goal1`; the `orderid` and `userid` columns are dropped first.
train, test = train_test_split(
    df.drop(columns=['orderid', 'userid']),
    stratify=df['goal1'],
    random_state=SEED,
)

In [4]:
# Separate the hold-out target column from the remaining columns.
Xtest = test.drop(columns='goal1').copy()
ytest = test['goal1']

In [5]:
def convert_to_ffm(df, type, numerics, categories, features, catcodes=None):
    """
    Write `df` to "<type>_ffm.txt" in libffm text format, one line per row.

    Every line has the form "<label> <field>:<index>:<value> ...", where
    <label> is int(row['goal1']). Fields are numbered in the order `numerics`
    followed by `categories`:

    * a numerical field j is written as "j:j:<value>";
    * a categorical field j is written as "j:<code>:1", where <code> is an
      integer assigned to each distinct (field, value) pair the first time it
      is seen, counting up from len(numerics) + 1.

    Missing categorical values (NaN/None) all share one code per field.
    Previously each NaN row received a brand-new code, because NaN never
    compares equal to a stored key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'goal1' and every column named in `numerics` and
        `categories`.
    type : str
        Prefix of the output file name. (The name shadows the builtin but is
        kept so keyword callers keep working.)
    numerics : list
        Names of the numerical columns.
    categories : list
        Names of the categorical columns.
    features : list
        Unused; kept for backward compatibility.
    catcodes : dict, optional
        {field: {value: code}} table, updated in place. Pass the same dict to
        several calls to give identical category values identical codes in
        every file written. When omitted, a fresh table is used for this call
        only (the original behaviour).

    Returns
    -------
    None
    """
    # Flagging categorical and numerical fields (0 = numerical, 1 = categorical)
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1

    if catcodes is None:
        catcodes = {}
    # Continue numbering after the highest code already handed out, so a reused
    # table never assigns the same code twice.
    used_codes = [code for codes in catcodes.values() for code in codes.values()]
    currentcode = max([len(numerics)] + used_codes)

    with open(str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for r in range(df.shape[0]):
            datarow = df.iloc[r].to_dict()
            datastring = str(int(datarow['goal1']))  # Set Target Variable here

            for i, x in enumerate(catdict):
                value = datarow[x]

                # For numerical fields, the field number doubles as the feature index
                if catdict[x] == 0:
                    datastring += " " + str(i) + ":" + str(i) + ":" + str(value)
                    continue

                # All missing values of a field collapse onto the single key None
                key = None if pd.isna(value) else value
                field_codes = catcodes.setdefault(x, {})
                if key not in field_codes:
                    currentcode += 1
                    field_codes[key] = currentcode  # encoding the feature

                datastring += " " + str(i) + ":" + str(int(field_codes[key])) + ":1"

            text_file.write(datastring + '\n')

In [6]:
# Columns handed to convert_to_ffm as categorical fields.
cat_names = [
    'field5', 'field7', 'field8', 'field10', 'field19', 'year',
] + ['indicator_goal2%d' % k for k in range(1, 6)]

# Columns handed to convert_to_ffm as numerical fields.
cont_names = [
    'days_from_last', 'order_of_buy',
    'field1', 'field6', 'field12', 'field13', 'field14',
    'total_tickets', 'days_before_depart',
    'field17', 'field22', 'field25', 'field26',
    'ticket_child_4y', 'ticket_child_1y', 'ticket_adult',
]

In [7]:
# Write train_ffm.txt and test_ffm.txt from the two splits.
# NOTE(review): each call here starts from an empty category-code table, so codes are
# assigned by order of first appearance within each file — verify that the two files
# end up with matching codes for the same category values.
for split_name, split_df in (('train', train), ('test', test)):
    convert_to_ffm(df=split_df, type=split_name, numerics=cont_names, categories=cat_names,
                   features=split_df.drop('goal1', axis=1).columns.tolist())

In [15]:
# NOTE(review): presumably a smoke test that the xlearn package is importable and
# working — confirm; no output was saved for this cell.
xl.hello()

In [17]:
# Build the xlearn FFM model object.
ffm_model = xl.create_ffm()

# Point the model at the files written by convert_to_ffm for the two splits.
ffm_model.setTrain("train_ffm.txt")
ffm_model.setValidate("test_ffm.txt")
# NOTE(review): this reads "model.out", the same path fit() writes to below. On a fresh
# run that file may not exist yet, and on a re-run it is the previous run's model —
# confirm this warm start is intended.
ffm_model.setPreModel("model.out")
# Settings shared by cv() and fit(); 'fold' is presumably only used by cv() — TODO confirm.
param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'auc', 'fold':5}

# Cross-validation run with the settings above.
ffm_model.cv(param)

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

In [9]:
# Load the challenge test set; it is later converted to valid_ffm.txt and scored by the model.
df_test = pd.read_csv('onetwotrip_challenge_test.csv')

In [10]:
# Descriptive names for some of the generic `fieldN` columns
# (same mapping as the one applied to the training set).
cols_to_rename = dict(
    field0='days_from_last',
    field2='month_buy',
    field3='month_depart',
    field4='order_of_buy',
    field9='ticket_child_1y',
    field11='hour_depart',
    field15='total_tickets',
    field16='days_before_depart',
    field18='dow_buy',
    field20='dow_depart',
    field21='year',
    field24='ticket_adult',
    field28='ticket_child_4y',
    field29='quarter',
)

# Shrink the 64-bit numeric columns; note that `df` now refers to the test set.
df = downcast_dtypes(df_test)
del df_test
gc.collect()

# Apply the descriptive column names.
df = df.rename(columns=cols_to_rename)

In [11]:
# Columns handed to convert_to_ffm as categorical fields.
cat_names = [
    'field5', 'field7', 'field8', 'field10', 'field19', 'year',
] + ['indicator_goal2%d' % k for k in range(1, 6)]

# Columns handed to convert_to_ffm as numerical fields.
cont_names = [
    'days_from_last', 'order_of_buy',
    'field1', 'field6', 'field12', 'field13', 'field14',
    'total_tickets', 'days_before_depart',
    'field17', 'field22', 'field25', 'field26',
    'ticket_child_4y', 'ticket_child_1y', 'ticket_adult',
]

In [12]:
def convert_to_ffm(df, type, numerics, categories, features, catcodes=None):
    """
    Write `df` to "<type>_ffm.txt" in libffm text format WITHOUT a label column.

    Every line has the form " <field>:<index>:<value> ..." — it starts with a
    space because no label precedes the first field. Fields are numbered in
    the order `numerics` followed by `categories`:

    * a numerical field j is written as "j:j:<value>";
    * a categorical field j is written as "j:<code>:1", where <code> is an
      integer assigned to each distinct (field, value) pair the first time it
      is seen, counting up from len(numerics) + 1.

    Missing categorical values (NaN/None) all share one code per field.
    Previously each NaN row received a brand-new code, because NaN never
    compares equal to a stored key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in `numerics` and `categories`.
    type : str
        Prefix of the output file name. (The name shadows the builtin but is
        kept so keyword callers keep working.)
    numerics : list
        Names of the numerical columns.
    categories : list
        Names of the categorical columns.
    features : list
        Unused; kept for backward compatibility.
    catcodes : dict, optional
        {field: {value: code}} table, updated in place. Pass a table that was
        filled while encoding the training data so identical category values
        get identical codes here. When omitted, a fresh table is used for this
        call only (the original behaviour).

    Returns
    -------
    None
    """
    # Flagging categorical and numerical fields (0 = numerical, 1 = categorical)
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1

    if catcodes is None:
        catcodes = {}
    # Continue numbering after the highest code already handed out, so a reused
    # table never assigns the same code twice.
    used_codes = [code for codes in catcodes.values() for code in codes.values()]
    currentcode = max([len(numerics)] + used_codes)

    with open(str(type) + "_ffm.txt", "w") as text_file:

        # Looping over rows to convert each row to libffm format
        for r in range(df.shape[0]):
            datarow = df.iloc[r].to_dict()
            datastring = ""  # no target variable is written in this variant

            for i, x in enumerate(catdict):
                value = datarow[x]

                # For numerical fields, the field number doubles as the feature index
                if catdict[x] == 0:
                    datastring += " " + str(i) + ":" + str(i) + ":" + str(value)
                    continue

                # All missing values of a field collapse onto the single key None
                key = None if pd.isna(value) else value
                field_codes = catcodes.setdefault(x, {})
                if key not in field_codes:
                    currentcode += 1
                    field_codes[key] = currentcode  # encoding the feature

                datastring += " " + str(i) + ":" + str(int(field_codes[key])) + ":1"

            text_file.write(datastring + '\n')

In [13]:
# Write valid_ffm.txt from the renamed test set.
# NOTE(review): this call starts from an empty category-code table, so its codes are
# assigned independently of those in train_ffm.txt — verify they line up.
convert_to_ffm(
    df=df,
    type='valid',
    numerics=cont_names,
    categories=cat_names,
    features=list(df.columns),
)

In [18]:
# Prediction task
# NOTE(review): valid_ffm.txt was written by its own convert_to_ffm call, which builds a
# separate category-code table — check that its codes match the ones the model was
# trained on before trusting these predictions.
ffm_model.setTest("valid_ffm.txt") # Test data
ffm_model.setSigmoid() # Convert output to 0-1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")