In [1]:
import math
import numpy as np
import pandas as pd
import time
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split

# Clipping bound for predicted probabilities: loss_function clamps every
# probability into [eps, 1 - eps] so that math.log never receives 0.
eps = 1e-15

In [None]:
#loss function
def loss_function(y, pred):
    """Return the mean negative log-probability assigned to the true classes.

    y    -- indexable of class indices; ``y[i]`` selects an entry of ``pred[i]``.
    pred -- indexable of per-sample probability rows, read as ``pred[i][y[i]]``.

    Each selected probability is clamped into ``[eps, 1 - eps]`` (using the
    module-level ``eps``) before its logarithm is taken.  An empty ``y``
    raises ZeroDivisionError because the sum is divided by ``len(y)``.
    """
    n_samples = len(y)
    upper_bound = (1 - eps)
    log_sum = 0.
    for idx in range(n_samples):
        prob = pred[idx][y[idx]]
        # Upper clamp: replace only when the bound is strictly smaller.
        if upper_bound < prob:
            prob = upper_bound
        # Lower clamp: replace only when the bound is strictly larger.
        if eps > prob:
            prob = eps
        log_sum += math.log(prob)
    return -(log_sum / n_samples)

In [None]:
def getY(df, le):
    """Encode the ``TripType`` column of ``df`` with an already-fitted encoder.

    df -- DataFrame containing a ``TripType`` column.
    le -- object exposing ``transform(values)`` (e.g. the encoder returned by
          ``getLabeled``).

    Returns whatever ``le.transform`` returns for that column.
    """
    # Read from the ``df`` argument, not from a notebook-level ``df_y``
    # variable, so the function works on a fresh kernel and for any frame.
    return le.transform(df['TripType'])

In [54]:
def getLabeled(col):
    """Fit a LabelEncoder on ``col`` and return it with the encoded values.

    Missing entries are replaced by the string '-1' before fitting, so they
    are encoded as a class of their own.

    Returns a ``(encoder, encoded_values)`` tuple.
    """
    filled = col.fillna('-1')
    encoder = preprocessing.LabelEncoder()
    encoder.fit(filled)
    return encoder, encoder.transform(filled)

def getReverseLabled(col, le):
    """Translate encoded labels in ``col`` back to their original values.

    col -- encoded labels.
    le  -- fitted encoder exposing ``inverse_transform``.
    """
    original_labels = le.inverse_transform(col)
    return original_labels


In [56]:
#submission
def generateSubmission(df_features_test, model, classlabel, prefix):
    """Predict class probabilities, average them per visit and write them out.

    df_features_test -- feature frame; its index supplies the visit numbers.
    model            -- fitted classifier exposing ``predict_proba`` and
                        ``classes_``.
    classlabel       -- fitted label encoder used to decode ``model.classes_``
                        into the original trip-type values.
    prefix           -- path/file prefix for the generated files.

    Two files are written, both named ``<prefix>_<epoch-millis>``:
    ``.csv`` holds the mean predicted probability per VisitNumber (one
    ``TripType_<label>`` column per class) and ``.model`` holds
    ``str(model)``.  Nothing is returned.
    """
    y_test_pred = model.predict_proba(df_features_test)

    # model.classes_ contains encoded labels; decode them for the headers.
    class_names = getReverseLabled(model.classes_, classlabel)
    trip_columns = ['TripType_' + str(name) for name in class_names]

    sub_df = pd.DataFrame(y_test_pred, columns=trip_columns)
    # rename() returns a new Index, so the caller's index keeps its own name.
    sub_df.index = df_features_test.index.rename('VisitNumber')

    # Several rows can belong to one visit: average them.  Grouping by
    # ``level=`` names the index explicitly and leaves VisitNumber as the
    # index of the result, so no reset_index/set_index round trip is needed.
    df_count = sub_df.groupby(level='VisitNumber').mean()

    millis = int(round(time.time() * 1000))
    filename = '%s_%d' % (prefix, millis)
    df_count.to_csv(filename + '.csv')

    # Context manager guarantees the handle is flushed and closed.
    with open(filename + '.model', 'w') as model_file:
        model_file.write(str(model))

In [55]:
#get feature
def getFeaturesTrain(df):
    """Build the scan-count and weekday features, indexed by VisitNumber.

    df -- DataFrame with ``VisitNumber``, ``ScanCount`` and ``Weekday``
          columns; one output row is produced per input row.

    Returns a DataFrame containing ``ScanCount``, one ``Weekday_<day>``
    dummy column per weekday present in the data except ``Weekday_Friday``,
    and ``is_wknd`` (1 for Saturday/Sunday rows, otherwise 0).
    """
    # set_index() without inplace returns a new frame, so the column slices
    # taken from ``df`` are never modified in place.
    scan_count = df[['VisitNumber', 'ScanCount']].set_index('VisitNumber')
    weekday_col = df[['VisitNumber', 'Weekday']].set_index('VisitNumber')['Weekday']

    weekday = pd.get_dummies(weekday_col, prefix='Weekday')
    # Derive the weekend flag from the raw column so that data lacking a
    # Saturday or Sunday does not raise KeyError.  ``.values`` assigns
    # positionally, which is required because VisitNumber may repeat.
    weekday['is_wknd'] = weekday_col.isin(['Saturday', 'Sunday']).astype(int).values
    # Friday's dummy is dropped (presumably as the reference level); ignore
    # its absence instead of failing when no Friday row exists.
    weekday = weekday.drop('Weekday_Friday', axis=1, errors='ignore')

    df_features = pd.concat([scan_count, weekday], axis=1)

    return df_features

def getFeaturesTest(df):
    """Build the test-set features; delegates entirely to getFeaturesTrain."""
    test_features = getFeaturesTrain(df)
    return test_features


In [None]:
def processDescTrain(df, isUnique, vec=None):
    """Turn ``DepartmentDescription`` into bag-of-words features.

    df       -- DataFrame with a ``DepartmentDescription`` column; missing
                descriptions are treated as empty strings.  ``df`` itself is
                not modified.
    isUnique -- unused; kept so existing calls keep working.
    vec      -- optional already-fitted vectorizer.  When omitted, a new
                CountVectorizer is fitted on the descriptions.

    Returns ``(vec, features)`` when the vectorizer was fitted here, and just
    ``features`` when ``vec`` was supplied.  ``features`` shares ``df``'s
    index and holds one column per vocabulary term plus ``words_count``
    (number of space-separated pieces) and ``words_len`` (character length).
    """
    # Work on a filled copy instead of writing back into the caller's frame.
    descriptions = df.DepartmentDescription.fillna('')

    # Explicit None test: do not rely on the truthiness of the vectorizer.
    fitted_here = vec is None
    if fitted_here:
        vec = CountVectorizer()
        vec.fit(descriptions)
    wc = vec.transform(descriptions)

    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names().  Use whichever exists.
    feature_names = (getattr(vec, 'get_feature_names_out', None)
                     or vec.get_feature_names)()
    word_count_vector = pd.DataFrame(wc.toarray(), index=df.index,
                                     columns=feature_names)

    word_count_vector['words_count'] = descriptions.apply(lambda x: len(x.split(' ')))
    word_count_vector['words_len'] = descriptions.apply(lambda x: len(x))
    if fitted_here:
        return vec, word_count_vector
    return word_count_vector

In [None]:

def getCountVectorTrain(df_train,df_test, columnName, dictCount=1000):
    """Encode the most frequent values of ``columnName`` as count features.

    df_train   -- training frame with ``columnName`` and ``VisitNumber``.
    df_test    -- test frame; its most frequent values are added to the
                  dictionary as well.
    columnName -- numeric column to encode (values are formatted with %d).
    dictCount  -- how many top values to take from each of the two frames.

    Every training value is replaced by the token ``<columnName><value>`` if
    it is in the dictionary and by ``<columnName>other`` otherwise (missing
    values therefore become "other"); the tokens are then count-vectorized.

    Returns ``(topk_dict, vec, topk_vector)``: the set of kept values, the
    fitted CountVectorizer, and a dense DataFrame of counts indexed by the
    training ``VisitNumber``.
    """
    def _top_values(frame):
        # value_counts() orders by descending frequency, and slicing beyond
        # the end is safe, so no explicit length clamp is required.
        return frame[columnName].dropna().value_counts().iloc[0:dictCount].index

    topk_dict = set(_top_values(df_train)).union(set(_top_values(df_test)))

    topk_train = df_train[columnName].apply(lambda x: '%s%d'%(columnName, x) if x in topk_dict else '%sother'%(columnName))

    vec = CountVectorizer()
    # fit_transform yields the same matrix as fit followed by transform.
    wc = vec.fit_transform(topk_train)

    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names().  Use whichever exists.
    feature_names = (getattr(vec, 'get_feature_names_out', None)
                     or vec.get_feature_names)()
    topk_vector = pd.DataFrame(wc.toarray(), index=df_train['VisitNumber'],
                               columns=feature_names)

    return topk_dict, vec, topk_vector

def getCountVectorTest(df, columnName,topk_dict, vec):
    """Apply a fitted top-k dictionary and vectorizer to another frame.

    df         -- frame with ``columnName`` and ``VisitNumber`` columns.
    columnName -- numeric column to encode (values are formatted with %d).
    topk_dict  -- collection of values to keep; everything else, including
                  missing values, is mapped to the ``<columnName>other`` token.
    vec        -- fitted vectorizer exposing ``transform`` and a feature-name
                  accessor.

    Returns a dense DataFrame of token counts indexed by ``VisitNumber``.
    """
    topk = df[columnName].apply(lambda x: '%s%d'%(columnName, x) if x in topk_dict else '%sother'%(columnName))

    wc = vec.transform(topk)
    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names().  Use whichever exists.
    feature_names = (getattr(vec, 'get_feature_names_out', None)
                     or vec.get_feature_names)()
    topk_vector = pd.DataFrame(wc.toarray(), index=df['VisitNumber'],
                               columns=feature_names)
    return topk_vector
    

In [None]:
def generateTestTrainFeature(df_train, df_test):
    """Assemble the complete feature matrices for the train and test frames.

    Both frames need the ``VisitNumber``, ``ScanCount``, ``Weekday``,
    ``DepartmentDescription``, ``Upc`` and ``FinelineNumber`` columns.
    Vectorizers and top-k dictionaries are fitted while building the training
    features and then reused for the test features.

    Returns ``(df_features_train, df_features_test)``; each has one row per
    input row and is indexed by VisitNumber.
    """
    #generate train feature
    other_feature = getFeaturesTrain(df_train)
    vec_desc, desc_feature = processDescTrain(df_train, True)
    upc_dict, upc_vec, upc_feature = getCountVectorTrain(df_train,df_test, 'Upc')
    fileno_dict, fileno_vec, fileno_feature = getCountVectorTrain(df_train, df_test, 'FinelineNumber')
    # The description features are indexed by the frame's own row index while
    # every other block is indexed by VisitNumber.  All blocks have one row
    # per input row in the same order, so relabel positionally to let the
    # column-wise concat line the rows up.
    desc_feature.index = other_feature.index
    df_features_train = pd.concat([other_feature, desc_feature, upc_feature, fileno_feature], axis=1)

    #generate test feature
    other_feature_test = getFeaturesTest(df_test)
    # Passing the fitted vectorizer makes processDescTrain skip fitting and
    # return only the feature frame, which is exactly the test-time behaviour.
    desc_feature_test = processDescTrain(df_test, True, vec_desc)
    upc_feature_test = getCountVectorTest(df_test, 'Upc', upc_dict, upc_vec)
    fileno_feature_test = getCountVectorTest(df_test, 'FinelineNumber', fileno_dict, fileno_vec)
    desc_feature_test.index = other_feature_test.index
    df_features_test = pd.concat([other_feature_test, desc_feature_test, upc_feature_test, fileno_feature_test], axis=1)

    return df_features_train, df_features_test