In [1]:
import math
import numpy as np
import pandas as pd
import time
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split

# Clipping bound used by loss_function: probabilities are clamped to
# [eps, 1 - eps] before taking the logarithm.
eps = 1e-15

In [None]:
#loss function
def loss_function(y, pred):
    """Return the mean negative log-probability assigned to the true labels.

    Parameters
    ----------
    y : indexable of labels; ``y[i]`` is used as the column index into
        ``pred[i]``.
    pred : indexable of per-sample probability rows.

    Returns
    -------
    float
        ``-(sum_i log(clip(pred[i][y[i]]))) / len(y)`` where each probability
        is clipped to ``[eps, 1 - eps]`` so the logarithm stays finite.
    """
    n_samples = len(y)
    upper = 1 - eps
    log_sum = 0.
    for i in range(n_samples):
        prob = pred[i][y[i]]
        # Clamp into [eps, 1 - eps] before taking the log.
        clipped = max(min(prob, upper), eps)
        log_sum += math.log(clipped)
    return -(log_sum / n_samples)

In [None]:
def getY(df, le):
    """Return the encoded TripType of every visit.

    The first ``TripType`` value of each ``VisitNumber`` group is taken (the
    result is ordered by ``VisitNumber``) and passed through ``le.transform``.

    Parameters
    ----------
    df : DataFrame with ``VisitNumber`` and ``TripType`` columns.
    le : fitted encoder exposing ``transform``.

    Returns
    -------
    Whatever ``le.transform`` returns for the per-visit TripType series.
    """
    # groupby already indexes the result by VisitNumber, so no re-indexing
    # is required before handing the column to the encoder.
    trip_type_per_visit = df.groupby('VisitNumber')['TripType'].first()
    return le.transform(trip_type_per_visit)

In [54]:
def getLabeled(col):
    """Label-encode a column, treating missing values as the string '-1'.

    Parameters
    ----------
    col : pandas Series to encode.

    Returns
    -------
    (encoder, labeled)
        The fitted ``LabelEncoder`` and the encoded values of ``col``.
    """
    filled = col.fillna('-1')
    # fit() returns the encoder itself, so it can be created and fitted at once.
    encoder = preprocessing.LabelEncoder().fit(filled)
    return encoder, encoder.transform(filled)

def getReverseLabled(col, le):
    """Map encoded labels in ``col`` back to their original values via ``le``."""
    original_values = le.inverse_transform(col)
    return original_values

def getTopk(col, k=1000):
    """Keep the ``k`` most frequent values of ``col``; replace the rest by 'other'.

    Parameters
    ----------
    col : pandas Series.
    k : number of most frequent (non-null) values to keep.

    Returns
    -------
    (topk, top)
        ``topk`` is the index of the kept values, most frequent first;
        ``top`` is ``col`` with every value outside ``topk`` replaced by the
        string 'other'.
    """
    # value_counts sorts by descending frequency, so the head is the top-k.
    topk = col.dropna().value_counts().index[:k]

    def keep_or_other(value):
        if value in topk:
            return value
        return 'other'

    top = col.apply(keep_or_other)
    return topk, top

In [None]:
def getTfIDF(col):
    """Vectorise a text column into normalised term frequencies.

    The column is tokenised by word with a vocabulary capped at 1000 features,
    then passed through ``TfidfTransformer(use_idf=False)`` -- i.e. no inverse
    document frequency weighting is applied.

    Parameters
    ----------
    col : iterable of text documents.

    Returns
    -------
    The matrix produced by the fitted ``TfidfTransformer``.
    """
    vectorizer_options = dict(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=1000,
    )
    term_counts = CountVectorizer(**vectorizer_options).fit_transform(col)

    tf_transformer = TfidfTransformer(use_idf=False).fit(term_counts)
    return tf_transformer.transform(term_counts)

In [56]:
#submission
def generateSubmission(df_features_test, model, classlabel, prefix):
    """Predict class probabilities and write a submission CSV plus a model dump.

    Two files are written, both named ``<prefix>_<epoch-millis>``:
    ``.csv`` holds one probability column per class (``TripType_<label>``),
    indexed by ``VisitNumber``; ``.model`` holds ``str(model)``.

    Parameters
    ----------
    df_features_test : DataFrame of test features; its index supplies the
        row labels of the submission.
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    classlabel : encoder used to map ``model.classes_`` back to the original
        labels (passed to ``getReverseLabled``).
    prefix : filename prefix for both output files.
    """
    y_test_pred = model.predict_proba(df_features_test)

    # model.classes_ holds encoded labels; recover the original ones for the
    # column headers.
    trip_types = getReverseLabled(model.classes_, classlabel)
    trip_columns = ['TripType_' + str(trip_type) for trip_type in trip_types]

    # predict_proba returns one row per input row, in input order, so the
    # feature index is reused as-is. Sorting/deduplicating it (np.unique)
    # would mislabel rows whenever the index is not already sorted and unique.
    sub_df = pd.DataFrame(y_test_pred, columns=trip_columns,
                          index=df_features_test.index)
    sub_df.index.name = 'VisitNumber'

    millis = int(round(time.time() * 1000))
    filename = '%s_%d' % (prefix, millis)
    sub_df.to_csv(filename + '.csv')

    # Context manager guarantees the description is flushed and the handle
    # closed even if str(model) raises.
    with open(filename + '.model', 'w') as modelFile:
        modelFile.write(str(model))

In [55]:
#get feature
def getFeaturesTrain(df):
    """Build one row of basic features per visit.

    Parameters
    ----------
    df : DataFrame with ``VisitNumber``, ``ScanCount`` and ``Weekday`` columns,
        one row per scanned item.

    Returns
    -------
    DataFrame indexed by ``VisitNumber`` with, in order:
    ``ScanCount`` (sum over the visit), one ``Weekday_<day>`` dummy column per
    weekday present in ``df`` except Friday, ``is_wknd`` (1 when the visit's
    weekday is Saturday or Sunday, else 0) and ``itemCount`` (number of rows
    in the visit).
    """
    visits = df.groupby('VisitNumber')

    df_ScanCount = visits[['ScanCount']].sum()

    #weekday
    visit_weekday = visits['Weekday'].first()
    weekday = pd.get_dummies(visit_weekday, prefix='Weekday')
    # Derive the weekend flag from the raw weekday rather than from the dummy
    # columns, which do not exist when a day never occurs in df.
    weekday['is_wknd'] = visit_weekday.isin(['Saturday', 'Sunday']).astype(int)
    # Friday is dropped as the reference category; it may legitimately be
    # absent from small or filtered data sets.
    if 'Weekday_Friday' in weekday.columns:
        del weekday['Weekday_Friday']

    #item count per visit
    df_count = visits.size()
    df_count.name = 'itemCount'

    df_features = pd.concat([df_ScanCount, weekday, df_count], axis=1)

    return df_features

def getFeaturesTest(df):
    """Build per-visit features for test data.

    Delegates to ``getFeaturesTrain``: the same computation is applied to
    ``df`` and its result returned unchanged.
    """
    test_features = getFeaturesTrain(df)
    return test_features


In [None]:
def _descWordFeatures(df, isUnique, vec=None):
    """Shared implementation of processDescTrain / processDescTest.

    Joins the ``DepartmentDescription`` values of every visit into one
    space-separated string, counts words with ``vec`` (a new
    ``CountVectorizer`` is fitted when ``vec`` is None) and appends the
    ``words_count`` and ``words_len`` columns.

    Returns ``(vec, word_count_vector)``.
    """
    # fillna returns a new Series, so the caller's DataFrame is not modified.
    descriptions = df['DepartmentDescription'].fillna('')
    per_visit = descriptions.groupby(df['VisitNumber'])
    if isUnique:
        # np.unique also sorts the descriptions within a visit.
        words = per_visit.apply(lambda s: ' '.join(np.unique(list(s))))
    else:
        words = per_visit.apply(lambda s: ' '.join(list(s)))

    if vec is None:
        vec = CountVectorizer()
        vec.fit(words)
    wc = vec.transform(words)
    word_count_vector = pd.DataFrame(wc.toarray(), index=words.index,
                                     columns=vec.get_feature_names())

    # Number of space-separated tokens; an empty string still counts as 1
    # because ''.split(' ') == [''].
    word_count_vector['words_count'] = words.apply(lambda x: len(x.split(' ')))
    # Character length of the joined description string.
    word_count_vector['words_len'] = words.apply(lambda x: len(x))

    return vec, word_count_vector


def processDescTrain(df, isUnique):
    """Fit a word-count vectorizer on per-visit department descriptions.

    Parameters
    ----------
    df : DataFrame with ``VisitNumber`` and ``DepartmentDescription`` columns.
        It is not modified; missing descriptions are treated as ''.
    isUnique : when true, each visit's descriptions are de-duplicated (and
        sorted) before being joined; otherwise every row contributes.

    Returns
    -------
    (vec, word_count_vector)
        The fitted ``CountVectorizer`` and a DataFrame indexed by
        ``VisitNumber`` holding one column per vocabulary word plus
        ``words_count`` and ``words_len``.
    """
    return _descWordFeatures(df, isUnique)


def processDescTest(df, isUnique, vec):
    """Apply an already fitted vectorizer to per-visit department descriptions.

    Parameters
    ----------
    df : DataFrame with ``VisitNumber`` and ``DepartmentDescription`` columns.
        It is not modified; missing descriptions are treated as ''.
    isUnique : when true, each visit's descriptions are de-duplicated (and
        sorted) before being joined; otherwise every row contributes.
    vec : fitted vectorizer, as returned by ``processDescTrain``.

    Returns
    -------
    DataFrame indexed by ``VisitNumber`` with one column per vocabulary word
    of ``vec`` plus ``words_count`` and ``words_len``.
    """
    _, word_count_vector = _descWordFeatures(df, isUnique, vec)
    return word_count_vector


In [None]:

def getCountVectorTrain(df_train,df_test, columnName, dictCount=1000):
    """Fit a per-visit bag-of-values representation of ``columnName``.

    Every value of the column is turned into the token
    ``'<columnName><value as %d>'`` when it belongs to the value dictionary,
    else ``'<columnName>other'``; the tokens of each visit are joined and
    counted with a freshly fitted ``CountVectorizer``.

    The dictionary is the union of all non-null train values and the 2000
    most frequent non-null test values. ``dictCount`` is accepted but not
    used by this function.

    Parameters
    ----------
    df_train, df_test : DataFrames containing ``columnName``; ``df_train``
        must also contain ``VisitNumber``.
    columnName : name of the column to vectorise.
    dictCount : unused.

    Returns
    -------
    (topk_dict, vec, topk_vector)
        The value dictionary (a set), the fitted vectorizer and a DataFrame of
        token counts indexed by ``VisitNumber``.
    """
    train_values = df_train[columnName].dropna().value_counts().index
    # Slicing clips at the end, so [:2000] covers shorter inputs as well.
    test_values = df_test[columnName].dropna().value_counts().iloc[:2000].index
    topk_dict = set(train_values) | set(test_values)

    other_token = '%sother' % (columnName)

    def to_token(value):
        if value in topk_dict:
            return '%s%d' % (columnName, value)
        return other_token

    tokens = pd.Series(df_train[columnName].apply(to_token), name=columnName)
    tokens_with_visit = pd.concat([tokens, df_train['VisitNumber']], axis=1)
    # One space-separated token string per visit.
    topk_flat = tokens_with_visit.groupby('VisitNumber')[columnName].apply(
        lambda group: ' '.join(group))

    vec = CountVectorizer()
    vec.fit(topk_flat)
    counts = vec.transform(topk_flat)
    topk_vector = pd.DataFrame(counts.toarray(), index=topk_flat.index,
                               columns=vec.get_feature_names())

    return topk_dict, vec, topk_vector

def getCountVectorTest(df, columnName,topk_dict, vec):
    """Vectorise ``columnName`` per visit with an already fitted vectorizer.

    Values found in ``topk_dict`` become the token
    ``'<columnName><value as %d>'``, all others ``'<columnName>other'``; the
    tokens of each visit are joined with spaces and counted by ``vec``.

    Parameters
    ----------
    df : DataFrame containing ``columnName`` and ``VisitNumber``.
    columnName : name of the column to vectorise.
    topk_dict : collection of values that keep their own token.
    vec : fitted vectorizer exposing ``transform`` and ``get_feature_names``.

    Returns
    -------
    DataFrame of token counts indexed by ``VisitNumber``, one column per
    feature name of ``vec``.
    """
    other_token = '%sother' % (columnName)

    def to_token(value):
        if value in topk_dict:
            return '%s%d' % (columnName, value)
        return other_token

    tokens = pd.Series(df[columnName].apply(to_token), name=columnName)
    tokens_with_visit = pd.concat([tokens, df['VisitNumber']], axis=1)
    # One space-separated token string per visit.
    topk_flat = tokens_with_visit.groupby('VisitNumber')[columnName].apply(
        lambda group: ' '.join(group))

    counts = vec.transform(topk_flat)
    return pd.DataFrame(counts.toarray(), index=topk_flat.index,
                        columns=vec.get_feature_names())
    

In [None]:
def generateTestTrainFeature(df_train, df_test):
    """Assemble the full train and test feature matrices.

    For each data set four feature groups are concatenated column-wise:
    the basic per-visit features, the department-description word counts
    (unique descriptions per visit), and the count vectors of ``Upc`` and
    ``FinelineNumber``. Vectorizers and value dictionaries are fitted by the
    train-side helpers and reused for the test set.

    Parameters
    ----------
    df_train, df_test : raw item-level DataFrames.

    Returns
    -------
    (df_features_train, df_features_test)
    """
    #generate train feature
    train_parts = [getFeaturesTrain(df_train)]

    vec_desc, train_desc = processDescTrain(df_train, True)
    train_parts.append(train_desc)

    upc_dict, upc_vec, train_upc = getCountVectorTrain(df_train, df_test, 'Upc')
    train_parts.append(train_upc)

    fileno_dict, fileno_vec, train_fineline = getCountVectorTrain(
        df_train, df_test, 'FinelineNumber')
    train_parts.append(train_fineline)

    df_features_train = pd.concat(train_parts, axis=1)

    #generate test feature
    test_parts = [
        getFeaturesTest(df_test),
        processDescTest(df_test, True, vec_desc),
        getCountVectorTest(df_test, 'Upc', upc_dict, upc_vec),
        getCountVectorTest(df_test, 'FinelineNumber', fileno_dict, fileno_vec),
    ]
    df_features_test = pd.concat(test_parts, axis=1)

    return df_features_train, df_features_test