In [6]:
import math
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import pickle
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
import time

# Smallest probability allowed when predictions are clipped for the log loss;
# keeps log() away from 0 and 1.
eps = 1e-15

# Run the companion notebook through the IPython %run magic.
# NOTE(review): presumably this is where getLabeled (used further down) is
# defined -- confirm against methods.ipynb.
%run 'methods.ipynb'

In [3]:
def loss_finction(y, pred, eps=1e-15):
    """Multi-class logarithmic loss: mean negative log-probability of the true class.

    Parameters
    ----------
    y : sequence of int
        True class index of every sample; each value is used as a column
        index into the matching row of ``pred``.
    pred : 2-D array-like
        Predicted probabilities, one row per sample and one column per class.
        Rows beyond ``len(y)`` are ignored.
    eps : float, optional
        Clipping bound. Every true-class probability is forced into
        ``[eps, 1 - eps]`` before the logarithm so that a probability of
        exactly 0 or 1 cannot produce an infinite loss. The default keeps the
        function independent of any notebook-level ``eps`` variable.

    Returns
    -------
    float
        ``-mean(log(p_true))`` over the ``len(y)`` samples.

    Raises
    ------
    ValueError
        If ``y`` is empty (the mean would be undefined).
    """
    n_samples = len(y)
    if n_samples == 0:
        raise ValueError("loss_finction needs at least one sample")

    labels = np.asarray(y)
    probs = np.asarray(pred)

    # Pick, for every sample i, the probability assigned to its true class y[i].
    true_class_probs = probs[np.arange(n_samples), labels]
    clipped = np.clip(true_class_probs, eps, 1 - eps)

    # float() keeps the return type a plain Python float.
    return float(-np.log(clipped).mean())
    

In [4]:
# Prepare data: load the raw training table from train.csv in the working directory.
# Later cells read its VisitNumber, TripType and DepartmentDescription columns.
df_train = pd.read_csv('train.csv')

In [7]:
# One target row per visit: keep the first TripType seen for each VisitNumber.
# The result is indexed by VisitNumber.
df_y = df_train[['VisitNumber', 'TripType']].groupby('VisitNumber').first()

# getLabeled is not defined in this notebook.
# NOTE(review): presumably it comes from methods.ipynb and returns a fitted label
# encoder plus the integer-encoded labels -- confirm. `y` is later indexed with
# integer position arrays and used as class indices into predict_proba output.
le, y = getLabeled(df_y['TripType'])
# Disabled: persist the encoded labels to y.p.
#pickle.dump( y, open( "y.p", "wb" ) )


In [8]:
# Build one text "document" per visit out of its department descriptions.
# Missing descriptions become empty strings so that they can be joined.
df_train['DepartmentDescription'] = df_train['DepartmentDescription'].fillna('')
df_words = (
    df_train
    .groupby('VisitNumber')['DepartmentDescription']
    .apply(lambda parts: '{' + ' '.join(parts) + '}')
)

# Word-level bag-of-words; every option is spelled out explicitly as None / "word".
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=None,
)

# Learn the vocabulary. As the last expression of the cell, the fitted
# vectorizer is what the cell displays.
vectorizer.fit(df_words)

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
# Term-count matrix for the training visits: one row per entry of df_words,
# one column per vocabulary term learned by the fitted vectorizer.
train_data_features = vectorizer.transform(df_words)

In [None]:

# Per-visit text statistics in a frame indexed by VisitNumber.
# Chained (non-inplace) form so re-running the cell always starts from df_words.
df_words1 = df_words.reset_index().set_index('VisitNumber')

# Remove the '{' / '}' wrapper characters before measuring the text.
bare_text = df_words1['DepartmentDescription'].astype(str).str.strip('{}')

df_words1['word_count'] = bare_text.str.split().str.len()  # whitespace-separated tokens
df_words1['word_len'] = bare_text.str.len()                 # characters

# Store each visit's own row of the count matrix. Handing the whole matrix to
# pd.Series() does not split it by row (it is treated as one object), so the
# rows are sliced out explicitly into an object array first.
n_visits = train_data_features.shape[0]
row_features = np.empty(n_visits, dtype=object)
for row in range(n_visits):
    row_features[row] = train_data_features[row]
df_words1['word_feature'] = pd.Series(row_features, index=df_words1.index)

df_words1.head()

In [10]:
# Show the count matrix as (row, column) count entries.
# Parenthesized single-argument form prints the same text and also parses on Python 3.
print(train_data_features)

  (0, 37)	1
  (0, 94)	1
  (1, 19)	1
  (1, 82)	1
  (1, 97)	1
  (2, 3)	16
  (2, 5)	18
  (2, 21)	1
  (2, 28)	1
  (2, 34)	1
  (2, 41)	1
  (2, 42)	1
  (2, 48)	1
  (2, 55)	1
  (2, 69)	1
  (2, 80)	16
  (2, 83)	2
  (2, 104)	1
  (2, 105)	2
  (3, 57)	1
  (3, 73)	1
  (3, 89)	2
  (4, 18)	1
  (4, 26)	1
  (4, 34)	2
  :	:
  (95671, 48)	5
  (95671, 55)	3
  (95671, 76)	1
  (95671, 81)	2
  (95671, 84)	1
  (95671, 104)	1
  (95672, 21)	2
  (95672, 28)	1
  (95672, 33)	5
  (95672, 34)	3
  (95672, 38)	1
  (95672, 41)	1
  (95672, 42)	2
  (95672, 47)	5
  (95672, 48)	8
  (95672, 55)	2
  (95672, 69)	1
  (95672, 76)	2
  (95672, 84)	2
  (95672, 89)	2
  (95672, 104)	2
  (95673, 28)	1
  (95673, 33)	1
  (95673, 47)	1
  (95673, 48)	1


In [None]:
# Hold out 33% of the visits (fixed seed) and fit a shallow baseline forest on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_features,
    y,
    test_size=0.33,
    random_state=0,
)
# fit() returns the estimator itself, so `forest` is the fitted model.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

In [None]:
# Class-probability predictions for the held-out split, scored with the log loss
# defined in loss_finction; the bare last expression makes the cell display the score.
y_pred = forest.predict_proba(X_test)
loss_finction(y_test, y_pred)

In [None]:
# 5-fold cross-validated log loss for every (n_estimators, max_depth) pair below.
# The shuffle is seeded (like every other estimator/split in this notebook) so that
# repeated runs score exactly the same folds and the numbers are comparable.
# Uses the sklearn.cross_validation KFold API imported at the top of the notebook:
# the object is iterated directly and supports len().
kf = KFold(len(y), n_folds=5, shuffle=True, random_state=0)

# Each grid currently holds a single value. Values left over from the original
# inline comments: n_estimators 50/100/200/300, max_depth 2/3/5/7/9/11/13/15.
for n_estimator in [50]:
    for max_depth in [15]:
        fold_losses = []
        for train_index, test_index in kf:
            X_train, X_test = train_data_features[train_index], train_data_features[test_index]
            y_train, y_test = y[train_index], y[test_index]
            forest = RandomForestClassifier(max_depth=max_depth,
                                            n_estimators=n_estimator,
                                            random_state=0)
            forest = forest.fit(X_train, y_train)
            y_pred = forest.predict_proba(X_test)
            fold_losses.append(loss_finction(y_test, y_pred))

        # Mean loss over the folds; single-string print gives identical text on Python 2 and 3.
        print("%s %s => %s" % (n_estimator, max_depth, sum(fold_losses) / len(fold_losses)))


In [None]:
# Prepare submission: rebuild the per-visit documents for the test set exactly the
# way the training documents were built, then vectorize them with the vocabulary
# learned on the training data.
df_test = pd.read_csv('test.csv')
df_test['DepartmentDescription'] = df_test['DepartmentDescription'].fillna('')
# Same ' ' separator as the training documents, so both sets are constructed identically.
df_words_test = df_test.groupby('VisitNumber')['DepartmentDescription']\
            .apply(lambda x: '{%s}' % ' '.join(x))

test_data_features = vectorizer.transform(df_words_test)

# Final model: refit on all training visits with the depth / tree count used in the
# cross-validation cell.
forest = RandomForestClassifier(max_depth=15, n_estimators=50, random_state=0)
forest.fit(train_data_features, y)

# One row of class probabilities per test visit.
y_test_pred = forest.predict_proba(test_data_features)

In [None]:
# The sample submission supplies the class column names (its first column is the index).
submission = pd.read_csv('sample_submission.csv', index_col=0)
names = submission.columns.values

# Label the prediction rows with the sorted unique visit numbers of the test set.
visit_ids = np.unique(df_test['VisitNumber'])
sub_df = pd.DataFrame(y_test_pred, columns=names)
sub_df.index = pd.Index(visit_ids, name='VisitNumber')

# Millisecond timestamp in the file name so successive runs never overwrite each other.
millis = int(round(time.time() * 1000))
filename = 'rf_desc_sub1%d.csv' % millis
sub_df.to_csv(filename)

In [None]:
def dumpPredY(filename, index_col, y_pred):
    """Pickle a prediction table laid out like the sample submission.

    Parameters
    ----------
    filename : str
        Path of the pickle file to write (overwritten if it exists).
    index_col : array-like
        Row labels for the table, one per row of ``y_pred``; stored as an
        index named ``'VisitNumber'``.
    y_pred : 2-D array-like
        Predicted values with one column per non-index column of
        ``sample_submission.csv``.

    Notes
    -----
    Reads ``sample_submission.csv`` from the working directory to obtain the
    column names. The output file is opened in a ``with`` block so the handle
    is flushed and closed even if pickling fails.
    """
    submission = pd.read_csv('sample_submission.csv', index_col=0)
    names = submission.columns.values

    sub_df = pd.DataFrame(y_pred, columns=names)
    # Assign the labels directly: works for arrays, Index objects and plain lists alike.
    sub_df.index = pd.Index(index_col, name='VisitNumber')

    with open(filename, "wb") as out_file:
        pickle.dump(sub_df, out_file)

In [None]:
# Class probabilities for the training visits from whatever model is currently
# bound to `forest`.
# NOTE(review): if that forest was fit on these same rows, these are in-sample
# predictions and will look optimistic -- confirm this is intended.
y_train_pred = forest.predict_proba(train_data_features)

# Pickle both prediction tables; rows are labelled with the test set's sorted
# unique visit numbers and with df_words1's index respectively.
dumpPredY('test_features.p', np.unique(df_test['VisitNumber']), y_test_pred)
dumpPredY('train_features.p', df_words1.index, y_train_pred)

