In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math
import os
import pickle
import random
import sys
import time
from contextlib import suppress
from datetime import datetime

In [2]:
def transformX(dataset):
  dataset.loc[:, 'DayOfWeek'] = np.array([time.strptime(dow, '%A').tm_wday for dow in dataset.loc[:, 'DayOfWeek']], dtype=np.float)
  dataset.loc[:, 'Address'] = np.array([1 if 'block' in row else 0 for row in dataset.loc[:, 'Address']], dtype=np.float)
  # Districts
  districts = sorted(dataset.PdDistrict.unique())
  for d in districts:
    dataset.loc[:, d] = np.array([1 if d == x else 0 for x in dataset.loc[:, 'PdDistrict']], dtype=np.float)
  # Date
  datefmt = '%Y-%m-%d %H:%M:%S'
  def transform_time(tm):
    if tm.hour >= 0 and tm.hour <= 6:
      return 0
    elif tm.hour >= 6 and tm.hour <= 12:
      return 1
    elif tm.hour >= 12 and tm.hour <= 18:
      return 2
    return 3
  dataset.loc[:, 'Year'] = np.array([datetime.strptime(row, datefmt).year for row in dataset.loc[:, 'Dates']], dtype=np.float)
  dataset.loc[:, 'Month'] = np.array([datetime.strptime(row, datefmt).month for row in dataset.loc[:, 'Dates']], dtype=np.float)
  dataset.loc[:, 'Day'] = np.array([datetime.strptime(row, datefmt).day for row in dataset.loc[:, 'Dates']], dtype=np.float)
  dataset.loc[:, 'Hour'] = np.array([datetime.strptime(row, datefmt).time().hour for row in dataset.loc[:, 'Dates']], dtype=np.float)
  dataset.loc[:, 'Minute'] = np.array([datetime.strptime(row, datefmt).time().minute for row in dataset.loc[:, 'Dates']], dtype=np.float)
  dataset.loc[:, 'Period'] = np.array([transform_time(datetime.strptime(row, datefmt).time()) for row in dataset.loc[:, 'Dates']], dtype=np.float)
  return dataset.drop(columns=['PdDistrict', 'Dates'])

In [3]:
raw_train = pd.read_csv('train.csv')

X_train = raw_train[['X', 'Y', 'DayOfWeek', 'Address', 'PdDistrict', 'Dates']]
Y_train = raw_train['Category']
del raw_train

X_train = transformX(X_train)
Y_labels = sorted(Y_train.unique())
Y_train = np.array([Y_labels.index(cat) for cat in Y_train], dtype=np.float)


In [4]:
raw_test = pd.read_csv('test.csv')
X_test = raw_test[['X', 'Y', 'DayOfWeek', 'Address', 'PdDistrict', 'Dates']]
del raw_test

X_test = transformX(X_test)

In [5]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=40,min_samples_split=100, n_jobs=-1 )
rfc.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
y_pred_prob = rfc.predict_proba(X_test)

In [29]:
predictions = pd.DataFrame(y_pred_prob, columns=Y_labels)
predictions.index.name = 'Id'
now = time.time()
predictions.to_csv("predictions_{}.csv".format(int(now)))
