In [23]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [24]:
# Data reconfiguration function

def date_to_hours(date):
    '''Turn Date column into hours and fractional minutes'''
    hour = date[11:13]
    minute = date[14:16]
    frac = float(minute)/60.0
    return int(hour) + frac

def split_date(data):
    data['Year'] = data.Dates.apply(lambda x: int(x[0:4]))
    data['Month'] = data.Dates.apply(lambda x: int(x[5:7]))
    data['Day'] = data.Dates.apply(lambda x: int(x[8:10]))
    data['Time'] = data.Dates.apply(date_to_hours)
    
split_date(train)
split_date(test)

In [25]:
#Onehot encode District and Day of Week

def convertDow(data):
    data = pd.concat((data, pd.get_dummies(data.DayOfWeek, prefix="Dow")), axis=1)
    return data

def convertDistrict(data):
#     districts = {'MISSION' : 0, 'CENTRAL': 1, 'TARAVAL': 2, 'INGLESIDE': 3, 'TENDERLOIN': 4, 'BAYVIEW': 5, 
#                  'SOUTHERN': 6, 'NORTHERN': 7, 'PARK': 8, 'RICHMOND': 9}
#     return districts[x]
    data = pd.concat((data, pd.get_dummies(data.PdDistrict, prefix="Dis")), axis=1)
    return data

def convert(x):
    crimes = {
            'ARSON' : 0,'ASSAULT': 1,'BAD CHECKS': 2,'BRIBERY': 3,'BURGLARY': 4,'DISORDERLY CONDUCT': 5,'DRIVING UNDER THE INFLUENCE': 6,
             'DRUG/NARCOTIC': 7,'DRUNKENNESS': 8,'EMBEZZLEMENT': 9,'EXTORTION': 10,'FAMILY OFFENSES': 11,'FORGERY/COUNTERFEITING': 12,
             'FRAUD': 13,'GAMBLING': 14,'KIDNAPPING': 15,'LARCENY/THEFT': 16,'LIQUOR LAWS': 17,'LOITERING': 18,'MISSING PERSON': 19,'NON-CRIMINAL': 20,
             'OTHER OFFENSES': 21,'PORNOGRAPHY/OBSCENE MAT': 22,'PROSTITUTION': 23,'RECOVERED VEHICLE': 24,'ROBBERY': 25,'RUNAWAY': 26,'SECONDARY CODES': 27,
             'SEX OFFENSES FORCIBLE': 28,'SEX OFFENSES NON FORCIBLE': 29,'STOLEN PROPERTY': 30,'SUICIDE': 31,'SUSPICIOUS OCC': 32,'TREA': 33,'TRESPASS': 34,
             'VANDALISM': 35,'VEHICLE THEFT': 36,'WARRANTS':37,'WEAPON LAWS': 38}
    return crimes[x]

In [26]:
train = convertDow(train)
train = convertDistrict(train)

test = convertDow(test)
test = convertDistrict(test)


In [27]:
#Test if there's a street corner

def corner_test(string):
    if '/' in string: return 1
    else: return 0
    
train['Corner'] = train.Address.apply(corner_test)
test['Corner'] = test.Address.apply(corner_test)


# X = pd.DataFrame(train.drop('Category',1))
# y = pd.DataFrame(train['Category'])

# X = pd.DataFrame(X).applymap(convert)
# y = pd.DataFrame(y).applymap(convert)

# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .4)

In [28]:
# predictors = ['X', 'Y']
predictors = ['X', 'Y', 'Year', 'Month', 'Day', 'Time', 'Dow_Monday', 'Dow_Tuesday', 'Dow_Wednesday', 'Dow_Thursday', 'Dow_Friday', 'Dow_Saturday', 'Dow_Sunday', 'Dis_CENTRAL', 'Dis_INGLESIDE', 'Dis_MISSION', 'Dis_NORTHERN', 'Dis_RICHMOND', 'Dis_SOUTHERN', 'Dis_TARAVAL', 'Dis_TENDERLOIN', 'Corner']

In [33]:
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(n_estimators=100)

rf_model = alg.fit(train[predictors][0:10000], train['Category'][0:10000])

estimates = rf_model.predict_proba(test[predictors][0:10000])

df_est= pd.DataFrame(estimates)
dfest = pd.concat([test.Id[0:10000],df_est], axis=1)
print dfest

dfest.columns = ['Id','ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT','DRIVING UNDER THE INFLUENCE',
             'DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION','FAMILY OFFENSES','FORGERY/COUNTERFEITING',
             'FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT','LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL',
             'OTHER OFFENSES','PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES',
             'SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS',
             'VANDALISM','VEHICLE THEFT','WARRANTS','WEAPON LAWS']

dfest.to_csv('resultsRF.csv', index = False, columns=['Id','ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT','DRIVING UNDER THE INFLUENCE',
             'DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION','FAMILY OFFENSES','FORGERY/COUNTERFEITING',
             'FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT','LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL',
             'OTHER OFFENSES','PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES',
             'SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS',
             'VANDALISM','VEHICLE THEFT','WARRANTS','WEAPON LAWS'])

            Id     0         1   2         3     4         5         6  \
0            0  0.00  0.131667   0  0.037714  0.00  0.000000  0.004000   
1            1  0.00  0.067667   0  0.000000  0.00  0.040000  0.052000   
2            2  0.00  0.252214   0  0.010000  0.00  0.000000  0.010000   
3            3  0.00  0.065000   0  0.000000  0.00  0.000000  0.000000   
4            4  0.00  0.065000   0  0.000000  0.00  0.000000  0.000000   
5            5  0.00  0.047143   0  0.000000  0.00  0.022500  0.002000   
6            6  0.00  0.067500   0  0.049167  0.00  0.000000  0.000000   
7            7  0.00  0.052500   0  0.002500  0.00  0.000000  0.000000   
8            8  0.00  0.060000   0  0.030000  0.00  0.000000  0.006667   
9            9  0.00  0.113333   0  0.000000  0.01  0.000000  0.012000   
10          10  0.00  0.034167   0  0.025833  0.00  0.000000  0.000000   
11          11  0.00  0.115000   0  0.110000  0.00  0.000000  0.006250   
12          12  0.00  0.115000   0  0.

ValueError: Length mismatch: Expected axis has 36 elements, new values have 40 elements