# San Francisco Crime Classification

Predict the category of crimes that occurred in the city by the bay

Competition on [Kaggle](https://www.kaggle.com/c/sf-crime)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print 'Train shape: {}'.format(train.shape)
print 'Test shape: {}'.format(test.shape)

Train shape: (878049, 9)
Test shape: (884262, 7)


In [3]:
label = train['Category']
del train['Category']

# Remove column that is not in test data
del train['Descript']
train.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
test_id = test['Id']
del test['Id']
test.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
# Remove lat and lon for now
del train['X']
del train['Y']
del test['X']
del test['Y']

In [6]:
total = pd.concat([train, test], ignore_index=True)
total.head()

Unnamed: 0,Address,Dates,DayOfWeek,PdDistrict,Resolution
0,OAK ST / LAGUNA ST,2015-05-13 23:53:00,Wednesday,NORTHERN,"ARREST, BOOKED"
1,OAK ST / LAGUNA ST,2015-05-13 23:53:00,Wednesday,NORTHERN,"ARREST, BOOKED"
2,VANNESS AV / GREENWICH ST,2015-05-13 23:33:00,Wednesday,NORTHERN,"ARREST, BOOKED"
3,1500 Block of LOMBARD ST,2015-05-13 23:30:00,Wednesday,NORTHERN,NONE
4,100 Block of BRODERICK ST,2015-05-13 23:30:00,Wednesday,PARK,NONE


In [7]:
# Convert Dates feature to year, month, hour
total['Dates'] = pd.to_datetime(total.Dates)
total['Year'] = total.Dates.dt.year.astype(str)
total['Month'] = total.Dates.dt.month.astype(str)
total['Hour'] = total.Dates.dt.hour.astype(str)
del total['Dates']

In [8]:
for c in total.columns:
    num_unique = len(total[c].unique())
    print 'Column: {}'.format(c)
    print 'Number of unique values: {}\n'.format(num_unique)

Column: Address
Number of unique values: 24777

Column: DayOfWeek
Number of unique values: 7

Column: PdDistrict
Number of unique values: 10

Column: Resolution
Number of unique values: 18

Column: Year
Number of unique values: 13

Column: Month
Number of unique values: 12

Column: Hour
Number of unique values: 24



In [9]:
del total['Address']

In [10]:
total.head()

Unnamed: 0,DayOfWeek,PdDistrict,Resolution,Year,Month,Hour
0,Wednesday,NORTHERN,"ARREST, BOOKED",2015,5,23
1,Wednesday,NORTHERN,"ARREST, BOOKED",2015,5,23
2,Wednesday,NORTHERN,"ARREST, BOOKED",2015,5,23
3,Wednesday,NORTHERN,NONE,2015,5,23
4,Wednesday,PARK,NONE,2015,5,23


In [11]:
dummies = pd.get_dummies(total)

In [12]:
dummies.shape

(1762311, 83)

In [13]:
train_dummies = dummies.head(len(train))
test_dummies = dummies.tail(len(test))

In [21]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(train_dummies, label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
preds = clf.predict_proba(test_dummies)

In [24]:
preds.shape

(884262, 39)

In [25]:
sample = pd.read_csv('data/sampleSubmission.csv')
sample.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [39]:
df_preds = pd.DataFrame(preds)
df_preds.columns = clf.classes_
df_preds.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.007869,0.171707,3.5e-05,0.002723,0.010409,0.00186,0.002257,0.008423,0.004079,0.000243,...,0.000928,0.004033,0.000245,0.036235,0.000175,0.005435,0.042485,0.017915,0.008692,0.038056
1,0.007869,0.171707,3.5e-05,0.002723,0.010409,0.00186,0.002257,0.008423,0.004079,0.000243,...,0.000928,0.004033,0.000245,0.036235,0.000175,0.005435,0.042485,0.017915,0.008692,0.038056
2,0.002625,0.13987,5.2e-05,0.000955,0.014468,0.004079,0.002799,0.009744,0.006025,0.000317,...,0.000477,0.007932,0.000382,0.027541,0.000104,0.007812,0.037597,0.012848,0.010273,0.019288
3,0.003902,0.171638,3.4e-05,0.003196,0.009628,0.001877,0.0033,0.006055,0.004211,0.000203,...,0.001064,0.004858,0.000436,0.030798,6.5e-05,0.004741,0.045419,0.024313,0.007137,0.032751
4,0.003902,0.171638,3.4e-05,0.003196,0.009628,0.001877,0.0033,0.006055,0.004211,0.000203,...,0.001064,0.004858,0.000436,0.030798,6.5e-05,0.004741,0.045419,0.024313,0.007137,0.032751


In [40]:
id_list = range(0, len(preds))
df_id = pd.DataFrame({'Id': id_list})
df_id.head()

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4


In [42]:
df = pd.concat([df_id, df_preds], axis=1)
df.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.007869,0.171707,3.5e-05,0.002723,0.010409,0.00186,0.002257,0.008423,0.004079,...,0.000928,0.004033,0.000245,0.036235,0.000175,0.005435,0.042485,0.017915,0.008692,0.038056
1,1,0.007869,0.171707,3.5e-05,0.002723,0.010409,0.00186,0.002257,0.008423,0.004079,...,0.000928,0.004033,0.000245,0.036235,0.000175,0.005435,0.042485,0.017915,0.008692,0.038056
2,2,0.002625,0.13987,5.2e-05,0.000955,0.014468,0.004079,0.002799,0.009744,0.006025,...,0.000477,0.007932,0.000382,0.027541,0.000104,0.007812,0.037597,0.012848,0.010273,0.019288
3,3,0.003902,0.171638,3.4e-05,0.003196,0.009628,0.001877,0.0033,0.006055,0.004211,...,0.001064,0.004858,0.000436,0.030798,6.5e-05,0.004741,0.045419,0.024313,0.007137,0.032751
4,4,0.003902,0.171638,3.4e-05,0.003196,0.009628,0.001877,0.0033,0.006055,0.004211,...,0.001064,0.004858,0.000436,0.030798,6.5e-05,0.004741,0.045419,0.024313,0.007137,0.032751


In [43]:
sub_number = 1
df.to_csv('submissions/submission{}.csv'.format(sub_number), index=False)