# [San Francisco Crime Classification](https://www.kaggle.com/c/sf-crime)

### 사용되는 파일
* **_input/test.csv_** - Test Set
* **_input/train.csv_** - Train Set
* **_input/sampleSubmission.csv_** - Kaggle 제출용

이 데이터셋은 SFPD 범죄 사건 리포팅 시스템에서 유래된 사건이 포함되어 있다. 이 데이터 범위는 2003/1/1부터 2015/5/13까지이다. Test Set과 Training Set은 매주 회전합니다.(데이터가 격주로 존재한다는 말인듯) 1,3,5,7주는 테스트셋이고 2,4,6,8은 Training Set입니다.

### Data fields
* **_Dates_** - 범죄사건의 timestamp
* **_Category_** - 범죄사건의 카테고리(only train.csv). 이것은 예측해야하는 목표 변수(종속 변수)
* **_Descript_** - 범죄사건의 자세한 설명 (only train.csv)
* **_DayOfWeek_** - 요일
* **_PdDistrict_** - 관할 경찰서
* **_Resolution_** - 범죄사건이 어떻게 해결됐는지 (only train.csv
* **_Address_** - 범죄 사건이 발생한 주소
* **_X_** - 경도
* **_Y_** - 위도

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("input/train.csv")
print(train.shape)
train.head()

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015.5.13 23:53,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015.5.13 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015.5.13 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015.5.13 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015.5.13 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
test = pd.read_csv("input/test.csv")
print(test.shape)
test.head()

def roundXY(x):
    x = round(x, 2)
    return x

test["X"] = test["X"].apply(roundXY)
test["Y"] = test["Y"].apply(roundXY)

train["X"] = train["X"].apply(roundXY)
train["Y"] = train["Y"].apply(roundXY)

train['street_corner'] = train['Address'].apply(lambda x: 1 if '/' in x else 0)
test['street_corner'] = test['Address'].apply(lambda x: 1 if '/' in x else 0)

(884262, 7)


## Train

In [4]:
feature_names = ["X", "Y"]
feature_names

['X', 'Y']

In [5]:
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(878049, 2)


Unnamed: 0,X,Y
0,-122.43,37.77
1,-122.43,37.77
2,-122.42,37.8
3,-122.43,37.8
4,-122.44,37.77


In [6]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(884262, 2)


Unnamed: 0,X,Y
0,-122.4,37.74
1,-122.39,37.73
2,-122.43,37.79
3,-122.44,37.72
4,-122.44,37.72


In [7]:
label_name = "Category"

y_train = train[label_name]

print(y_train.shape)
y_train.head()

(878049,)


0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
3     LARCENY/THEFT
4     LARCENY/THEFT
Name: Category, dtype: object

In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs=-1)
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
from sklearn.model_selection import cross_val_score

%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()

score = -1.0 * score

print("Score = {0:.5f}".format(score))

CPU times: user 59.5 s, sys: 3.28 s, total: 1min 2s
Wall time: 34.5 s
Score = 2.60278


In [10]:
import xgboost as xgb

model = xgb.XGBClassifier(n_estimators=15, nthread=4)
model



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=15, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [11]:
%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()

score = -1.0 * score

print("Score = {0:.5f}".format(score))

CPU times: user 7min 29s, sys: 2.95 s, total: 7min 32s
Wall time: 7min 35s
Score = 2.83976


In [12]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=15, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [13]:
predictions = model.predict_proba(X_test)

print(predictions.shape)
predictions

(884262, 39)


array([[ 0.01309049,  0.06858905,  0.01161566, ...,  0.06114429,
         0.03573009,  0.02019808],
       [ 0.01351147,  0.0766116 ,  0.01221747, ...,  0.05049963,
         0.03758131,  0.02124456],
       [ 0.01217875,  0.05380455,  0.0118699 , ...,  0.05204831,
         0.02213458,  0.01367625],
       ..., 
       [ 0.01246527,  0.06929345,  0.01162627, ...,  0.06456899,
         0.02759446,  0.01685168],
       [ 0.01322596,  0.07061828,  0.01195931, ...,  0.04943253,
         0.03678719,  0.02079565],
       [ 0.01253004,  0.05055364,  0.01168668, ...,  0.06413946,
         0.02315977,  0.01606383]], dtype=float32)

## Submit 

In [14]:
submission = pd.read_csv("input/sampleSubmission.csv", index_col="Id")

print(submission.shape)
submission.head()

(884262, 39)


Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
submission = pd.DataFrame(predictions, index=submission.index, columns=submission.columns)
print(submission.shape)
submission.head()

(884262, 39)


Unnamed: 0_level_0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.01309,0.068589,0.011616,0.011694,0.031514,0.012469,0.012489,0.037156,0.012672,0.011982,...,0.011509,0.013608,0.011708,0.033026,0.011417,0.01394,0.041967,0.061144,0.03573,0.020198
1,0.013511,0.076612,0.012217,0.0123,0.033146,0.013115,0.013136,0.040131,0.013797,0.012603,...,0.012105,0.014314,0.012315,0.035138,0.012008,0.014663,0.040267,0.0505,0.037581,0.021245
2,0.012179,0.053805,0.01187,0.011686,0.06435,0.012813,0.012465,0.021825,0.01335,0.012196,...,0.01166,0.014823,0.011818,0.028912,0.011583,0.014812,0.041234,0.052048,0.022135,0.013676
3,0.012581,0.067573,0.011734,0.011813,0.03197,0.012597,0.012664,0.026468,0.012802,0.012071,...,0.011626,0.014472,0.011828,0.032978,0.011533,0.014106,0.046567,0.073316,0.027851,0.017008
4,0.012581,0.067573,0.011734,0.011813,0.03197,0.012597,0.012664,0.026468,0.012802,0.012071,...,0.011626,0.014472,0.011828,0.032978,0.011533,0.014106,0.046567,0.073316,0.027851,0.017008


In [16]:
submission.to_csv("output/baseline-script.csv")