In [None]:
import pandas as pd

## Load Dataset

In [None]:
# The path (e.g. data/train.csv) may differ depending on the user's setup.
train_csv_path = "data/train.csv"

# "Dates" is parsed as datetime at load time; later cells use its .dt accessor.
train = pd.read_csv(train_csv_path, parse_dates=["Dates"])

print(train.shape)
train.head()

In [None]:
# Load the test set the same way as the training set, parsing "Dates" as datetime.
test_csv_path = "data/test.csv"
test = pd.read_csv(test_csv_path, parse_dates=["Dates"])

print(test.shape)
test.head()

## Preprocessing

### Parse Dates

In [None]:
# Each new column is paired with the datetime attribute it is read from.
train_date_parts = [
    ("Dates-year", "year"),
    ("Dates-month", "month"),
    ("Dates-day", "day"),
    ("Dates-hour", "hour"),
    ("Dates-minute", "minute"),
    ("Dates-second", "second"),
]

# Split the timestamp into one integer column per component, in the order above.
train_dates = train["Dates"].dt
for column, attribute in train_date_parts:
    train[column] = getattr(train_dates, attribute)

print(train.shape)
train[["Dates"] + [column for column, attribute in train_date_parts]].head()

In [None]:
# Each new column is paired with the datetime attribute it is read from.
test_date_parts = [
    ("Dates-year", "year"),
    ("Dates-month", "month"),
    ("Dates-day", "day"),
    ("Dates-hour", "hour"),
    ("Dates-minute", "minute"),
    ("Dates-second", "second"),
]

# Apply the same timestamp decomposition to the test set.
test_dates = test["Dates"].dt
for column, attribute in test_date_parts:
    test[column] = getattr(test_dates, attribute)

print(test.shape)
test[["Dates"] + [column for column, attribute in test_date_parts]].head()

### Parse PdDistrict

In [None]:
# Distinct district names found in the training data.
pd_district_list = train["PdDistrict"].unique()

# One indicator column per district, named "PdDistrict_<district>".
# NOTE: the variable name (sic, "disrict") is read by later cells, so it is kept as-is.
pd_disrict_column_list = ["PdDistrict_" + district for district in pd_district_list]

# Each indicator is True exactly where the row's district equals that district.
for district, indicator_column in zip(pd_district_list, pd_disrict_column_list):
    train[indicator_column] = train["PdDistrict"] == district

print(train.shape)
train[pd_disrict_column_list].head()

In [None]:
# Build the same indicator columns on the test set, iterating over the districts
# discovered in the training data so both frames end up with matching columns.
for district in pd_district_list:
    is_district = test["PdDistrict"] == district
    test["PdDistrict_" + district] = is_district

print(test.shape)
test[pd_disrict_column_list].head()

In [None]:
import numpy as np

# Distance of the recorded minute from the half-hour mark (0 at :30, 30 at :00).
minute_offset_train = train["Dates-minute"] - 30
train["Dates-minute-abs"] = minute_offset_train.abs()

print(train.shape)
train[["Dates-minute", "Dates-minute-abs"]].head()

In [None]:
# Same half-hour distance feature for the test set.
minute_offset_test = test["Dates-minute"] - 30
test["Dates-minute-abs"] = minute_offset_test.abs()

print(test.shape)
test[["Dates-minute", "Dates-minute-abs"]].head()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Two side-by-side panels on an 18x4 inch figure:
# left shows counts per raw minute, right shows counts per distance from :30.
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 4))

sns.countplot(x="Dates-minute", data=train, ax=ax1)
sns.countplot(x="Dates-minute-abs", data=train, ax=ax2)

### AddressType

In [None]:
# Flag addresses that contain a "/" (presumably street intersections — confirm
# against the dataset description).
#   regex=False: match "/" as a literal substring instead of compiling a regex.
#   na=False:    a missing address yields False rather than NaN, so the column
#                stays strictly boolean and usable as a model feature.
train["CrossRoad"] = train["Address"].str.contains("/", regex=False, na=False)

print(train.shape)
train[["Address", "CrossRoad"]].head()

In [None]:
# Flag test-set addresses that contain a "/" (presumably street intersections —
# confirm against the dataset description).
#   regex=False: match "/" as a literal substring instead of compiling a regex.
#   na=False:    a missing address yields False rather than NaN, so the column
#                stays strictly boolean and usable as a model feature.
test["CrossRoad"] = test["Address"].str.contains("/", regex=False, na=False)

print(test.shape)
test[["Address", "CrossRoad"]].head()

## Train

In [None]:
# Hand-picked columns, followed by every district indicator column created earlier.
base_feature_names = ["X", "Y", "Dates-hour", "Dates-minute-abs", "CrossRoad"]
feature_names = base_feature_names + pd_disrict_column_list
feature_names

In [None]:
# Training feature matrix: all rows, only the selected feature columns.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head()

In [None]:
# Test feature matrix: same columns, in the same order, as X_train.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head()

In [None]:
# Name of the target column to predict.
label_name = "Category"

# Training labels, taken straight from the target column without any encoding.
# NOTE(review): recent XGBoost releases may reject non-integer class labels —
# confirm the installed version accepts this column as-is.
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head()

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Baseline configuration: 15 boosting rounds on 4 threads, everything else default.
baseline_params = {
    "n_estimators": 15,
    "nthread": 4,
}
model = xgb.XGBClassifier(**baseline_params)
model

In [None]:
from sklearn.model_selection import cross_val_score

%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()
score = -1.0 * score

print("Score = {0:.5f}".format(score))

In [None]:
# Second configuration: more boosting rounds with explicit depth, learning rate,
# delta-step cap and a fixed seed. Rebinds `model`, replacing the baseline.
second_params = {
    "n_estimators": 45,
    "max_depth": 6,
    "learning_rate": 1.0,
    "max_delta_step": 1,
    "nthread": 4,
    "seed": 37,
}
model = xgb.XGBClassifier(**second_params)
model

In [None]:
from sklearn.model_selection import cross_val_score

%time score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_log_loss").mean()
score = -1.0 * score

print("Score = {0:.5f}".format(score))

In [None]:
# Fit the current `model` on the full training features and labels;
# the %time magic reports how long the fit takes.
%time model.fit(X_train, y_train)

In [None]:
# Per-class probability estimates for every test row (probabilities rather than
# hard labels, matching the log-loss scoring used during cross-validation above).
predictions = model.predict_proba(X_test)

# Print the shape, then display the array itself as the cell output.
print(predictions.shape)
predictions

## Submit

In [None]:
# Load the sample submission, using "Id" as the index; its index and column
# header serve as the template for the real submission built in the next cell.
sample_submission_path = "data/sampleSubmission.csv"
submission = pd.read_csv(sample_submission_path, index_col="Id")

print(submission.shape)
submission.head()

In [None]:
# Column order required by the submission file (taken from the sample header).
expected_columns = submission.columns

# Label each probability column with the class the fitted model assigned to it
# instead of assuming the model's class order matches the sample header.
# (scikit-learn convention: predict_proba columns follow `classes_`.)
# NOTE(review): rows are aligned positionally with the sample submission index —
# this assumes the test rows are in the same order as the sample Ids; confirm.
submission = pd.DataFrame(predictions,
                          index=submission.index,
                          columns=model.classes_)

# Select the columns in the expected order. This raises KeyError if the model's
# classes do not cover the header, rather than silently mislabelling columns.
submission = submission[expected_columns]

print(submission.shape)
submission.head()

In [None]:
# Write the submission to CSV; the index is written as the first column.
submission.to_csv("data/baseline-script.csv")