In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import re
import optuna

In [28]:
train = pd.read_csv(r"C:\Users\raven\Downloads\sf-crime\train.csv\train.csv")
test  = pd.read_csv(r"C:\Users\raven\Downloads\sf-crime\test.csv\test.csv")

In [29]:
train1 = train.drop(['Descript', 'Resolution'], axis = 1)
train1.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541


In [30]:
train1 = train1.query('Y != 90')

In [31]:
Dt = pd.to_datetime(train1['Dates'])
train1['Date'] = Dt.dt.date
train1['Year'] = Dt.dt.year
train1['Month'] = Dt.dt.month
train1['Day'] = Dt.dt.day
train1['Hour'] = Dt.dt.hour
train1.drop(["Dates"], axis = 1, inplace = True)

le = LabelEncoder()
train1['DayOfWeek'] = le.fit_transform(train1['DayOfWeek'])

count_encoding = train1['PdDistrict'].value_counts().to_dict()
train1['PdDistrict'] = train1['PdDistrict'].map(count_encoding)

# Ad_dict = {'.*?ST / .*?ST':0, '.*?AV / .*?AV':1, '.*?ST / .*?AV':2, '.*?AV / .*?ST':3, '.*?ST':4, '.*?AV':5}
# train1['Address'] = train1['Address'].replace(Ad_dict,regex = True)
# train1['Address'].unique()
Ad_dict = {'.* / .*':0, '.*':1}
train1['Address'] = train1['Address'].replace(Ad_dict,regex = True)

train1['Category'] = le.fit_transform(train1['Category'])

train1.head()
# # DayOfWeek...label, PdDistrict...count

Unnamed: 0,Category,DayOfWeek,PdDistrict,Address,X,Y,Date,Year,Month,Day,Hour
0,37,6,105284,0,-122.425892,37.774599,2015-05-13,2015,5,13,23
1,21,6,105284,0,-122.425892,37.774599,2015-05-13,2015,5,13,23
2,21,6,105284,0,-122.424363,37.800414,2015-05-13,2015,5,13,23
3,16,6,105284,1,-122.426995,37.800873,2015-05-13,2015,5,13,23
4,16,6,49311,1,-122.438738,37.771541,2015-05-13,2015,5,13,23


In [32]:
Dt = pd.to_datetime(test['Dates'])
test['Date'] = Dt.dt.date
test['Year'] = Dt.dt.year
test['Month'] = Dt.dt.month
test['Day'] = Dt.dt.day
test['Hour'] = Dt.dt.hour
test.drop(["Dates"], axis = 1, inplace = True)

le = LabelEncoder()
test['DayOfWeek'] = le.fit_transform(test['DayOfWeek'])

count_encoding = test['PdDistrict'].value_counts().to_dict()
test['PdDistrict'] = test['PdDistrict'].map(count_encoding)

Ad_dict = {'.* / .*':0, '.*':1}
test['Address'] = test['Address'].replace(Ad_dict,regex = True)
test.head()

Unnamed: 0,Id,DayOfWeek,PdDistrict,Address,X,Y,Date,Year,Month,Day,Hour
0,0,3,89591,1,-122.399588,37.735051,2015-05-10,2015,5,10,23
1,1,3,89591,0,-122.391523,37.732432,2015-05-10,2015,5,10,23
2,2,3,107017,1,-122.426002,37.792212,2015-05-10,2015,5,10,23
3,3,3,80084,1,-122.437394,37.721412,2015-05-10,2015,5,10,23
4,4,3,80084,1,-122.437394,37.721412,2015-05-10,2015,5,10,23


In [33]:
cols = ['DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'Year', 'Month', 'Day', 'Hour']

X = train1[cols]
Y = train1['Category']

X_test = test[cols]

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size = 0.3, random_state = 1219)

In [56]:
# optunaの目的関数を設定する
def objective(trial):
    # criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    criterion = "gini"
    bootstrap = True  # trial.suggest_categorical('bootstrap',[True,False])
    max_depth = trial.suggest_int('max_depth', 1, 2)
    max_features = None  # trial.suggest_categorical('max_features', ['sqrt','log2',None])
    n_estimators = 5  # trial.suggest_int('n_estimators', 1, 2)
    min_samples_split = trial.suggest_int('min_samples_split',2,5)
    max_leaf_nodes = None
    min_samples_leaf = 1

    rfcpt = RFC(bootstrap = bootstrap, criterion = criterion,
                                 max_depth = max_depth, max_features = max_features,
                                 max_leaf_nodes = max_leaf_nodes,
                                 n_estimators = n_estimators,
                                 min_samples_split = min_samples_split,min_samples_leaf = min_samples_leaf)
    score = cross_val_score(rfcpt, X_train, y_train, cv=5, scoring="r2")
    r2_mean = score.mean()
    print(r2_mean)

    return r2_mean

In [57]:
#optunaで学習
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=True)


[32m[I 2023-02-08 18:55:48,931][0m A new study created in memory with name: no-name-1586c5c3-5a85-4dab-8a55-111395501462[0m
  self._init_valid()


  0%|          | 0/5 [00:00<?, ?it/s]

-0.04869113844568758
[32m[I 2023-02-08 18:56:02,324][0m Trial 0 finished with value: -0.04869113844568758 and parameters: {'max_depth': 1, 'min_samples_split': 2}. Best is trial 0 with value: -0.04869113844568758.[0m
-0.07250007617576734
[32m[I 2023-02-08 18:56:22,831][0m Trial 1 finished with value: -0.07250007617576734 and parameters: {'max_depth': 2, 'min_samples_split': 5}. Best is trial 0 with value: -0.04869113844568758.[0m
-0.04869113844568758
[32m[I 2023-02-08 18:56:35,645][0m Trial 2 finished with value: -0.04869113844568758 and parameters: {'max_depth': 1, 'min_samples_split': 2}. Best is trial 0 with value: -0.04869113844568758.[0m
-0.07090418215717058
[32m[I 2023-02-08 18:56:56,121][0m Trial 3 finished with value: -0.07090418215717058 and parameters: {'max_depth': 2, 'min_samples_split': 3}. Best is trial 0 with value: -0.04869113844568758.[0m
-0.04869113844568758
[32m[I 2023-02-08 18:57:08,932][0m Trial 4 finished with value: -0.04869113844568758 and paramete

In [58]:

# チューニングしたハイパーパラメーターをフィット
optimised_rf = RFC(bootstrap = study.best_params['bootstrap'], criterion = study.best_params['criterion'],
                                     max_depth = study.best_params['max_depth'], max_features = study.best_params['max_features'],
                                     max_leaf_nodes = None, n_estimators = study.best_params['n_estimators'],
                                     min_samples_split = study.best_params['min_samples_split'],min_samples_leaf = study.best_params['min_samples_leaf'])
optimised_rf.fit(X_train ,y_train)


KeyError: 'bootstrap'

In [29]:
# rf = RFC(n_jobs=-1,          # 複数のCPUコアを使って並列に学習します。-1は最大値。
#           random_state=2525)  # 乱数のシードです。
rf = RFC(n_estimators=60, max_depth=32, random_state=1219)

rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=32, n_estimators=60, random_state=1219)

In [30]:
y_pred = rf.predict(X_val)

In [31]:
# y_val.reshape(-1,1)

In [32]:
# print(f"acc: {rf.score(y_val, y_pred)}")

In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred)

0.30057897834051517

In [34]:
rf_submission = rf.predict(X_test)

In [39]:
import csv
pred = rf.predict_proba(test[['DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y', 'Year', 'Month', 'Day', 'Hour']])
submission = pd.DataFrame(columns=['Id'], data=test)
submission = pd.concat([submission, pd.DataFrame(pred, columns=["ARSON", "ASSAULT", "BAD CHECKS", "BRIBERY", "BURGLARY", "DISORDERLY CONDUCT",
"DRIVING UNDER THE INFLUENCE", "DRUG/NARCOTIC", "DRUNKENNESS", "EMBEZZLEMENT", "EXTORTION",
"FAMILY OFFENSES", "FORGERY/COUNTERFEITING", "FRAUD", "GAMBLING", "KIDNAPPING", "LARCENY/THEFT",
"LIQUOR LAWS", "LOITERING", "MISSING PERSON", "NON-CRIMINAL", "OTHER OFFENSES", "PORNOGRAPHY/OBSCENE MAT",
"PROSTITUTION", "RECOVERED VEHICLE", "ROBBERY", "RUNAWAY", "SECONDARY CODES", "SEX OFFENSES FORCIBLE",
"SEX OFFENSES NON FORCIBLE", "STOLEN PROPERTY", "SUICIDE", "SUSPICIOUS OCC", "TREA", "TRESPASS",
"VANDALISM", "VEHICLE THEFT", "WARRANTS", "WEAPON LAWS"])], axis=1)

In [40]:
submission.to_csv("rf1.csv", index=False)