In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

years = [2022, 2021, 2020]
# Number of folds
n_folds = 3
# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)



# Initialize an empty list to store DataFrames
dfs = []

# Loop through each year, read the corresponding file and append to the list
for year in years:
    file_path = f'cleaned_datasets/Cleaned_Crimes_{year}.csv'  # Adjust the file path as needed
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames in the list into one
combined_df = pd.concat(dfs, ignore_index=True)

# Parse the date column to extract day, month, and year
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df['Day'] = combined_df['Date'].dt.day
combined_df['Month'] = combined_df['Date'].dt.month
combined_df['Year'] = combined_df['Date'].dt.year

combined_df['Location Description'], _ = pd.factorize(combined_df['Location Description'])
combined_df['Primary Type'], _ = pd.factorize(combined_df['Primary Type'])

# Selecting relevant columns
features = ['Community Area', 'Location Description', 'Day', 'Month', 'Year']
target = 'Primary Type'

# Split the dataset
X = combined_df[features]
y = combined_df[target]

# Scale the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# Initialize and fit the grid search
mlp = MLPClassifier(max_iter=1000)
grid_search = GridSearchCV(mlp, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Output the best parameters and the corresponding accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Test set accuracy: ", accuracy)


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time= 5.2min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time= 6.7min
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time= 8.0min
Best parameters found:  {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
Best accuracy found:  0.29502595974303386
