# CS 584 Project

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

# Years whose cleaned crime CSVs are loaded and merged into one dataset.
years = [2022, 2021, 2020]

# Read one DataFrame per year (adjust the file path as needed).
dfs = [
    pd.read_csv(f'/cleaned_datasets/Cleaned_Crimes_{year}.csv')
    for year in years
]

# Concatenate all yearly DataFrames into one, renumbering the index so row
# labels are unique across years.
combined_df = pd.concat(dfs, ignore_index=True)

# Drop all rows with any missing values.
# NOTE: this must start from combined_df. Previously dropna() was applied to
# the loop variable `df`, which only held the last file read (2020), so the
# other years were silently excluded from training and evaluation.
df = combined_df.dropna()

# Convert 'Arrest' and 'Domestic' to numeric if they're not already.
# errors='coerce' replaces values that cannot be parsed with NaN instead of
# raising.
df['Arrest'] = pd.to_numeric(df['Arrest'], errors='coerce')
df['Domestic'] = pd.to_numeric(df['Domestic'], errors='coerce')

# Coercion can introduce new NaN values after the earlier missing-value
# cleanup; drop those rows so no NaN reaches the scaler or the estimators.
# When nothing was coerced this is a no-op.
df = df.dropna(subset=['Arrest', 'Domestic'])

# Encode the categorical columns as integer codes. The fitted encoders are
# kept in module-level names so the codes can be mapped back to the original
# labels later via inverse_transform.
le_primary_type = LabelEncoder()
df['Primary Type'] = le_primary_type.fit_transform(df['Primary Type'])

le_location_desc = LabelEncoder()
df['Location Description'] = le_location_desc.fit_transform(df['Location Description'])

# Columns used as model inputs, and the column to predict.
features = ['Location Description', 'Arrest', 'Domestic', 'Beat', 'District',
            'Ward', 'Community Area', 'X Coordinate', 'Y Coordinate', 'Latitude', 'Longitude']
target = 'Primary Type'

X = df[features]
y = df[target]

# Split BEFORE scaling: hold out 20% of the rows for testing. The fixed
# random_state keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features to help with convergence. The scaler is fitted on the
# training rows only so that test-set minima/maxima do not leak into the
# preprocessing; the test rows are then transformed with the training
# statistics. As a consequence, scaled test values may fall slightly outside
# [0, 1] when a test row exceeds the range seen during training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Classifiers to train, keyed by the label printed in the report.
# Commented-out entries are alternatives that can be re-enabled as needed.
models = {
    #'Logistic Regression': LogisticRegression(max_iter=10000),
    #'Decision Tree': DecisionTreeClassifier(),
    #'Random Forest': RandomForestClassifier(),
    #'Gaussian Naïve Bayes': GaussianNB(),
    #'K-nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    #'Neural Network': MLPClassifier(max_iter=1000)
}

# Fit each enabled model on the training split, predict on the test split,
# and print its accuracy plus a per-class classification report.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"Model: {name}")

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # zero_division=0 reports 0 (without warnings) for classes that were
    # never predicted.
    report = classification_report(y_test, predictions, zero_division=0)
    print("Classification Report:\n", report)

    print("-" * 55)


ModuleNotFoundError: No module named 'pandas'