In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [173]:
df_test = pd.read_csv('crime_train.csv')
if 'weapon' in df_test.columns:
    df_test['weapon'] = df_test['weapon'].fillna('Unknown')
if 'police_department' in df_test.columns:
    df_test['police_department'] = df_test['police_department'].round()
    df_test['police_department'] = df_test['police_department'].abs()
if 'case_filed' in df_test.columns:
    df_test['case_filed'] = pd.to_datetime(df_test['case_filed'], errors='coerce')
# Identify numerical columns for standardization
numerical_cols = df_test.select_dtypes(include=np.number).columns.tolist()
# Exclude 'closed', 'Unnamed: 0', 'Num' if they are in numerical_cols
cols_to_exclude = ['closed', 'Unnamed: 0', 'Num']
numerical_cols = [col for col in numerical_cols if col not in cols_to_exclude]
# Apply Z-score normalization to the numerical columns
for col in numerical_cols:
    mean = df_test[col].mean()
    std = df_test[col].std()
    if std > 0:
        df_test[col] = (df_test[col] - mean) / std
    else:
        # If std is 0, all values are the same, so normalization is not needed
        df_test[col] = df_test[col] - mean
# Encode categorical features
categorical_cols = ['city', 'crime_description', 'weapon', 'domain', 'sex']
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)
# Label encode 'closed'
if 'closed' in df_test.columns:
  df_test['closed'] = df_test['closed'].map({'No': 0, 'Yes': 1})

In [174]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

In [105]:
def compute_gradient(X, y, w, b, lambda_=0):
  m = X.shape[0]
  z = np.dot(X, w) + b
  f_wb = sigmoid(z)
  error = f_wb - y
  dw = (1/m) * np.dot(X.T, error)
  dw += (lambda_ / m) * w
  db = (1/m) * np.sum(error)
  return dw, db

In [106]:
def compute_cost(X, y, w, b, lambda_=0):
  m = X.shape[0]
  z = np.dot(X, w) + b
  f_wb = sigmoid(z)
  loss = (-1/m) * np.sum(y * np.log(f_wb) + (1 - y) * np.log(1 - f_wb))
  reg_cost = (lambda_ / (2 * m)) * np.sum(w**2)
  total_cost = loss + reg_cost
  return total_cost

In [None]:
def train_model(X, y, learning_rate, num_iterations, w=None, b=None, lambda_=0):
    m, n = X.shape
    if w is None:
        w = np.zeros(n)
    if b is None:
        b = 0
    costs = []
    for i in range(num_iterations):
        dw, db = compute_gradient(X, y, w, b, lambda_)
        w = w - learning_rate * dw
        b = b - learning_rate * db
        cost = compute_cost(X, y, w, b, lambda_)
        costs.append(cost)
        # Print cost and gradients every 100 iterations
        if i % 100 == 0:
            print(f"Cost after iteration {i}: {cost}")
            print(f"Gradients at iteration {i}: dw={dw[:5]}..., db={db}") # Print first 5 dw elements
    return w, b, costs
# Prepare data for training
# Convert boolean columns to float before converting to numpy array
X_train_np = X_train.astype(float).to_numpy()
y_train_np = y_train.to_numpy()
learning_rate = 0.000005 # Use the stable learning rate
num_iterations = 1000 # number of iterations
lambda_value = 0.1  # Small lambda value for regularization
# Train the model
w_trained, b_trained, training_costs = train_model(X_train_np, y_train_np, learning_rate, num_iterations, lambda_=lambda_value)
print("Training finished with regularization.")
print(f"Learned weights: {w_trained}")
print(f"Learned bias: {b_trained}")

In [145]:
df_test_eval = pd.read_csv('crime_test.csv')
if 'weapon' in df_test_eval.columns:
    df_test_eval['weapon'] = df_test_eval['weapon'].fillna('Unknown')
# Handles 'police_department' column (round off and taking absolute value)
if 'police_department' in df_test_eval.columns:
    df_test_eval['police_department'] = df_test_eval['police_department'].round()
    df_test_eval['police_department'] = df_test_eval['police_department'].abs()
# Convert 'case_filed' to datetime
if 'case_filed' in df_test_eval.columns:
    df_test_eval['case_filed'] = pd.to_datetime(df_test_eval['case_filed'], errors='coerce')
# Identify numerical columns for standardization
numerical_cols_test = df_test_eval.select_dtypes(include=np.number).columns.tolist()

# Exclude 'closed', 'Unnamed: 0', 'Num' if they are in numerical_cols_test
cols_to_exclude = ['closed', 'Unnamed: 0', 'Num']
numerical_cols_test = [col for col in numerical_cols_test if col not in cols_to_exclude]

for col in numerical_cols_test:
    mean_train = df_test[col].mean()
    std_train = df_test[col].std()
    if std_train > 0:
        df_test_eval[col] = (df_test_eval[col] - mean_train) / std_train
    else:
        df_test_eval[col] = df_test_eval[col] - mean_train

categorical_cols = ['city', 'crime_description', 'weapon', 'domain', 'sex']
df_test_eval = pd.get_dummies(df_test_eval, columns=categorical_cols, drop_first=True)

# Label encoding the 'closed' column
if 'closed' in df_test_eval.columns:
  df_test_eval['closed'] = df_test_eval['closed'].map({'No': 0, 'Yes': 1})

In [152]:
# Identify numerical and boolean columns
numerical_cols = df_test.select_dtypes(include=np.number).columns.tolist()
boolean_cols = df_test.select_dtypes(include='bool').columns.tolist()

# Combining them
base_features = numerical_cols + boolean_cols

# Exclude the target variable and unrequired factors
cols_to_exclude = ['closed', 'Unnamed: 0', 'Num', 'case_filed']
base_features = [col for col in base_features if col not in cols_to_exclude]

print("Base features for polynomial expansion:")
print(base_features)

Base features for polynomial expansion:
['area', 'age', 'police_department', 'city_Ahmedabad', 'city_Bangalore', 'city_Bhopal', 'city_Chennai', 'city_Delhi', 'city_Faridabad', 'city_Ghaziabad', 'city_Hyderabad', 'city_Indore', 'city_Jaipur', 'city_Kalyan', 'city_Kanpur', 'city_Kolkata', 'city_Lucknow', 'city_Ludhiana', 'city_Meerut', 'city_Mumbai', 'city_Nagpur', 'city_Nashik', 'city_Patna', 'city_Pune', 'city_Rajkot', 'city_Srinagar', 'city_Surat', 'city_Thane', 'city_Varanasi', 'city_Vasai', 'city_Visakhapatnam', 'crime_description_ASSAULT', 'crime_description_BURGLARY', 'crime_description_COUNTERFEITING', 'crime_description_CYBERCRIME', 'crime_description_DOMESTIC VIOLENCE', 'crime_description_DRUG OFFENSE', 'crime_description_EXTORTION', 'crime_description_FIREARM OFFENSE', 'crime_description_FRAUD', 'crime_description_HOMICIDE', 'crime_description_IDENTITY THEFT', 'crime_description_ILLEGAL POSSESSION', 'crime_description_KIDNAPPING', 'crime_description_PUBLIC INTOXICATION', 'crim

In [None]:
# Define polynomial degree
degree = 2
# Creating a copy of the training DataFrame to add the polynomial features
df_train_poly = df_test.copy()
# Iterate through each base feature and create polynomial features
for feature in base_features:
    # Ensure the feature is treated as numeric for calculation
    df_train_poly[feature] = df_train_poly[feature].astype(float)
    for d in range(2, degree + 1):
        new_feature_name = f'{feature}_power_{d}'
        df_train_poly[new_feature_name] = df_train_poly[feature] ** d

In [153]:
# Create interaction features for unique pairs of base features
for i in range(len(base_features)):
    for j in range(i + 1, len(base_features)):
        feature1 = base_features[i]
        feature2 = base_features[j]
        new_feature_name = f'{feature1}_x_{feature2}'
        df_train_poly[new_feature_name] = df_train_poly[feature1] * df_train_poly[feature2]

In [154]:
# Drop the columns 'closed', 'Unnamed: 0', 'Num', and 'case_filed'
X_train = df_train_poly.drop(['closed', 'Unnamed: 0', 'Num', 'case_filed'], axis=1)

# Convert the X_train DataFrame to a NumPy array with a float data type
X_train_np = X_train.astype(float).to_numpy()

# Select the 'closed' column to create the target variable vector
y_train = df_train_poly['closed']

# Convert the y_train Series to a NumPy array
y_train_np = y_train.to_numpy()

print("Shape of X_train_np:", X_train_np.shape)
print("Shape of y_train_np:", y_train_np.shape)

Shape of X_train_np: (22489, 2015)
Shape of y_train_np: (22489,)


In [None]:
learning_rate = 0.00005
num_iterations = 1000 # number of iterations
lambda_value = 0.1  # Small lambda value for regularization

# Train the model using the expanded feature matrix X_train_np and target vector y_train_np
w_trained, b_trained, training_costs = train_model(X_train_np, y_train_np, learning_rate, num_iterations, lambda_=lambda_value)

# Print completion message and learned parameters
print("Training finished with regularization on expanded features.")
# Print only a portion of the weights as they can be numerous
print(f"Learned weights (first 10): {w_trained[:10]}")
print(f"Learned bias: {b_trained}")

In [None]:
# Prepare test data for prediction
X_test_poly_np = df_test_eval_poly.to_numpy()
# Create a copy of the test DataFrame to add polynomial features
df_test_eval_poly = df_test_eval.copy()

# Drop non-feature columns from the test data
df_test_eval_poly = df_test_eval_poly.drop(['closed', 'Unnamed: 0', 'Num', 'case_filed'], axis=1)

# Identify the base features in the test data for polynomial expansion
# These should be all columns now in df_test_eval_poly
base_features_test = df_test_eval_poly.columns.tolist()

# Ensure all base features are treated as numeric for calculation
for feature in base_features_test:
    df_test_eval_poly[feature] = df_test_eval_poly[feature].astype(float)

# Generating polynomial features for test data
degree = 2
for feature in base_features_test:
    for d in range(2, degree + 1):
        new_feature_name = f'{feature}_power_{d}'
        # Add the new polynomial feature to the test DataFrame
        df_test_eval_poly[new_feature_name] = df_test_eval_poly[feature] ** d

# Generate interaction terms for test datag
for i in range(len(base_features_test)):
    for j in range(i + 1, len(base_features_test)):
        feature1 = base_features_test[i]
        feature2 = base_features_test[j]
        new_feature_name = f'{feature1}_x_{feature2}'
        # Adding the new interaction feature to the test DataFrame
        df_test_eval_poly[new_feature_name] = df_test_eval_poly[feature1] * df_test_eval_poly[feature2]

In [172]:
# Make predictions on the expanded test data
# Use the weights and bias trained on the expanded features (from cell d98d0dff)
test_predictions_poly = predict(X_test_poly_np, w_trained, b_trained)

print("Predictions on expanded test data:")
print(test_predictions_poly)

Predictions on expanded test data:
[1 1 1 ... 1 0 1]


In [169]:
from sklearn.metrics import accuracy_score

# Get the actual values from the test set
y_test_actual = df_test_eval['closed'].to_numpy()

# Calculate accuracy
accuracy_poly = accuracy_score(y_test_actual, test_predictions_poly)

print(f"Model Accuracy on Expanded Test Data: {accuracy_poly:.4f}")

Model Accuracy on Expanded Test Data: 0.5045
