In [11]:
import pandas as pd
from scipy.stats import chi2_contingency

# Define the file path (Replace with your actual file path)
file_path = r"C:\Users\Abhimanyu\Desktop\Github\DSCapstone\ML\content\combined_data.csv"

# Load the dataset
df1 = pd.read_csv(file_path)

# Identify categorical columns
categorical_columns = df1.select_dtypes(include=['object', 'category']).columns

# Convert categorical columns to 'category' type if not already
df1[categorical_columns] = df1[categorical_columns].astype('category')
df1['OFFENSE'] = df1['OFFENSE'].astype('category')  # Ensure 'OFFENSE' is categorical

# Perform chi-square test for 'OFFENSE' against all other categorical variables
results = []
for col in categorical_columns:
    if col != 'OFFENSE':  # Exclude 'OFFENSE' vs itself
        # Create contingency table
        contingency_table = pd.crosstab(df1['OFFENSE'], df1[col])
        
        # Perform Chi-Square Test
        chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
        
        # Store results
        results.append({
            'Variable': col,
            'Chi-Square Statistic': chi2_stat,
            'P-Value': p_value,
            'Degrees of Freedom': dof
        })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

In [13]:
print(results_df)


          Variable  Chi-Square Statistic  P-Value  Degrees of Freedom
0            SHIFT          38444.326074      0.0                  14
1           METHOD         352127.397552      0.0                  14
2            BLOCK         303698.863672      0.0              139944
3              ANC          60861.326221      0.0                 322
4  VOTING_PRECINCT          70108.343402      0.0                1001


In [None]:
from scipy.stats import kruskal

# List to store significant results
significant_features = []

# Perform Kruskal-Wallis H test for each numerical column
for col in ['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'WARD', 'DISTRICT', 'PSA', 'LATITUDE', 'LONGITUDE']:
    groups = [group[col].values for name, group in df1.groupby('OFFENSE')]
    stat, p_value = kruskal(*groups)

    if p_value < 0.05:  # Check if the p-value is less than 0.05
        significant_features.append((col, stat, p_value))

# Print significant features
print("Significant numerical features (p < 0.05):")
for feature in significant_features:
    print(f"Feature: {feature[0]}, Statistic: {feature[1]}, p-value: {feature[2]}")

Significant numerical features (p < 0.05):
Feature: YEAR, Statistic: 12869.078061260632, p-value: 0.0
Feature: MONTH, Statistic: 115.93579334134652, p-value: 5.371068151217956e-22
Feature: DAY, Statistic: 14.199319567915548, p-value: 0.04774772004140339
Feature: HOUR, Statistic: 13682.277695080911, p-value: 0.0
Feature: MINUTE, Statistic: 6948.321434582679, p-value: 0.0
Feature: WARD, Statistic: 32356.142797190932, p-value: 0.0
Feature: DISTRICT, Statistic: 38067.67156243931, p-value: 0.0
Feature: PSA, Statistic: 38478.562832166775, p-value: 0.0
Feature: LATITUDE, Statistic: 12610.926583294447, p-value: 0.0
Feature: LONGITUDE, Statistic: 31276.422230764456, p-value: 0.0
