A. Crime Prediction
Objective: Predict the expected number of crimes (Total_Crimes) in each district based on historical data.

Steps:
Data Preparation:

Select features like district_name, state_name, and crime categories.
Encode categorical features using one-hot encoding or label encoding.
Normalize numerical features if required.
Model Training:

Train a regression model (e.g., Linear Regression or Random Forest).
Use Total_Crimes (or a similar column) as the target variable.
Evaluation:

Evaluate the model using metrics like Mean Absolute Error (MAE) and Root Mean Square Error (RMSE).

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the district-wise cyber-crime CSV into the notebook's working DataFrame
data = pd.read_csv(filepath_or_buffer='districtwise-cyber-crimes-2017-onwards.csv')


In [18]:
# Feature selection: the model only sees the two location columns.
features = ['state_name', 'district_name']
target = 'Total_Crimes'

# Every column from the 4th onwards is treated as a crime count (adjust the
# slice as needed). The derived total is skipped so that re-running this cell
# does not add the previous total into the new one.
crime_columns = [col for col in data.columns[3:] if col != target]

# Assign by column label instead of ``data.iloc[:, 3:] = ...``: an iloc
# assignment writes into the existing columns in place, so object-dtype
# columns would stay object dtype even after the numeric conversion.
# Unparseable entries become NaN (errors='coerce').
data[crime_columns] = data[crime_columns].apply(pd.to_numeric, errors='coerce')

# Row-wise total; NaNs are skipped by ``sum``.
data[target] = data[crime_columns].sum(axis=1)

In [19]:
# One-hot encode the categorical predictors; drop_first removes one dummy
# level per encoded column
data_encoded = pd.get_dummies(data=data[features], drop_first=True)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    data_encoded,
    data[target],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit a 100-tree random forest on the training split (fit() returns the estimator)
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

B. Clustering
Objective: Group districts with similar crime patterns.
Steps:
Data Preparation:

Normalize the crime data for fair comparison.
Use the selected crime categories as features.
Clustering:

Apply K-Means or DBSCAN.
Determine the optimal number of clusters using the elbow method or silhouette score.
Visualization:

Use scatter plots or heatmaps to visualize clusters.

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd

# Build the clustering feature matrix. 'crime_features' was previously assumed
# to exist but was never defined, so this cell failed with a NameError.
# NOTE(review): as elsewhere in this notebook, every column from the 4th
# onwards is assumed to be a crime count -- adjust the slice if the CSV layout
# differs. Columns derived by other cells are excluded so they do not leak
# into the clustering input.
derived_columns = {'Total_Crimes', 'Cluster', 'mean_crimes', 'std_crimes', 'anomaly'}
crime_columns = [col for col in data.columns[3:] if col not in derived_columns]
crime_features = data[crime_columns].apply(pd.to_numeric, errors='coerce')

# Handle missing values by replacing them with the column mean
imputer = SimpleImputer(strategy='mean')
crime_features_imputed = imputer.fit_transform(crime_features)

# Normalize the crime data so every category is on a comparable scale
scaler = StandardScaler()
crime_features_scaled = scaler.fit_transform(crime_features_imputed)

# Function to determine the optimal number of clusters using silhouette score
def find_optimal_clusters(data, max_clusters=10):
    """Score K-Means clusterings of ``data`` for 2..max_clusters clusters.

    For each candidate cluster count, a ``KMeans`` model (``random_state=42``)
    is fitted on ``data`` and the resulting labels are scored with
    ``silhouette_score``. Each score is printed as soon as it is computed.

    Args:
        data: Feature matrix, passed unchanged to ``KMeans.fit_predict`` and
            ``silhouette_score``.
        max_clusters: Largest cluster count to try (inclusive).

    Returns:
        List of silhouette scores; position ``i`` holds the score for
        ``i + 2`` clusters. Empty when ``max_clusters`` is below 2.
    """
    def _score(n_clusters):
        # Fit, label and score a single candidate cluster count.
        labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data)
        silhouette_avg = silhouette_score(data, labels)
        print(f'Number of clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.4f}')
        return silhouette_avg

    return [_score(candidate) for candidate in range(2, max_clusters + 1)]

# Score every candidate cluster count from 2 up to max_clusters
max_clusters = 10  # You can adjust this value based on your needs
cluster_counts = range(2, max_clusters + 1)
silhouette_scores = find_optimal_clusters(crime_features_scaled, max_clusters)

# Line plot of the silhouette score for each candidate count
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, silhouette_scores, marker='o')
plt.xticks(cluster_counts)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Different Numbers of Clusters')
plt.grid()
plt.show()

# Choose the count with the highest silhouette score (the first one on ties);
# position i in the score list corresponds to cluster_counts[i] == i + 2
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_n_clusters = cluster_counts[best_position]

# Refit K-Means with the chosen count and store the label of every row
kmeans_final = KMeans(n_clusters=optimal_n_clusters, random_state=42)
clusters_final = kmeans_final.fit_predict(crime_features_scaled)
data['Cluster'] = clusters_final

# Scatter plot of the first two scaled features, coloured by cluster label
plt.figure(figsize=(10, 6))
plt.scatter(
    crime_features_scaled[:, 0],
    crime_features_scaled[:, 1],
    c=clusters_final,
    cmap='viridis',
    alpha=0.6,
)
plt.title(f'Clusters of Districts Based on Crime Patterns (n_clusters={optimal_n_clusters})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster')
plt.show()


C. Anomaly Detection
Objective: Detect districts with unusual crime spikes.
Steps:
Calculate Statistics:

Compute mean and standard deviation of crimes for each district.
Detect Anomalies:

Use algorithms like Isolation Forest or Local Outlier Factor.
Highlight Results:

Present anomalies in tables or maps.

In [None]:
from sklearn.ensemble import IsolationForest

# A district is identified by (state, district). District names are not
# guaranteed to be unique across states, so grouping on the name alone could
# pool unrelated districts into one mean/std.
group_keys = ['state_name', 'district_name']

# Ensure the merge keys have the same type in both dataframes
data[group_keys] = data[group_keys].astype(str)

# Drop statistics left over from an earlier run of this cell; otherwise the
# merge below would create suffixed duplicates (mean_crimes_x / mean_crimes_y)
data = data.drop(columns=['mean_crimes', 'std_crimes'], errors='ignore')

# Compute mean and standard deviation of crimes for each district.
# std_crimes is NaN for a district that has only a single row.
crime_stats = (
    data.groupby(group_keys)['Total_Crimes']
    .agg(mean_crimes='mean', std_crimes='std')
    .reset_index()
)

# Merge the statistics back onto every row of the original data
data = data.merge(crime_stats, on=group_keys, how='left')

# Fit an Isolation Forest on the total crime count alone; fit_predict labels
# each row 1 (inlier) or -1 (outlier), with 5% contamination assumed
iso_forest = IsolationForest(random_state=42, contamination=0.05)
total_crimes_frame = data[['Total_Crimes']]
data['anomaly'] = iso_forest.fit_predict(total_crimes_frame)

# Keep only the rows flagged as outliers
is_outlier = data['anomaly'] == -1
anomalies = data[is_outlier]

# Present anomalies in a table
report_columns = ['district_name', 'Total_Crimes', 'mean_crimes', 'std_crimes']
print(anomalies[report_columns])

Temp Code ChatGPT


Let's proceed with A. Crime Prediction first and implement it step by step. After that, we can move on to clustering and anomaly detection. Here's how we'll go about it:

Step 1: Crime Prediction
We'll train a model to predict Total_Crimes from the state and district of each record; the crime-category columns are summed to form the target rather than used as features.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the dataset
data = pd.read_csv('districtwise-cyber-crimes-2017-onwards.csv')

# Select features and target
features = ['state_name', 'district_name']  # Update columns based on your data
target = 'Total_Crimes'

# Data preparation
# Every column from the 4th onwards is treated as a crime count (adjust the
# slice as needed); a column already named like the target is left out so it
# is never summed into itself.
crime_columns = [col for col in data.columns[3:] if col != target]

# Assign by column label instead of ``data.iloc[:, 3:] = ...``: an iloc
# assignment writes into the existing columns in place, so object-dtype
# columns would stay object dtype even after the numeric conversion.
# Unparseable entries become NaN (errors='coerce').
data[crime_columns] = data[crime_columns].apply(pd.to_numeric, errors='coerce')

# Combine the crime-count columns into a row-wise total (NaNs are skipped)
data[target] = data[crime_columns].sum(axis=1)

# Encode categorical variables
data_encoded = pd.get_dummies(data[features], drop_first=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_encoded, data[target], test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Square Error (RMSE): {rmse}")

# Optional: Save the model for future use
import joblib
joblib.dump(model, 'crime_prediction_model.pkl')


In [None]:
# Re-read the raw CSV so the inspection reflects the file as stored on disk
data = pd.read_csv('districtwise-cyber-crimes-2017-onwards.csv')

# First few rows
print("Dataset Preview:")
preview = data.head()
print(preview)

# Column names with their inferred dtypes
print("\nColumn Names and Data Types:")
print(data.dtypes)

# Count of missing values per column
print("\nMissing Values in the Dataset:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Numeric columns (assumed to be crime counts)
print("\nNumeric Columns:")
numeric_cols = data.select_dtypes(include=['number']).columns
print(numeric_cols)

# Report, for each expected categorical column, whether it is present
required_columns = ['state_name', 'district_name', 'cyber_crime_category']
for col in required_columns:
    if col not in data.columns:
        print(f"Column '{col}' is missing!")
        continue
    print(f"Column '{col}' exists in the dataset.")
