<a href="https://colab.research.google.com/github/anandjs11/crimedata/blob/main/GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install PySpark
# http://spark.apache.org/docs/latest/api/python/index.html

!pip install pyspark==3.2

In [None]:
# Start the Spark session and configure it.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

# Local master with 2 worker threads, 15 GB requested for the driver.
spark = (
    SparkSession.builder
    .master("local[2]")
    .config("spark.driver.memory", "15g")
    .appName("Crimedata")
    .getOrCreate()
)
sc = spark.sparkContext

# Create an instance of SQLContext.
# Its first argument is the SparkContext (not the SparkSession); the session is
# handed over explicitly as the second argument so both refer to the objects
# created above.
sqlContext = SQLContext(sc, spark)

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset paths used below live
# underneath this mount point.
GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Import libraries and other functions
from io import StringIO
from collections import namedtuple
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from statsmodels.tsa.arima.model import ARIMA

import csv
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

In [None]:
# Root folder that holds the extracted CSV files (ukcrime data).
base_directory_path = "/content/gdrive/MyDrive/dataset/extracted"

# Glob pattern: exactly one level of sub-folders, every file whose name ends
# in "street.csv".
csv_files_pattern = f"{base_directory_path}/*/*street.csv"

# Read every matching file into one Spark DataFrame: the first row is the
# header, fields are comma-separated and column types are inferred.
csv_reader = spark.read.options(header="true", delimiter=",", inferSchema="true")
CrimeData = csv_reader.csv(csv_files_pattern)

In [None]:
from pyspark.sql.functions import col

# Keep only the rows that actually carry a Latitude value.
has_latitude = col("Latitude").isNotNull()
CrimeData = CrimeData.where(has_latitude)

In [None]:
# Discard the columns that are not used further on, then report how many
# columns remain.
unused_columns = [
    'Crime ID',
    'Falls within',
    'LSOA code',
    'LSOA name',
    'Last outcome category',
    'Context',
]
CrimeData = CrimeData.drop(*unused_columns)
print(f"{len(CrimeData.columns)} columns in the output dataframe")

In [None]:
# Tidy up the column names: replace the space with an underscore so the
# columns are easier to reference later.
column_renames = {
    'Reported by': 'Reported_by',
    'Crime type': 'Crime_type',
}
for old_name, new_name in column_renames.items():
    CrimeData = CrimeData.withColumnRenamed(old_name, new_name)

Start Neural Networks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# GRU forecast of the monthly "Violence and sexual offences" count: one model
# and one plot per police force ('Reported_by').

# getOrCreate() returns the session that is already running (the one backing
# `CrimeData`) instead of starting a second one.
spark = SparkSession.builder.appName("GRUForecasting").getOrCreate()

# Build this cell's pandas frame directly from the Spark DataFrame so the cell
# runs on a fresh kernel without depending on a frame created by another cell.
# Filtering in Spark first means only the matching rows are collected.
filtered_data = (
    CrimeData
    .filter(col('Crime_type') == 'Violence and sexual offences')
    .toPandas()
)

# Get a list of unique 'Reported_by' values
reported_by_values = filtered_data['Reported_by'].unique()

# Number of past months fed to the model for each prediction
train_window = 12
# Number of future months to forecast
forecast_steps = 6

# Perform GRU forecasting for each 'Reported_by' value
for reported_by in reported_by_values:
    reported_by_data = filtered_data[filtered_data['Reported_by'] == reported_by]

    # Count the crimes recorded in each month
    reported_by_monthly = reported_by_data.groupby('Month').size().reset_index(name='Count')

    # Convert the 'Month' column to datetime (first day of the month) and index by it
    reported_by_monthly['Month'] = pd.to_datetime(reported_by_monthly['Month'], format='%Y-%m')
    reported_by_monthly = reported_by_monthly.set_index('Month')

    # At least train_window + 1 months are required to build one training
    # sample (train_window inputs plus the month to predict).
    if len(reported_by_monthly) <= train_window:
        print(f'Skipping {reported_by}: {len(reported_by_monthly)} month(s) of data, '
              f'more than {train_window} required')
        continue

    # Normalize data to [0, 1]
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(reported_by_monthly[['Count']])

    # Sliding windows: each sample is train_window consecutive months, the
    # target is the month that follows them.
    X = np.array([
        scaled_data[i - train_window:i, 0]
        for i in range(train_window, len(scaled_data))
    ])
    y = scaled_data[train_window:, 0]

    # Reshape input to be [samples, time steps, features]
    X = X.reshape(-1, train_window, 1)

    # Create and train the GRU model (verbose=0 keeps the per-epoch log out of
    # the notebook output; there is one training run per force)
    model = Sequential()
    model.add(GRU(units=50, activation='relu', input_shape=(train_window, 1)))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X, y, epochs=50, batch_size=16, verbose=0)

    # Forecast the next months one step at a time, feeding each prediction
    # back in as the newest observation.
    last_window = scaled_data[-train_window:, 0]
    forecast = []
    for _ in range(forecast_steps):
        input_data = last_window[-train_window:].reshape(1, train_window, 1)
        predicted_value = model.predict(input_data, verbose=0)[0, 0]
        forecast.append(predicted_value)
        last_window = np.append(last_window, predicted_value)

    # Back to the original count scale
    forecast = scaler.inverse_transform(np.array(forecast).reshape(-1, 1)).flatten()

    # Forecast dates: the history is indexed by the first day of each month,
    # so the forecast uses month-start dates too, beginning with the month
    # after the last observation.
    first_forecast_month = reported_by_monthly.index[-1] + pd.DateOffset(months=1)
    forecast_dates = pd.date_range(start=first_forecast_month, periods=forecast_steps, freq='MS')

    # Plot the original data and the forecast
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(reported_by_monthly.index, reported_by_monthly['Count'], label='Original Data')
    ax.plot(forecast_dates, forecast, label='Forecast', linestyle='dashed')
    ax.set_title(f'GRU Forecasting for Crime Type: Violence and sexual offences - Reported by: {reported_by}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Count')
    ax.legend()
    plt.show()
    # One figure is created per force; close it so figures do not accumulate.
    plt.close(fig)

# The Spark session is intentionally left running: `CrimeData` is read again
# by a later cell and cannot be evaluated once the session is stopped.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from tabulate import tabulate

# GRU forecast of the monthly "Violence and sexual offences" count per police
# force ('Reported_by'), reported as a table with hold-out accuracy metrics.

# getOrCreate() returns the session that is already running (the one backing
# `CrimeData`) instead of starting a second one.
spark = SparkSession.builder.appName("GRUForecasting").getOrCreate()

# Build this cell's pandas frame directly from the Spark DataFrame so the cell
# runs on a fresh kernel without depending on a frame created by another cell.
# Filtering in Spark first means only the matching rows are collected.
filtered_data = (
    CrimeData
    .filter(col('Crime_type') == 'Violence and sexual offences')
    .toPandas()
)

# Get a list of unique 'Reported_by' values
reported_by_values = filtered_data['Reported_by'].unique()

# Number of past months fed to the model for each prediction
train_window = 12
# Number of months to forecast (and to hold out for the accuracy check)
forecast_steps = 6


def _gru_recursive_forecast(counts, window, steps):
    """Fit a one-step-ahead GRU on a monthly count series and forecast `steps` months.

    The series is min-max scaled (scaler fitted on `counts` only), turned into
    sliding windows of `window` months -> next month, and used to train a
    GRU(50) + Dense(1) model. The forecast is produced recursively: every
    prediction is appended to the input window for the next step.

    Parameters
    ----------
    counts : 1-D array-like of numbers, oldest month first; must contain more
        than `window` values so at least one training sample exists.
    window : int, number of past months per input sample.
    steps : int, number of months to forecast.

    Returns
    -------
    numpy.ndarray of length `steps`, on the original (unscaled) count scale.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(np.asarray(counts, dtype=float).reshape(-1, 1))[:, 0]

    # [samples, time steps, features]
    X = np.array([scaled[i - window:i] for i in range(window, len(scaled))]).reshape(-1, window, 1)
    y = scaled[window:]

    model = Sequential()
    model.add(GRU(units=50, activation='relu', input_shape=(window, 1)))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    # verbose=0: one (or two) training runs per force would otherwise flood the output
    model.fit(X, y, epochs=50, batch_size=16, verbose=0)

    history = scaled[-window:]
    predictions = []
    for _ in range(steps):
        next_value = model.predict(history[-window:].reshape(1, window, 1), verbose=0)[0, 0]
        predictions.append(next_value)
        history = np.append(history, next_value)

    return scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()


# Initialize a list to store forecast results
forecast_results = []

# Perform GRU forecasting for each 'Reported_by' value
for reported_by in reported_by_values:
    reported_by_data = filtered_data[filtered_data['Reported_by'] == reported_by]

    # Count the crimes recorded in each month
    reported_by_monthly = reported_by_data.groupby('Month').size().reset_index(name='Count')

    # Convert the 'Month' column to datetime and index by it
    reported_by_monthly['Month'] = pd.to_datetime(reported_by_monthly['Month'], format='%Y-%m')
    reported_by_monthly = reported_by_monthly.set_index('Month')

    counts = reported_by_monthly['Count'].to_numpy(dtype=float)

    # At least train_window + 1 months are needed to build one training sample.
    if len(counts) <= train_window:
        print(f'Skipping {reported_by}: {len(counts)} month(s) of data, '
              f'more than {train_window} required')
        continue

    # Forecast the next 6 months from the full history
    forecast = _gru_recursive_forecast(counts, train_window, forecast_steps)

    # Calculate MAE and MAPE with a hold-out back-test: the last
    # `forecast_steps` observed months are hidden, a model is trained on the
    # months before them, and its forecast is compared with the hidden values.
    # (The forecast above covers months that have no observed value yet, so it
    # cannot be scored directly.)
    mae, mape = np.nan, np.nan
    if len(counts) > train_window + forecast_steps:
        actual_values = counts[-forecast_steps:]
        backtest = _gru_recursive_forecast(counts[:-forecast_steps], train_window, forecast_steps)
        abs_errors = np.abs(backtest - actual_values)
        mae = abs_errors.mean()
        # MAPE is undefined for months with a zero count; leave those out.
        nonzero = actual_values != 0
        if nonzero.any():
            mape = (abs_errors[nonzero] / actual_values[nonzero]).mean() * 100

    # Append results to the forecast_results list
    forecast_results.append({
        'Reported_by': reported_by,
        'Forecast': forecast,
        'MAE': mae,
        'MAPE': mape
    })

# Create a DataFrame from the forecast results
forecast_df = pd.DataFrame(forecast_results)

# Display the tabular forecast results using tabulate
table = tabulate(forecast_df, headers='keys', tablefmt='psql')
print(table)

# The Spark session is intentionally left running: `CrimeData` is read again
# by a later cell and cannot be evaluated once the session is stopped.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
from tabulate import tabulate
from pyspark.sql import SparkSession

# GRU forecast of the monthly crime count per crime type for the Metropolitan
# Police Service, reported as tables (forecast + hold-out accuracy metrics).

# getOrCreate() returns the session that is already running (the one backing
# `CrimeData`) instead of starting a second one.
spark = SparkSession.builder.appName("GRUForecasting").getOrCreate()

# Convert the Spark DataFrame to a Pandas DataFrame
pandas_df = CrimeData.toPandas()

# Filter data for "Metropolitan Police Service"
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Metropolitan Police Service']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Create a list to store the forecasted results
forecast_results = []

# Create a list to store the accuracy metrics
accuracy_results = []

# Define hyperparameters
input_sequence_length = 12  # Number of months to use as input sequence
output_sequence_length = 6  # Number of months to forecast
hidden_units = 64
batch_size = 32
epochs = 50


def _gru_direct_forecast(counts, input_len, output_len, units, batch, n_epochs):
    """Fit a multi-output GRU on a monthly count series and forecast `output_len` months.

    The series is min-max scaled (scaler fitted on `counts` only) and cut into
    samples of `input_len` months -> the following `output_len` months. The
    model (GRU(units) + Dense(output_len)) predicts the whole horizon in one
    call, so the forecast is a single prediction from the last `input_len`
    months; element k of the output is the value k+1 months ahead.

    Parameters
    ----------
    counts : 1-D array-like of numbers, oldest month first; must contain at
        least `input_len + output_len` values so one training sample exists.
    input_len : int, months per input sample.
    output_len : int, months predicted per sample.
    units : int, GRU hidden units.
    batch : int, training batch size.
    n_epochs : int, training epochs.

    Returns
    -------
    numpy.ndarray of length `output_len`, on the original (unscaled) count scale.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(np.asarray(counts, dtype=float).reshape(-1, 1))[:, 0]

    n_samples = len(scaled) - input_len - output_len + 1
    # [samples, time steps, features]
    X = np.array([scaled[i:i + input_len] for i in range(n_samples)]).reshape(-1, input_len, 1)
    y = np.array([scaled[i + input_len:i + input_len + output_len] for i in range(n_samples)])

    model = Sequential([
        GRU(units, input_shape=(input_len, 1)),
        Dense(output_len)
    ])
    model.compile(optimizer=Adam(), loss='mse')
    model.fit(X, y, batch_size=batch, epochs=n_epochs, verbose=0)

    last_sequence = scaled[-input_len:].reshape(1, input_len, 1)
    predicted = model.predict(last_sequence, verbose=0)[0]
    return scaler.inverse_transform(predicted.reshape(-1, 1)).flatten()


# Perform GRU forecasting for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # Count the crimes recorded in each month
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Convert the 'Month' column to datetime and index by it
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month')

    # Raw monthly counts; scaling happens inside the helper so 'Count' keeps
    # its original scale for the accuracy metrics below.
    counts = crime_type_monthly['Count'].to_numpy(dtype=float)

    # One training sample needs input_sequence_length + output_sequence_length months.
    min_months = input_sequence_length + output_sequence_length
    if len(counts) < min_months:
        print(f'Skipping {crime_type}: {len(counts)} month(s) of data, '
              f'at least {min_months} required')
        continue

    # Forecast the next 6 months from the full history
    forecast = _gru_direct_forecast(
        counts, input_sequence_length, output_sequence_length,
        hidden_units, batch_size, epochs,
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

    # Accuracy via a hold-out back-test: hide the last output_sequence_length
    # observed months, train on the months before them and compare that
    # forecast with the hidden values (both on the original count scale).
    mae, mape = np.nan, np.nan
    if len(counts) >= min_months + output_sequence_length:
        actual_values = counts[-output_sequence_length:]
        backtest = _gru_direct_forecast(
            counts[:-output_sequence_length], input_sequence_length, output_sequence_length,
            hidden_units, batch_size, epochs,
        )
        abs_errors = np.abs(actual_values - backtest)

        # Calculate Mean Absolute Error (MAE)
        mae = np.mean(abs_errors)

        # Calculate Mean Absolute Percentage Error (MAPE); months with a zero
        # count are left out because the percentage error is undefined there.
        nonzero = actual_values != 0
        if nonzero.any():
            mape = np.mean(abs_errors[nonzero] / actual_values[nonzero]) * 100

    # Append accuracy metrics to the list
    accuracy_results.append({'Crime_Type': crime_type, 'MAE': mae, 'MAPE': mape})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Convert the list of accuracy results to a Pandas DataFrame
accuracy_df = pd.DataFrame(accuracy_results)

# Display the forecast results and accuracy metrics in tabular form
formatted_forecast_table = tabulate(forecast_df, headers='keys', tablefmt='pretty')
formatted_accuracy_table = tabulate(accuracy_df, headers='keys', tablefmt='pretty')

print("Forecast Results:")
print(formatted_forecast_table)

print("\nAccuracy Metrics:")
print(formatted_accuracy_table)

# Stop the Spark session
# NOTE(review): after this call `CrimeData` can no longer be evaluated, so this
# cell cannot be re-run without re-creating the session and re-loading the data.
spark.stop()
