<a href="https://colab.research.google.com/github/anandjs11/crimedata/blob/main/LRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install PySpark
# http://spark.apache.org/docs/latest/api/python/index.html

!pip install pyspark==3.2

In [None]:
# Start the Spark session and configure it.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

# Local master with two worker threads and a 15 GB driver heap.
spark = (
    SparkSession.builder
    .master("local[2]")
    .config("spark.driver.memory", "15g")
    .appName("Crimedata")
    .getOrCreate()
)
sc = spark.sparkContext

# Legacy SQLContext handle. The constructor expects the SparkContext first and
# the SparkSession second; passing the session as the first argument (as this
# cell used to) stores a SparkSession where a SparkContext is expected.
sqlContext = SQLContext(sc, spark)

In [None]:
# Mount Google Drive at /content/gdrive (Colab only).
from google.colab import drive

drive.mount(mountpoint="/content/gdrive")

In [None]:
# Import libraries and other functions
from io import StringIO
from collections import namedtuple
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from statsmodels.tsa.arima.model import ARIMA

import csv
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

In [None]:
# Root folder that holds the extracted CSV files (ukcrime data).
base_directory_path = "/content/gdrive/MyDrive/dataset/extracted"

# Glob matching every "*street.csv" file exactly one folder below the root.
csv_files_pattern = f"{base_directory_path}/*/*street.csv"

# Load all matching files into one DataFrame: the first row of each file is the
# header, fields are comma-separated, and column types are inferred from the data.
CrimeData = spark.read.csv(
    csv_files_pattern,
    header=True,
    sep=",",
    inferSchema=True,
)

In [None]:
from pyspark.sql.functions import col

# Keep only the records whose Latitude is present.
has_latitude = col("Latitude").isNotNull()
CrimeData = CrimeData.where(has_latitude)

In [None]:
# Remove the columns listed below, then report how many columns remain.
columns_to_drop = [
    'Crime ID',
    'Falls within',
    'LSOA code',
    'LSOA name',
    'Last outcome category',
    'Context',
]
CrimeData = CrimeData.drop(*columns_to_drop)
print(f"{len(CrimeData.columns)} columns in the output dataframe")

In [None]:
# Tidy up the column names: replace the space with an underscore so the
# columns are easier to reference.
CrimeData = (
    CrimeData
    .withColumnRenamed('Reported by', 'Reported_by')
    .withColumnRenamed('Crime type', 'Crime_type')
)

Start Regression

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Linear-regression forecast of the monthly "Violence and sexual offences"
# count: one model and one figure per police force ('Reported_by').
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it, and stopping it would break any later cell that
# still needs the Spark DataFrame.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6  # number of months to predict beyond the last observation

# Filter data for "Violence and sexual offences" across all 'Reported_by' values
filtered_data = pandas_df[pandas_df['Crime_type'] == 'Violence and sexual offences']

# Get a list of unique 'Reported_by' values
reported_by_values = filtered_data['Reported_by'].unique()

# Perform Linear Regression forecasting for each 'Reported_by' value
for reported_by in reported_by_values:
    reported_by_data = filtered_data[filtered_data['Reported_by'] == reported_by]

    # One row per month holding the number of recorded crimes
    reported_by_monthly = reported_by_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    reported_by_monthly['Month'] = pd.to_datetime(reported_by_monthly['Month'], format='%Y-%m')
    reported_by_monthly = reported_by_monthly.set_index('Month').sort_index()

    # A trend line needs at least two observations
    if len(reported_by_monthly) < 2:
        continue

    # Feature: the month timestamp as a 64-bit integer (explicit width so the
    # cast does not depend on the platform's default int size)
    X = reported_by_monthly.index.astype('int64').to_numpy().reshape(-1, 1)
    y = reported_by_monthly['Count'].to_numpy()

    # Create and train the Linear Regression model
    model = LinearRegression()
    model.fit(X, y)

    # Forecast the months AFTER the last observation. The training stamps are
    # month starts, so the horizon uses month starts too ('MS'), beginning one
    # month past the last observed one.
    last_date = reported_by_monthly.index[-1]
    forecast_dates = pd.date_range(
        start=last_date + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )
    forecast_dates_int = forecast_dates.astype('int64').to_numpy().reshape(-1, 1)
    forecast = model.predict(forecast_dates_int)

    # Plot the original data and the forecast
    plt.figure(figsize=(10, 6))
    plt.plot(reported_by_monthly.index, reported_by_monthly['Count'], label='Original Data')
    plt.plot(forecast_dates, forecast, label='Forecast', linestyle='dashed')
    plt.title(f'Linear Regression Forecasting for Crime Type: Violence and sexual offences - Reported by: {reported_by}')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from tabulate import tabulate

# Linear-regression forecast of the monthly "Violence and sexual offences"
# count per police force, reported as a table with back-tested MAE / MAPE.
#
# The error metrics come from a hold-out back-test: the model is fitted without
# the last `forecast_steps` months and scored on exactly those months. (Scoring
# a forecast of FUTURE months against the last observed months, as this cell
# used to, compares values that belong to different dates.)
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6  # months held out for scoring and months predicted ahead

# Filter data for "Violence and sexual offences" across all 'Reported_by' values
filtered_data = pandas_df[pandas_df['Crime_type'] == 'Violence and sexual offences']

# Get a list of unique 'Reported_by' values
reported_by_values = filtered_data['Reported_by'].unique()

forecast_results = []

# Perform Linear Regression forecasting for each 'Reported_by' value
for reported_by in reported_by_values:
    reported_by_data = filtered_data[filtered_data['Reported_by'] == reported_by]

    # One row per month holding the number of recorded crimes
    reported_by_monthly = reported_by_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    reported_by_monthly['Month'] = pd.to_datetime(reported_by_monthly['Month'], format='%Y-%m')
    reported_by_monthly = reported_by_monthly.set_index('Month').sort_index()

    # The back-test needs the hold-out months plus at least two training months;
    # forces with a shorter history are left out of the table.
    if len(reported_by_monthly) < forecast_steps + 2:
        continue

    # Feature: the month timestamp as a 64-bit integer
    X = reported_by_monthly.index.astype('int64').to_numpy().reshape(-1, 1)
    y = reported_by_monthly['Count'].to_numpy()

    # Back-test: fit on everything except the last `forecast_steps` months,
    # then score the predictions for those held-out months.
    backtest_model = LinearRegression()
    backtest_model.fit(X[:-forecast_steps], y[:-forecast_steps])
    actual_values = y[-forecast_steps:]
    backtest_prediction = backtest_model.predict(X[-forecast_steps:])
    mae = mean_absolute_error(actual_values, backtest_prediction)
    mape = mean_absolute_percentage_error(actual_values, backtest_prediction) * 100

    # Final model: fit on the full history and forecast the months that follow
    # the last observation (month-start stamps, matching the training data).
    model = LinearRegression()
    model.fit(X, y)

    last_date = reported_by_monthly.index[-1]
    forecast_dates = pd.date_range(
        start=last_date + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )
    forecast_dates_int = forecast_dates.astype('int64').to_numpy().reshape(-1, 1)
    forecast = model.predict(forecast_dates_int)

    forecast_results.append({
        'Reported_by': reported_by,
        'Forecast': forecast,
        'MAE': mae,
        'MAPE': mape
    })

# Create a DataFrame from the forecast results
forecast_df = pd.DataFrame(forecast_results)

# Display the tabular forecast results
table = tabulate(forecast_df, headers='keys', tablefmt='psql')
print(table)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, drawn as one line
# per crime type.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "West Yorkshire Police"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'West Yorkshire Police']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Forecast values per crime type, and the dates each forecast refers to
forecast_results = []
forecast_dates_by_type = {}

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Dates for this crime type's forecast (month starts after its own last month)
    forecast_dates_by_type[crime_type] = pd.date_range(
        start=recent_months.index[-1] + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Plot the forecasted results as a time series
plt.figure(figsize=(12, 6))
for index, row in forecast_df.iterrows():
    forecast_dates = forecast_dates_by_type[row['Crime_Type']]
    plt.plot(forecast_dates, row['Forecast'], label=row['Crime_Type'])

plt.xlabel('Month')
plt.ylabel('Forecasted Count')
plt.title('Crime Type Forecast Time Series')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from tabulate import tabulate
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# ARIMA(1, 1, 1) forecast per crime type for the Metropolitan Police Service,
# printed as a table together with a back-tested MSE.
#
# The MSE comes from a hold-out back-test: a model fitted without the last
# `forecast_steps` months is scored on exactly those months. (Scoring the
# forecast of FUTURE months against the last observed months, as this cell used
# to, compares values that belong to different dates.)
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6  # months held out for scoring and months predicted ahead

# Filter data for "Metropolitan Police Service"
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Metropolitan Police Service']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Create a list to store the forecasted results
forecast_results = []

# Perform ARIMA forecasting for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    p, d, q = 1, 1, 1
    monthly_counts = crime_type_monthly['Count']

    # Back-test only when more months remain for fitting than are held out;
    # otherwise the error is reported as NaN rather than a misleading number.
    if len(monthly_counts) > 2 * forecast_steps:
        training_counts = monthly_counts.iloc[:-forecast_steps]
        holdout_counts = monthly_counts.iloc[-forecast_steps:]
        backtest_forecast = ARIMA(training_counts, order=(p, d, q)).fit().forecast(steps=forecast_steps)
        # Compare by position: both sides cover the same held-out months in order
        mse = mean_squared_error(holdout_counts.to_numpy(), np.asarray(backtest_forecast))
    else:
        mse = np.nan

    # Final model on the full history
    model = ARIMA(monthly_counts, order=(p, d, q))
    results = model.fit()

    # Forecast the next `forecast_steps` months
    forecast = results.forecast(steps=forecast_steps)

    # Append forecasted values and MSE to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast), 'MSE': mse})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Display the forecast results in tabular form
formatted_table = tabulate(forecast_df, headers='keys', tablefmt='pretty')

print(formatted_table)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, drawn as one line
# per crime type.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Metropolitan Police Service"
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Metropolitan Police Service']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Forecast values per crime type, and the dates each forecast refers to
forecast_results = []
forecast_dates_by_type = {}

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Dates for this crime type's forecast (month starts after its own last month)
    forecast_dates_by_type[crime_type] = pd.date_range(
        start=recent_months.index[-1] + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Plot the forecasted results as a time series
plt.figure(figsize=(12, 6))
for index, row in forecast_df.iterrows():
    forecast_dates = forecast_dates_by_type[row['Crime_Type']]
    plt.plot(forecast_dates, row['Forecast'], label=row['Crime_Type'])

plt.xlabel('Month')
plt.ylabel('Forecasted Count')
plt.title('Crime Type Forecast Time Series')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from tabulate import tabulate
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, printed as a table.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Thames Valley Police"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Thames Valley Police']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Create a list to store the forecasted results
forecast_results = []

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Display the forecast results in tabular form
formatted_table = tabulate(forecast_df, headers='keys', tablefmt='pretty')

print(formatted_table)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, drawn as one line
# per crime type.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Thames Valley Police"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Thames Valley Police']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Forecast values per crime type, and the dates each forecast refers to
forecast_results = []
forecast_dates_by_type = {}

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Dates for this crime type's forecast (month starts after its own last month)
    forecast_dates_by_type[crime_type] = pd.date_range(
        start=recent_months.index[-1] + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Plot the forecasted results as a time series
plt.figure(figsize=(12, 6))
for index, row in forecast_df.iterrows():
    forecast_dates = forecast_dates_by_type[row['Crime_Type']]
    plt.plot(forecast_dates, row['Forecast'], label=row['Crime_Type'])

plt.xlabel('Month')
plt.ylabel('Forecasted Count')
plt.title('Crime Type Forecast Time Series')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from tabulate import tabulate
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, printed as a table.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Avon and Somerset Constabulary"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Avon and Somerset Constabulary']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Create a list to store the forecasted results
forecast_results = []

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Display the forecast results in tabular form
formatted_table = tabulate(forecast_df, headers='keys', tablefmt='pretty')

print(formatted_table)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, drawn as one line
# per crime type.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Avon and Somerset Constabulary"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Avon and Somerset Constabulary']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Forecast values per crime type, and the dates each forecast refers to
forecast_results = []
forecast_dates_by_type = {}

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Dates for this crime type's forecast (month starts after its own last month)
    forecast_dates_by_type[crime_type] = pd.date_range(
        start=recent_months.index[-1] + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Plot the forecasted results as a time series
plt.figure(figsize=(12, 6))
for index, row in forecast_df.iterrows():
    forecast_dates = forecast_dates_by_type[row['Crime_Type']]
    plt.plot(forecast_dates, row['Forecast'], label=row['Crime_Type'])

plt.xlabel('Month')
plt.ylabel('Forecasted Count')
plt.title('Crime Type Forecast Time Series')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from tabulate import tabulate
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, printed as a table.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.
#
# The Spark session is deliberately NOT stopped at the end of this cell:
# `CrimeData` is backed by it.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Cambridgeshire Constabulary"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Cambridgeshire Constabulary']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Create a list to store the forecasted results
forecast_results = []

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Display the forecast results in tabular form
formatted_table = tabulate(forecast_df, headers='keys', tablefmt='pretty')

print(formatted_table)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Per-crime-type linear trend forecast for one police force, drawn as one line
# per crime type.
#
# For every crime type the trend is fitted on the most recent
# `training_window` observed months and extrapolated over the following
# `forecast_steps` months.

# The forecasting cells read a pandas copy of the crime records. Create it here
# if no earlier cell has, collecting only the three columns the forecasts use.
if "pandas_df" not in globals():
    pandas_df = CrimeData.select("Month", "Reported_by", "Crime_type").toPandas()

forecast_steps = 6   # months to predict
training_window = 6  # most recent observed months used to fit the trend

# Filter data for "Cambridgeshire Constabulary"
# (the variable name is kept for compatibility even though the force differs)
metropolitan_data = pandas_df[pandas_df['Reported_by'] == 'Cambridgeshire Constabulary']

# Get a list of unique crime types
crime_types = metropolitan_data['Crime_type'].unique()

# Forecast values per crime type, and the dates each forecast refers to
forecast_results = []
forecast_dates_by_type = {}

# Perform linear regression for each crime type
for crime_type in crime_types:
    crime_type_data = metropolitan_data[metropolitan_data['Crime_type'] == crime_type]

    # One row per month holding the number of recorded crimes
    crime_type_monthly = crime_type_data.groupby('Month').size().reset_index(name='Count')

    # Index the counts by the first day of each month
    crime_type_monthly['Month'] = pd.to_datetime(crime_type_monthly['Month'], format='%Y-%m')
    crime_type_monthly = crime_type_monthly.set_index('Month').sort_index()

    # Training data: the trailing window. A trend line needs two points, so
    # crime types with a single observed month are skipped.
    recent_months = crime_type_monthly.tail(training_window)
    if len(recent_months) < 2:
        continue

    # Feature: calendar month number (year * 12 + month). Using the calendar
    # position keeps the true spacing when months are missing from the data.
    month_number = (recent_months.index.year * 12 + recent_months.index.month).to_numpy()
    X = month_number.reshape(-1, 1)
    y = recent_months['Count'].to_numpy()

    # Fit on observed months only
    model = LinearRegression()
    model.fit(X, y)

    # Predict the months that follow the last observed one
    future_X = (month_number[-1] + np.arange(1, forecast_steps + 1)).reshape(-1, 1)
    forecast = model.predict(future_X)

    # Dates for this crime type's forecast (month starts after its own last month)
    forecast_dates_by_type[crime_type] = pd.date_range(
        start=recent_months.index[-1] + pd.offsets.MonthBegin(1),
        periods=forecast_steps,
        freq='MS',
    )

    # Append forecasted values to the list
    forecast_results.append({'Crime_Type': crime_type, 'Forecast': list(forecast)})

# Convert the list of forecast results to a Pandas DataFrame
forecast_df = pd.DataFrame(forecast_results)

# Plot the forecasted results as a time series
plt.figure(figsize=(12, 6))
for index, row in forecast_df.iterrows():
    forecast_dates = forecast_dates_by_type[row['Crime_Type']]
    plt.plot(forecast_dates, row['Forecast'], label=row['Crime_Type'])

plt.xlabel('Month')
plt.ylabel('Forecasted Count')
plt.title('Crime Type Forecast Time Series')
plt.legend()
plt.grid(True)
plt.show()