# Notebook cell In [3]:
import pandas as pd
import numpy as np
import geopandas as gpd
import folium
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.model_selection import train_test_split

def load_clean_data(file_name):
    """
    Load the crime data from the CSV and return a cleaned DataFrame.

    The literal string '(null)' is read as a missing value. The raw frame is
    summarised with ``explore_data`` and then cleaned:

    * rows missing any of the required location, time, borough or offence
      fields are dropped;
    * columns that are not used for later exploration are removed;
    * missing victim / location descriptors are filled with 'UNKNOWN';
    * abbreviated offence descriptions and single-letter victim sex codes
      are replaced with readable labels.

    Args:
        file_name: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: if a required or to-be-dropped column is absent from the file.
    """

    df = pd.read_csv(file_name, na_values='(null)')

    explore_data(df)

    # Dropping rows with NaN values in fields the analysis cannot do without.
    required_columns = [
        'Y_COORD_CD', 'X_COORD_CD', 'Latitude', 'Longitude',
        'CRM_ATPT_CPTD_CD', 'CMPLNT_FR_TM', 'Lat_Lon', 'CMPLNT_FR_DT',
        'BORO_NM', 'OFNS_DESC',
    ]
    df = df.dropna(subset=required_columns)

    # Dropping columns that are not significant for future data exploration.
    unused_columns = [
        'PARKS_NM', 'STATION_NAME', 'TRANSIT_DISTRICT', 'HADEVELOPT', 'HOUSING_PSA',
        'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_SEX', 'SUSP_RACE', 'JURISDICTION_CODE',
        'ADDR_PCT_CD', 'PD_CD', 'PD_DESC', 'PATROL_BORO', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM',
    ]
    df = df.drop(columns=unused_columns)

    # Replacing all NaN values in the descriptive columns with UNKNOWN.
    df = df.fillna({
        'LOC_OF_OCCUR_DESC': 'UNKNOWN',
        'VIC_RACE': 'UNKNOWN',
        'VIC_AGE_GROUP': 'UNKNOWN',
        'VIC_SEX': 'UNKNOWN',
    })

    # Rename some of the crime descriptions (e.g. 'HARRASSMENT 2' to 'HARASSMENT').
    offense_labels = {
        'HARRASSMENT 2': 'HARASSMENT',
        'ESCAPE 3': 'ESCAPE',
        'ASSAULT 3 & RELATED OFFENSES': 'ASSAULT & RELATED OFFENSES',
        'CRIMINAL MISCHIEF & RELATED OF': 'CRIMINAL MISCHIEF',
        'OFF. AGNST PUB ORD SENSBLTY &': 'OFFENSES AGAINST PUBLIC ORDER/ADMINISTRATION',
        'OTHER STATE LAWS (NON PENAL LA': 'OTHER STATE LAWS (NON PENAL LAW)',
        'ENDAN WELFARE INCOMP': 'ENDANGERING WELFARE OF INCOMPETENT',
        'AGRICULTURE & MRKTS LAW-UNCLASSIFIED': 'AGRICULTURE & MARKETS LAW',
        'DISRUPTION OF A RELIGIOUS SERV': 'DISRUPTION OF A RELIGIOUS SERVICE',
        'LOITERING/GAMBLING (CARDS, DIC': 'GAMBLING',
        'OFFENSES AGAINST MARRIAGE UNCL': 'OFFENSES AGAINST MARRIAGE',
        'HOMICIDE-NEGLIGENT,UNCLASSIFIE': 'HOMICIDE-NEGLIGENT',
    }
    victim_sex_labels = {
        'E': 'UNKNOWN',
        'D': 'BUSINESS/ORGANIZATION',
        'F': 'FEMALE',
        'M': 'MALE',
    }

    # Scope each mapping to its own column: a frame-wide replace would also
    # rewrite any other column that happened to contain 'E', 'D', 'F' or 'M'.
    df_clean = df.replace({
        'OFNS_DESC': offense_labels,
        'VIC_SEX': victim_sex_labels,
    })

    print('Clean dataset: ')
    print("Observations: ", df_clean.shape[0])
    print("Variables: ", df_clean.shape[1])

    return df_clean

def calculate_crime_rates(data):
    """
    Calculate crime rates for each borough.

    Rows whose 'CMPLNT_FR_DT' cannot be parsed as a date are dropped. The
    returned frame is a copy; the frame passed in is left untouched.

    Added columns:
        population: borough population looked up from 'BORO_NM' (NaN for a
            borough that is not in the lookup table).
        days_since_ref: absolute number of days between the complaint date
            and 2023-01-01.
        Number_of_Crimes: number of remaining rows recorded for the row's
            borough.
        crime_rate: Number_of_Crimes divided by population, i.e. crimes per
            person in the borough.

    Args:
        data: DataFrame with at least 'CMPLNT_FR_DT' and 'BORO_NM' columns.

    Returns:
        A new DataFrame with the columns above appended.
    """

    # Parse the complaint dates; unparseable values become NaT.
    parsed_dates = pd.to_datetime(data['CMPLNT_FR_DT'], errors='coerce')
    has_date = parsed_dates.notna()

    # Copy the surviving rows so the new columns are written to an independent
    # frame rather than to a slice of (or directly into) the caller's data.
    data = data.loc[has_date].copy()
    # Positional assignment: the rows are in the same order as the mask.
    data['CMPLNT_FR_DT'] = parsed_dates[has_date].to_numpy()

    # Define borough populations to be used for crime rates,
    # taken from recent census.
    borough_populations = {
        'BRONX': 1379946,
        'BROOKLYN': 2590516,
        'MANHATTAN': 1596273,
        'QUEENS': 2278029,
        'STATEN ISLAND': 491133
    }

    # Create a new column 'population' based on the dictionary.
    data['population'] = data['BORO_NM'].map(borough_populations)

    # Define a reference date for calculating time difference.
    reference_date = pd.to_datetime('2023-01-01')

    # Calculate the absolute difference between the crime date and the reference date.
    data['days_since_ref'] = (data['CMPLNT_FR_DT'] - reference_date).dt.days.abs()

    # Calculate the total number of crimes for each borough.
    data['Number_of_Crimes'] = data['BORO_NM'].map(data['BORO_NM'].value_counts())

    # Number of crimes per person in the borough. Every population in the
    # lookup table is non-zero, so no division-by-zero guard is needed.
    data['crime_rate'] = data['Number_of_Crimes'] / data['population']

    return data

def _save_count_plot(counts, filename, *, xlabel=None, ylabel=None,
                     figsize=None, **plot_kwargs):
    """Plot ``counts`` on its own figure, save it to ``filename`` and close it."""
    if counts.empty:
        # pandas cannot plot an empty Series; skip instead of crashing.
        print('No data to plot for', filename)
        return

    # A fresh figure per chart keeps earlier charts from bleeding into later ones.
    fig, ax = plt.subplots(figsize=figsize)
    counts.plot(ax=ax, **plot_kwargs)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)

    # Lay out after the labels are set so they are not cut off.
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def perform_exploratory_analysis(df_clean):
    """
    Produce the exploratory charts for the cleaned crime data.

    Each chart is saved as a PNG file in the current working directory.
    The columns 'year', 'month' (both taken from 'RPT_DT') and 'time' (the
    first two digits of 'CMPLNT_FR_TM') are added to ``df_clean`` in place.

    Args:
        df_clean: DataFrame with the columns 'OFNS_DESC', 'LAW_CAT_CD',
            'RPT_DT', 'CMPLNT_FR_TM', 'BORO_NM', 'VIC_SEX', 'VIC_AGE_GROUP'
            and 'VIC_RACE'.
    """

    ######## Types of Crimes #############
    top_crimes = df_clean['OFNS_DESC'].value_counts().iloc[:10]
    _save_count_plot(top_crimes.sort_values(), 'top_crimes_bar_plot.png',
                     kind='barh', title='Top 10 Types of Crimes')

    ########## levels of crime ###########
    level_of_offense_counts = df_clean['LAW_CAT_CD'].value_counts()
    _save_count_plot(level_of_offense_counts.sort_values(),
                     'level_of_offense_pie_chart.png', figsize=(10, 5),
                     kind='pie', shadow=True, startangle=40,
                     autopct='%1.1f%%', title='Level of Offense')

    ############# Distribution of crimes over the years, months and days ############
    # str.extract yields NaN for rows without a match instead of raising.
    df_clean['year'] = df_clean['RPT_DT'].str.extract(r'(\d{4})', expand=False)
    # Sort by year so the line runs chronologically, not by descending count.
    _save_count_plot(df_clean['year'].value_counts().sort_index(),
                     'YearWise_crime_Distribution.png',
                     xlabel='Year', ylabel='count',
                     kind='line', title='Total Crime Events by Year')

    # reports by month
    df_clean['month'] = df_clean['RPT_DT'].str.extract(r'(\d{2})', expand=False)
    _save_count_plot(df_clean.groupby('month').size(),
                     'MonthWise_crime_Distribution.png',
                     xlabel='Month', ylabel='count',
                     kind='bar', title='Total Crime Events by Month',
                     color='#C0392B', rot=0)

    # reports by hour
    df_clean['time'] = df_clean['CMPLNT_FR_TM'].str.extract(r'(\d{2})', expand=False)
    _save_count_plot(df_clean.groupby('time').size(),
                     'HourWise_crime_Distribution.png',
                     xlabel='Hours', ylabel='count',
                     kind='bar', title='Total Crime Events by Day',
                     color='#E67E22', rot=0)

    ########  Distribution of Crime in each borough ############
    _save_count_plot(df_clean['BORO_NM'].value_counts().sort_values(),
                     'Crime_Events_by_Borough.png',
                     kind='barh', title='Total of Crime Events by Borough')

    ##### Analyzing a Specific Crime ####
    # na=False treats a missing description as "not a sex crime".
    is_sex_crime = df_clean['OFNS_DESC'].str.contains('SEX CRIMES|RAPE', na=False)
    sex_crimes = df_clean[is_sex_crime]

    _save_count_plot(sex_crimes.groupby('year').size(), 'Sex_Crime_by_Year.png',
                     kind='barh', color='#8E44AD',
                     title='Sex Crime Reports by Year')

    # average sex crimes per year
    mean = sex_crimes['year'].value_counts().mean()
    print('Average sex crimes per year', round(mean, 2))

    # Sex Crime Reports by Hour
    _save_count_plot(sex_crimes.groupby('time').size(), 'Sex_Crime_by_hour.png',
                     kind='bar', color='#3498DB', rot=0,
                     title='Sex Crime Reports by Hour')

    ########## Analyzing the victims #################
    # Sex-Crime-Victims-by-Gender
    _save_count_plot(sex_crimes['VIC_SEX'].value_counts(),
                     'Sex-Crime-Victims-by-Gender.png',
                     kind='bar', color='#F1C40F', rot=0,
                     title='Sex Crime Victims by Gender')

    # victims by age group
    _save_count_plot(sex_crimes['VIC_AGE_GROUP'].value_counts(),
                     'Sex-Crime-Victims-by-Age.png',
                     kind='bar', color='#2ECC71', rot=0,
                     title='Sex Crime Victims by Age Group')

    # victims by race (saved like every other chart)
    _save_count_plot(sex_crimes['VIC_RACE'].value_counts(),
                     'Sex-Crime-Victims-by-Race.png',
                     kind='barh', color='#D35400',
                     title='Sex Crime Victims by Race')


def explore_data(data):
    """
    Print a quick overview of a DataFrame.

    Three sections are written to standard output, in order: the first rows
    (``head``), the structural summary (``info``) and the summary statistics
    (``describe``). ``info`` prints its own report and returns ``None``, so
    the literal text ``None`` follows that report.
    """

    # Pair each heading with the bound method producing its section; the
    # methods are only called inside the loop, after their heading is printed.
    sections = (
        ("First few rows of the data:", data.head),
        ("\nData information:", data.info),
        ("\nSummary statistics:", data.describe),
    )

    for heading, produce in sections:
        print(heading)
        print(produce())

def create_time_series_plot_by_time(data):
    """
    Create a time series plot showing the
    distribution of crimes over time for each borough.

    Allows for a comparison of crime patterns and
    what times most crimes occur across different boroughs.

    Args:
        data: DataFrame with a 'BORO_NM' column and either an 'hour' column
            or a 'CMPLNT_FR_TM' column from which the hour is derived.
            The frame is not modified.
    """

    # Exclude rows with 'Unknown' borough
    data_filtered = data[data['BORO_NM'] != 'UNKNOWN']

    if 'hour' in data_filtered.columns:
        hour_key = 'hour'
    else:
        # No precomputed hour: take the leading two characters of the
        # complaint time (assumes 'HH:MM:SS' text — confirm against the CSV).
        # The Series is used as a grouping key, so the input is not copied.
        hour_key = pd.to_numeric(
            data_filtered['CMPLNT_FR_TM'].astype(str).str.slice(0, 2),
            errors='coerce',
        ).rename('hour')

    # Group by borough and hour, and count the occurrences
    crime_by_time = data_filtered.groupby(['BORO_NM', hour_key]).size().unstack()

    # Plotting the time series plot
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=crime_by_time.T, markers=True)

    # Format x-axis ticks with custom time format
    tick_hours = range(3, 25, 3)
    plt.xticks(tick_hours, ['12 am' if hour % 24 == 0 else '12 pm' if hour % 12 == 0
                            else '{} am'.format(hour % 12) if hour < 12
                            else '{} pm'.format(hour % 12) for hour in tick_hours])

    # Set x-axis limits to start at 3 am and end at 12 am.
    plt.xlim(3, 24)

    plt.title('Crime Distribution by Time of Day for Each Borough')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Number of Crimes')
    plt.legend(title='Borough', loc='upper right')

    plt.show()


def create_choropleth_map(data, geojson_path):
    """
    Create a choropleth map of crime hotspots across NYC boroughs.

    Visualizes the spatial distribution of crime rates and writes the
    interactive map to 'crime_rate_map.html' in the current directory.

    Args:
        data: DataFrame with 'BORO_NM' and 'crime_rate' columns.
        geojson_path: path to a GeoJSON file of borough boundaries whose
            features carry a 'boro_name' property.
    """

    # Read GeoJSON file with borough boundaries.
    boroughs = gpd.read_file(geojson_path)

    # Upper-case the borough names so they match the upper-case 'BORO_NM'
    # values in the complaint data.
    boroughs['boro_name'] = boroughs['boro_name'].str.upper()

    # Group by borough and calculate the mean crime rate for each borough.
    crime_rates_by_borough = data.groupby('BORO_NM')['crime_rate'].mean().reset_index()

    # Create a folium map.
    nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

    # Add choropleth layer to the map.
    folium.Choropleth(
        geo_data=boroughs,
        name='choropleth',
        data=crime_rates_by_borough,
        columns=['BORO_NM', 'crime_rate'],  # columns from crime_rates_by_borough
        key_on='feature.properties.boro_name',
        fill_color='BrBG',
        fill_opacity=0.7,
        legend_name='NYC Borough Boundaries Choropleth Map'
    ).add_to(nyc_map)

    # Add layer control to the map
    folium.LayerControl().add_to(nyc_map)

    # Save the map as an HTML file.
    nyc_map.save('crime_rate_map.html')
    # NOTE: no PNG is written. Map.save() always emits an HTML document, so
    # saving to 'crime_rate_map.png' only produced an HTML file with a
    # misleading extension. A real image export needs a browser-based
    # renderer, which this script does not depend on.

# Model Performance Plot:(NEED)
def apply_model(df):
    """
    Fit a linear regression predicting 'Number_of_Crimes' and print its errors.

    Categorical features are one-hot encoded, the data is split 80/20 with a
    fixed random seed, and MSE, RMSE and MAE on the test split are printed.

    Args:
        df: DataFrame containing 'Number_of_Crimes' and at least one of the
            candidate feature columns listed below.

    Raises:
        KeyError: if the target column or every candidate feature is missing.
    """

    target = 'Number_of_Crimes'

    # choose only important features for prediction.
    # https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i
    candidate_features = [
        'KY_CD',
        'CRM_ATPT_CPTD_CD',
        'LAW_CAT_CD',
        'BORO_NM',
        'LOC_OF_OCCUR_DESC',
        'JURIS_DESC',
        'SUSP_AGE_GROUP',
        'SUSP_RACE',
        'SUSP_SEX',
        'TRANSIT_DISTRICT',
        'VIC_AGE_GROUP',
        'VIC_RACE',
        'VIC_SEX',
        'population',
        'crime_rate',
    ]

    if target not in df.columns:
        raise KeyError(f"{target!r} column is required; run calculate_crime_rates first")

    # Some candidates (e.g. the SUSP_* columns) are removed during cleaning,
    # so use whichever of them the frame actually has.
    features = [col for col in candidate_features if col in df.columns]
    if not features:
        raise KeyError('none of the candidate feature columns are present')

    # Get X and y
    X = df[features].copy()
    y = df[target]

    # KY_CD is an offence code, not a quantity: encode it as a category.
    if 'KY_CD' in X.columns:
        X['KY_CD'] = X['KY_CD'].astype('str')

    X = pd.get_dummies(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    model = LinearRegression()

    # fitting the model
    model.fit(X_train, y_train)

    # making predictions
    predictions = model.predict(X_test)

    # model evaluation
    mse = mean_squared_error(y_test, predictions)
    print('mean_squared_error : ', mse)
    # Take the square root directly: the `squared=False` keyword is
    # deprecated and absent from newer scikit-learn releases.
    print('Root_mean_squared_error : ', np.sqrt(mse))
    print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))


def main():
    """
    Main function to orchestrate the overall flow of the script.

    Loads and cleans the complaint data, runs the exploratory charts and the
    hourly time-series plot, then derives per-borough crime rates for the
    choropleth map and the regression model.
    """

    # Replace file paths for csv & geojson when using laptop
    data_path = r'D:\OneDrive - NITT\Custom_Download\NYPD_Complaint_Data_Historic.csv'
    geojson_path = r'D:\OneDrive - NITT\Custom_Download\Borough Boundaries.geojson'

    # Load data
    clean_data = load_clean_data(data_path)

    # EDA
    perform_exploratory_analysis(clean_data)

    # The time series plot groups on an 'hour' column; take it from the
    # leading two digits of the complaint time.
    clean_data['hour'] = pd.to_numeric(clean_data['CMPLNT_FR_TM'].str[:2], errors='coerce')

    # Create time series plot
    create_time_series_plot_by_time(clean_data)

    # The map and the model read 'population', 'crime_rate' and
    # 'Number_of_Crimes', which only calculate_crime_rates produces.
    rated_data = calculate_crime_rates(clean_data)

    # Create choropleth map
    create_choropleth_map(rated_data, geojson_path)

    # apply Model
    apply_model(rated_data)


if __name__ == "__main__":
    main()


# Notebook cell output: ModuleNotFoundError: No module named 'geopandas'