In [1]:
%pip install geopandas pandas matplotlib folium numpy

import os
import geopandas as gpd
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Locate the raw-data folder and list its contents, so a misplaced download
# is obvious before any file is read.
data_path = "raw/"
if not os.path.exists(data_path):
    print("Directory not found. Check your map and compass!")
else:
    print("Found the directory! Here's what's inside:")
    print(os.listdir(data_path))

# One CSV per year, named CrimeData-<year>.csv.
# ADD REMOVE FILES HERE AS NEEDED (adjust the year range below).
file_names = [f"CrimeData-{year}.csv" for year in range(2015, 2024)]

# Prefix every file name with the data folder to get a readable path.
all_files = [os.path.join(data_path, name) for name in file_names]
print(all_files)
# Load every yearly CSV and stack them into a single DataFrame.
# ignore_index=True gives the combined frame a fresh 0..n-1 row index; without
# it each file contributes its own row labels, so labels repeat across years.
try:
    crimes = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    # Re-raise: the following cells all need `crimes`, so carrying on would
    # only resurface as a confusing NameError further down.
    raise
else:
    # Display the first three rows to verify the data
    print(crimes.head(3))

Found the directory! Here's what's inside:
['aggregated_crime_data23v3.csv', 'all_crimes.csv', 'CrimeData-00-2018.csv', 'CrimeData-2015.csv', 'CrimeData-2016.csv', 'CrimeData-2017.csv', 'CrimeData-2018.csv', 'CrimeData-2019.csv', 'CrimeData-2020.csv', 'CrimeData-2021.csv', 'CrimeData-2022.csv', 'CrimeData-2023.csv', 'Neighborhood_Boundaries.cpg', 'Neighborhood_Boundaries.dbf', 'Neighborhood_Boundaries.geojson', 'Neighborhood_Boundaries.prj', 'Neighborhood_Boundaries.shp', 'Neighborhood_Boundaries.shx']
['raw/CrimeData-2015.csv', 'raw/CrimeData-2016.csv', 'raw/CrimeData-2017.csv', 'raw/CrimeData-2018.csv', 'raw/CrimeData-2019.csv', 'raw/CrimeData-2020.csv', 'raw/CrimeData-2021.csv', 'raw/CrimeData-2022.csv', 'raw/CrimeData-2023.csv']
  Address   CaseNumber CrimeAgainst     Neighborhood  OccurDate  OccurTime  \
0     NaN   15-X197430       Person         Piedmont  5/12/2015       1400   
1     NaN  15-X4282999       Person     Buckman West   5/1/2015       2143   
2     NaN  15-X4283033 

In [3]:
# Parse the occurrence date once, then derive the calendar fields from it.
occur_dates = pd.to_datetime(crimes['OccurDate'])
crimes['OccurDate'] = occur_dates  # the column holds datetimes from here on
crimes['Month'] = occur_dates.dt.month
crimes['Year'] = occur_dates.dt.year
# 'YYYY-MM' label combining year and month
crimes['Month_Year'] = occur_dates.dt.strftime('%Y-%m')

In [5]:
# Read the neighborhood boundary layer from the shapefile.
boundaries_path = 'raw/Neighborhood_Boundaries.shp'
neighborhoods_gdf = gpd.read_file(boundaries_path)

In [6]:
# Normalise neighborhood names (trim surrounding whitespace, lowercase) on
# both sides so the crime records and the boundary layer compare by name.
crimes['Neighborhood'] = crimes['Neighborhood'].str.strip().str.lower()
neighborhoods_gdf['MAPLABEL'] = neighborhoods_gdf['MAPLABEL'].str.strip().str.lower()

In [7]:
# Collect the distinct neighborhood names on each side:
# 'Neighborhood' in the crime records, 'MAPLABEL' in the boundary layer.
unique_crime_neighborhoods = {*crimes['Neighborhood'].unique()}
unique_shapefile_neighborhoods = {*neighborhoods_gdf['MAPLABEL'].unique()}

In [8]:
# Set differences in both directions show which names fail to line up:
# first the crime-data names absent from the shapefile, then the reverse.
missing_in_shapefile = unique_crime_neighborhoods.difference(unique_shapefile_neighborhoods)
missing_in_crime_data = unique_shapefile_neighborhoods.difference(unique_crime_neighborhoods)

# Every item is converted to str before sorting, so a non-string entry
# (e.g. a missing value, which prints as 'nan') sorts alongside the names.
missing_in_shapefile_sorted = sorted(map(str, missing_in_shapefile))
missing_in_crime_data_sorted = sorted(map(str, missing_in_crime_data))

print("Names in crime data not found in shapefile:", missing_in_shapefile_sorted)
print("Names in shapefile not found in crime data:", missing_in_crime_data_sorted)

Names in crime data not found in shapefile: ['ardenwald', 'argay', 'buckman east', 'buckman west', 'downtown', 'healy heights', 'mt scott-arleta', 'mt tabor', 'nan', 'northwest', 'northwest industrial', 'old town/chinatown', 'pearl', 'st johns']
Names in shapefile not found in crime data: ['alameda/beaumont-wilshire', 'alameda/irvington', 'ardenwald-johnson creek', 'ardenwald-johnson creek/woodstock', 'argay terrace', 'argay terrace/wilkes', 'arlington heights/sylvan-highlands', 'ashcreek/crestwood', 'boise/eliot', 'bridlemile/southwest hills', 'buckman', 'centennial/pleasant valley', 'eastmoreland/ardenwald-johnson creek', 'eastmoreland/reed', 'forest park/linnton', 'forest park/northwest district', 'goose hollow/southwest hills', 'grant park/hollywood', 'hazelwood/mill park', 'healy heights/southwest hills', 'hillside/northwest district', 'lents/powellhurst-gilbert', "lloyd/sullivan's gulch", 'mc unclaimed #11', 'mc unclaimed #13', 'mc unclaimed #14', 'mc unclaimed #5', 'mt. scott-ar

In [9]:
# Crime-data neighborhood name -> the MAPLABEL it should be matched to in the
# boundary layer. Keys and values are already lowercase.
name_mapping = {
    # Same name, but the boundary layer writes the abbreviation with a period
    'mt scott-arleta': 'mt. scott-arleta',
    'mt tabor': 'mt. tabor',
    'st johns': 'st. johns',

    # Shorter crime-data name -> fuller boundary-layer label
    'ardenwald': 'ardenwald-johnson creek',
    'argay': 'argay terrace',
    'downtown': 'portland downtown',
    'healy heights': 'healy heights/southwest hills',
    'northwest': 'northwest district',
    'pearl': 'pearl district',

    # Several crime-data areas folded into one boundary-layer label
    'buckman east': 'buckman',  # Assuming you want to merge east/west
    'buckman west': 'buckman',  # Assuming you want to merge east/west
    'northwest industrial': 'northwest district',  # Assuming you want to merge; adjust based on your analysis
    'old town/chinatown': 'old town',  # Decide based on which name is more encompassing or split entries

    # NOTE(review): the 'nan' listed by the name comparison is presumably a
    # stringified missing value rather than a literal 'nan' name, in which case
    # this key never matches. Decide how missing neighborhoods should be
    # handled (exclude the records, or map them to a placeholder).
    'nan': None,
}

In [10]:
# Swap in the boundary-layer spelling where name_mapping has an entry; names
# with no entry come back null from map() and keep their original value.
original_names = crimes['Neighborhood']
mapped_names = original_names.map(name_mapping)
crimes['Neighborhood'] = mapped_names.fillna(original_names)

In [11]:
# Repeat the name comparison between the crime records ('Neighborhood') and
# the boundary layer ('MAPLABEL') using the current column values.
unique_crime_neighborhoods = set(crimes['Neighborhood'].unique())
unique_shapefile_neighborhoods = set(neighborhoods_gdf['MAPLABEL'].unique())

# Names present on one side only, in each direction
missing_in_shapefile = unique_crime_neighborhoods.difference(unique_shapefile_neighborhoods)
missing_in_crime_data = unique_shapefile_neighborhoods.difference(unique_crime_neighborhoods)

# Stringify first so non-string entries can be sorted together with the names
missing_in_shapefile_sorted = sorted(str(name) for name in missing_in_shapefile)
missing_in_crime_data_sorted = sorted(str(name) for name in missing_in_crime_data)

print("Names in crime data not found in shapefile:", missing_in_shapefile_sorted)
print("Names in shapefile not found in crime data:", missing_in_crime_data_sorted)

Names in crime data not found in shapefile: ['nan']
Names in shapefile not found in crime data: ['alameda/beaumont-wilshire', 'alameda/irvington', 'ardenwald-johnson creek/woodstock', 'argay terrace/wilkes', 'arlington heights/sylvan-highlands', 'ashcreek/crestwood', 'boise/eliot', 'bridlemile/southwest hills', 'centennial/pleasant valley', 'eastmoreland/ardenwald-johnson creek', 'eastmoreland/reed', 'forest park/linnton', 'forest park/northwest district', 'goose hollow/southwest hills', 'grant park/hollywood', 'hazelwood/mill park', 'hillside/northwest district', 'lents/powellhurst-gilbert', "lloyd/sullivan's gulch", 'mc unclaimed #11', 'mc unclaimed #13', 'mc unclaimed #14', 'mc unclaimed #5', 'pleasant valley/powellhurst-gilbert', 'roseway/madison south', 'sabin/irvington', "sullivan's gulch/grant park", 'sylvan-highlands/southwest hills']


In [12]:
# Violent / non-violent classification, written as category -> offenses and
# then inverted into the offense -> category lookup.
_offenses_by_category = {
    'Violent': [
        'Assault Offenses',
        'Robbery',
        'Homicide Offenses',
        'Kidnapping/Abduction',
        'Animal Cruelty Offenses',  # Adjust based on severity
        'Human Trafficking Offenses',
        'Sex Offenses',
    ],
    'Non-Violent': [
        'Larceny Offenses',
        'Motor Vehicle Theft',
        'Burglary',
        'Fraud Offenses',
        'Weapon Law Violations',  # May be violent in specific contexts
        'Drug/Narcotic Offenses',  # May be violent in specific contexts
        'Counterfeiting/Forgery',
        'Prostitution Offenses',
        'Stolen Property Offenses',
        'Embezzlement',
        'Extortion/Blackmail',
        'Bribery',
        'Pornography/Obscene Material',
    ],
    # Grey area (categorize based on specific data and context): kept as its
    # own category instead of being forced into either bucket.
    'Vandalism': [
        'Vandalism',
    ],
}

# Offense category name -> 'Violent' / 'Non-Violent' / 'Vandalism'
crime_categories = {
    offense: category
    for category, offenses in _offenses_by_category.items()
    for offense in offenses
}

# Map each crime to its category
crimes['CrimeCategory'] = crimes['OffenseCategory'].map(crime_categories)

# An offense category missing from crime_categories maps to NaN, and the
# groupby that aggregates on 'CrimeCategory' drops NaN keys by default. List
# those categories here so records do not vanish from the counts unnoticed.
unmapped_offenses = (
    crimes.loc[crimes['CrimeCategory'].isna(), 'OffenseCategory']
    .value_counts(dropna=False)
)
if not unmapped_offenses.empty:
    print("Offense categories with no CrimeCategory (excluded from the aggregation):")
    print(unmapped_offenses)

In [13]:
# Count incidents per neighborhood, occurrence year and crime category, then
# pivot year and category into the columns; combinations with no incidents
# become 0.
incident_counts = crimes.groupby(['Neighborhood', 'Year', 'CrimeCategory']).size()
aggregated_data = incident_counts.unstack(level=['Year', 'CrimeCategory']).fillna(0)

# The column MultiIndex is ordered (Year, CrimeCategory); flatten each pair
# into a single '<category>_<year>' label such as 'Violent_2015'.
new_column_names = [f'{category}_{year}' for year, category in aggregated_data.columns]
aggregated_data.columns = new_column_names
print(aggregated_data)

                         Non-Violent_2015  Vandalism_2015  Violent_2015  \
Neighborhood                                                              
alameda                              85.0            12.0           7.0   
arbor lodge                         266.0            27.0          36.0   
ardenwald-johnson creek              19.0             0.0           1.0   
argay terrace                       301.0            32.0          61.0   
arlington heights                    56.0             8.0           2.0   
...                                   ...             ...           ...   
west portland park                   38.0            10.0          17.0   
wilkes                              247.0            40.0          45.0   
woodland park                        14.0             2.0           4.0   
woodlawn                            108.0            23.0          34.0   
woodstock                           285.0            43.0          41.0   

                        

In [14]:
# Move 'Neighborhood' out of the index into a regular column so it can be
# used as the right-hand key of the join.
# Guarded so that re-running this cell is a no-op: resetting a frame that has
# already been reset would insert a spurious 'index' column.
if 'Neighborhood' not in aggregated_data.columns:
    aggregated_data = aggregated_data.reset_index()

In [15]:
# Peek at the first five rows of the flattened table.
print(aggregated_data.head(n=5))

              Neighborhood  Non-Violent_2015  Vandalism_2015  Violent_2015  \
0                  alameda              85.0            12.0           7.0   
1              arbor lodge             266.0            27.0          36.0   
2  ardenwald-johnson creek              19.0             0.0           1.0   
3            argay terrace             301.0            32.0          61.0   
4        arlington heights              56.0             8.0           2.0   

   Non-Violent_2016  Vandalism_2016  Violent_2016  Non-Violent_2017  \
0             125.0            10.0           4.0             123.0   
1             454.0            40.0          51.0             423.0   
2              30.0             3.0           5.0              32.0   
3             454.0            77.0          87.0             560.0   
4             140.0            10.0           3.0              46.0   

   Vandalism_2017  Violent_2017  ...  Violent_1975  Vandalism_1967  \
0            13.0          13.0  .

In [16]:
# Attribute join: keep every boundary row and attach the crime counts whose
# 'Neighborhood' equals that row's 'MAPLABEL'.
joined_gdf = neighborhoods_gdf.merge(
    aggregated_data,
    left_on='MAPLABEL',
    right_on='Neighborhood',
    how='left',
)

In [17]:
# Keep only the boundary rows that found a match in the crime table; a null
# 'Neighborhood' after the left join means no crime row was attached.
has_crime_data = joined_gdf['Neighborhood'].notna()
cleaned_gdf = joined_gdf[has_crime_data]

In [18]:
# Preview the first few rows
print(cleaned_gdf.head())

# Count nulls in the two join-key columns (adjust column names as needed)
key_columns = ['MAPLABEL', 'Neighborhood']
print(cleaned_gdf[key_columns].isna().sum())

   OBJECTID             NAME COMMPLAN SHARED COALIT HORZ_VERT    Shape_Leng  \
0         1          LINNTON     None      N   NWNW      HORZ  53244.045538   
2         3      FOREST PARK     None      N   NWNW      HORZ  82725.497522   
3         4   CATHEDRAL PARK     None      N   NPNS      HORZ  11434.254777   
4         5  UNIVERSITY PARK     None      N   NPNS      HORZ  11950.859827   
6         7         PIEDMONT   ALBINA      N   NPNS      VERT  10849.327392   

          MAPLABEL  ID                                           geometry  \
0          linnton   1  POLYGON ((-13669901.863 5719574.412, -13669574...   
2      forest park   3  POLYGON ((-13669096.892 5708277.357, -13669210...   
3   cathedral park   4  POLYGON ((-13666093.373 5714122.821, -13666569...   
4  university park   5  POLYGON ((-13663193.115 5713925.450, -13662883...   
6         piedmont   7  POLYGON ((-13656168.594 5714345.257, -13656114...   

   ... Violent_1975  Vandalism_1967  Non-Violent_1981  Non-Vio

In [36]:
# Replace each neighborhood geometry with its centroid.
# Work on an explicit copy first: cleaned_gdf was derived from joined_gdf by
# dropping rows, and assigning a column on such a derived frame can trigger
# pandas' SettingWithCopyWarning.
cleaned_gdf = cleaned_gdf.copy()
cleaned_gdf['geometry'] = cleaned_gdf.geometry.centroid

In [20]:
# Write the joined result to disk as GeoJSON.
# NOTE(review): the coordinates previewed earlier look projected (metres),
# not lon/lat degrees - confirm whether the consumer of this file expects the
# layer to be reprojected before export.
output_path = 'PDX_Crime15_23_Hood.geojson'
cleaned_gdf.to_file(output_path, driver='GeoJSON')
