In [36]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [37]:
# Every file this notebook writes lives under one output folder.
output_dir = 'output'

# CSV as fetched from the remote API, before any processing.
dataset_path = f'{output_dir}/1000_downloaded_dataset.csv'
# Dataset after unused columns are dropped and divisions are renamed.
cleaned_dataset_path = f'{output_dir}/1000_clean_dataset.csv'
# Occurrence totals aggregated per division.
total_occurrences_by_division_path = f'{output_dir}/1000_total_occurrences_by_division.csv'

# Downloading the dataset

The dataset we are using comes from the [Central Statistics Office of Ireland](https://www.cso.ie).

The information we will be using from this dataset is:
* `Year`
* `County/Garda Station Division`
* `Type of offence`
* `Number of occurrences`

As well as the index of a crime relative to its Garda station, called `Crime Index`.

The dataset only includes counties from the **Republic of Ireland** and does not include *Northern Ireland*. `Cork` and `Dublin` are split up into separate regions (north, south, etc.); we can later combine these if necessary. Some of the smaller counties are also grouped together, like `Cavan/Monaghan`.

In [38]:
# Fetch the CJA07 table as CSV from the CSO web service and store it locally.
url = 'https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/CJA07/CSV/1.0/'
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as the dataset.
r.raise_for_status()
# Make sure the target folder exists so the cell works on a fresh checkout.
Path(dataset_path).parent.mkdir(parents=True, exist_ok=True)
# The context manager closes the file handle even if the write fails.
with open(dataset_path, 'wb') as downloaded_file:
    downloaded_file.write(r.content)
print("Downloaded successfully at: "+dataset_path)

Downloaded successfully at: output/1000_downloaded_dataset.csv


# Cleaning the dataset

We need to remove any columns that have useless data, as well as rename some of the poorly named columns for better readability later on.

We subtract `3` from every value in `Crime Index` to bring it into a more usable index range: *0-12 instead of 3-15*.

Finally, `clean_county` rewrites each cell in `Garda Station` from `[Station ID] [Town], [County] Division` to `[County]`, since we will not be using the `[Town]` or `[Station ID]`. It also merges the individual Dublin (`D.M.R. ...`) and Cork divisions into `Dublin` and `Cork`.

In [103]:
# Maps each merged county name to the division names it replaces.
changes = {
    'Dublin':[
        'D.M.R. Western',
        'D.M.R. Eastern',
        'D.M.R. North Central',
        'D.M.R. Southern',
        'D.M.R. South Central',
        'D.M.R. Northern'
    ],
    'Cork':[
        'Cork City',
        'Cork West',
        'Cork North'
    ]
}

def clean_county(x):
    """Reduce a raw station label to its county / merged division name.

    The label's tail is taken to be the text after the first whitespace
    character that is followed by no further comma. The ' Division' suffix is
    removed from that tail and every division name listed in ``changes`` is
    replaced by its merged county name.

    Parameters
    ----------
    x : str
        Raw label, e.g. ``'43202 Abbeyfeale, Limerick Division'``.

    Returns
    -------
    str
        Cleaned name, e.g. ``'Limerick'``. A label that has no such tail is
        cleaned as a whole instead of raising.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    match = re.search(r'\s([^,]+$)', x)
    # Fall back to the full label rather than crashing on `None.group(1)`.
    division = match.group(1) if match else x.strip()
    division = division.replace(' Division', '')
    for new_county, old_counties in changes.items():
        for old_county in old_counties:
            division = division.replace(old_county, new_county)
    return division
    

In [104]:
# Load the downloaded CSV, drop the columns the analysis never uses and give
# the remaining coded columns readable names. `columns=` already selects the
# axis, so no `axis` argument is passed (the former `axis=4` was not a valid
# axis and only worked because pandas ignores it when `columns` is given).
dataset = (
    pd.read_csv(dataset_path)
    .drop(columns=['STATISTIC', 'STATISTIC Label','UNIT','TLIST(A1)','C03037V03742'])
    .rename(columns={'C02480V03003': 'Crime Index', 'VALUE': 'Occurrences'})
)
# Shift the crime index down by 3.
dataset['Crime Index'] = dataset['Crime Index'] - 3
# Collapse each station label to its county / merged division name.
dataset['Garda Station'] = dataset['Garda Station'].apply(clean_county)

In [113]:
# Inspect the rows that belong to one combined division.
dataset.loc[dataset['Garda Station'].eq('Cavan/Monaghan')]

Unnamed: 0,Year,Garda Station,Crime Index,Type of Offence,Occurrences
336,2003,Cavan/Monaghan,0,"Attempts/threats to murder, assaults, harassme...",4
337,2003,Cavan/Monaghan,1,Dangerous or negligent acts,10
338,2003,Cavan/Monaghan,2,Kidnapping and related offences,0
339,2003,Cavan/Monaghan,3,"Robbery, extortion and hijacking offences",0
340,2003,Cavan/Monaghan,4,Burglary and related offences,4
...,...,...,...,...,...
128467,2021,Cavan/Monaghan,7,Controlled drug offences,16
128468,2021,Cavan/Monaghan,8,Weapons and Explosives Offences,4
128469,2021,Cavan/Monaghan,9,Damage to property and to the environment,14
128470,2021,Cavan/Monaghan,10,Public order and other social code offences,18


In [66]:
# Count the individual counties behind the distinct 'Garda Station' names:
# a combined name such as 'A/B' contributes one county per '/'-separated part.
# Deriving this from the data (instead of hard-coded offsets tied to one
# particular kernel state) keeps the result stable under Restart & Run All.
# NOTE(review): assumes 'Garda Station' has already been passed through
# clean_county, so the Dublin and Cork divisions are merged - confirm.
total_divisions = int(
    dataset['Garda Station'].drop_duplicates().str.count('/').add(1).sum()
)
total_divisions

26

In [107]:
# List every distinct division name in order of first appearance.
pd.unique(dataset['Garda Station'])

array(['Limerick', 'Laois/Offaly', 'Waterford', 'Galway', 'Donegal',
       'Kerry', 'Cork', 'Louth', 'Tipperary', 'Wicklow', 'Cavan/Monaghan',
       'Meath', 'Roscommon/Longford', 'Westmeath', 'Kildare', 'Dublin',
       'Mayo', 'Sligo/Leitrim', 'Kilkenny/Carlow', 'Wexford', 'Clare'],
      dtype=object)

In [318]:
def split_county(x,i):
    """Return the share of a combined 'A/B' division that belongs to one county.

    Parameters
    ----------
    x : mapping-like row (e.g. a ``pandas.Series``)
        Must provide the keys ``'Garda Station'`` and ``'Occurrences'`` and a
        ``copy()`` method.
    i : int
        Which side of the '/' to keep: ``1`` for the part before it, ``2`` for
        the part after it.

    Returns
    -------
    The row itself when ``'Garda Station'`` contains no '/'. Otherwise a copy
    whose ``'Garda Station'`` is the selected part and whose ``'Occurrences'``
    is half of the original value (the total is split evenly between the two
    parts).
    """
    # Raw string: '\/' in a normal string literal is an invalid escape sequence.
    match = re.search(r'(.+)/(.+)', x['Garda Station'])
    if match is None:
        return x
    # Work on a copy so the row handed in by `DataFrame.apply` is not mutated.
    county_share = x.copy()
    county_share['Garda Station'] = match.group(i)
    county_share['Occurrences'] = x['Occurrences'] / 2
    return county_share

In [327]:
# Total occurrences per (possibly combined) division.
sums = dataset.groupby(by='Garda Station')['Occurrences'].sum().reset_index()
# Combined 'A/B' divisions yield two rows: `pairs` holds the 'B' halves,
# while the rewritten `sums` holds the 'A' halves plus all single divisions.
pairs = sums[sums['Garda Station'].str.contains('/')].copy()
pairs = pairs.apply(lambda x: split_county(x,2),axis=1)
sums = sums.apply(lambda x: split_county(x,1),axis=1)
# Stack both sets of rows and aggregate again. Unlike an outer merge on all
# columns, this adds up a county that appears more than once instead of
# silently collapsing identical rows; the result is sorted by name.
sums = (
    pd.concat([sums, pairs], ignore_index=True)
    .groupby('Garda Station', as_index=False)['Occurrences']
    .sum()
)

In [328]:
# Display the per-county occurrence totals.
sums

Unnamed: 0,Garda Station,Occurrences
0,Cavan,51026.0
1,Clare,78677.0
2,Cork,418178.0
3,Donegal,114465.0
4,Dublin,1862658.0
5,Galway,178901.0
6,Kerry,106766.0
7,Kildare,159499.0
8,Kilkenny,59876.0
9,Laois,59040.0


In [329]:
# Preview the first five rows of the cleaned dataset.
dataset.iloc[:5]

Unnamed: 0,Year,Garda Station,Crime Index,Type of Offence,Occurrences
0,2003,Limerick,0,"Attempts/threats to murder, assaults, harassme...",18
1,2003,Limerick,1,Dangerous or negligent acts,14
2,2003,Limerick,2,Kidnapping and related offences,0
3,2003,Limerick,3,"Robbery, extortion and hijacking offences",0
4,2003,Limerick,4,Burglary and related offences,27


In [331]:
# Persist the cleaned dataset and the aggregated totals, each without the
# index column.
for frame, destination in (
    (dataset, cleaned_dataset_path),
    (sums, total_occurrences_by_division_path),
):
    frame.to_csv(destination, index=False)
print("Saved both datasets")

Saved both datasets


In [332]:
# Display the totals that were written to disk.
sums

Unnamed: 0,Garda Station,Occurrences
0,Cavan,51026.0
1,Clare,78677.0
2,Cork,418178.0
3,Donegal,114465.0
4,Dublin,1862658.0
5,Galway,178901.0
6,Kerry,106766.0
7,Kildare,159499.0
8,Kilkenny,59876.0
9,Laois,59040.0
