In [20]:
import pandas as pd
import numpy as np
import re
import requests

In [52]:
# File locations used throughout the notebook (relative to the working directory).
# Raw CSV as downloaded from the CSO API; written by the download cell and read by the cleaning cell.
dataset_path = '1000_cleaning/output/1000_downloaded_dataset.csv'
# Cleaned frame, written by the final cell.
cleaned_dataset_path = '1000_cleaning/output/1000_clean_dataset.csv'
# Per-division occurrence totals, written by the final cell.
total_occurrences_by_division_path = '1000_cleaning/output/1000_total_occurrences_by_division.csv'

# Downloading the dataset

The dataset we are using comes from the [Central Statistics Office of Ireland](https://www.cso.ie).

The information we will be using from this dataset is:
* `Year`
* `County/Garda Station Division`
* `Type of offence`
* `Number of occurrences`

As well as the index of a crime relative to its Garda station, called `Crime Index`.

The dataset only includes counties from the **Republic of Ireland** and does not include *Northern Ireland*. `Cork` and `Dublin` are split up into separate regions (north, south, etc.); we can combine these later if necessary. Some of the smaller counties are also grouped together, like `Cavan/Monaghan`.

In [67]:
# Fetch the CJA07 table as CSV from the CSO API and store the raw bytes at dataset_path.
url = 'https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/CJA07/CSV/1.0/'
# A timeout stops the cell from hanging forever if the server never answers.
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as if it were the dataset.
r.raise_for_status()
# The context manager flushes and closes the file even if the write fails.
with open(dataset_path, 'wb') as downloaded_file:
    downloaded_file.write(r.content)
print("Downloaded successfully at: "+dataset_path)

Downloaded successfully at: 1000_cleaning/output/1000_downloaded_dataset.csv


# Cleaning the dataset

We need to remove any columns that contain useless data, as well as rename some of the poorly named columns for better readability later on.

We subtract `3` from every value in `Crime Index` to bring it into a more usable index range: *0-12 instead of 3-15*.

The final step changes the cells in `Garda Station` from `[Station ID] [Town], [County] Division` to `[County] Division`, since we will not be using the `[Town]` or `[Station ID]`.

In [68]:
# Load the raw download and reshape it into the frame used by the rest of the notebook.
dataset = (
    pd.read_csv(dataset_path)
    # Drop the columns that are not used later on. `columns=` already selects the
    # axis, so no `axis` argument is needed.
    .drop(columns=['STATISTIC', 'STATISTIC Label', 'UNIT', 'TLIST(A1)', 'C03037V03742'])
    # Give the coded columns readable names.
    .rename(columns={'C02480V03003': 'Crime Index', 'VALUE': 'Occurrences'})
)
# Shift the crime index so that it starts at 0 (0-12 instead of 3-15).
dataset['Crime Index'] = dataset['Crime Index'] - 3
# Reduce "[Station ID] [Town], [County] Division" to "[County] Division": capture the
# comma-free text that follows a whitespace character and runs to the end of the string.
# The pattern is a raw string so that `\s` is not treated as a string escape.
dataset['Garda Station'] = dataset['Garda Station'].str.extract(r'\s([^,]+$)', expand=False)
# Rows that do not match the pattern become NaN; stop here rather than carry them on.
if dataset['Garda Station'].isna().any():
    raise ValueError("Some 'Garda Station' values do not match the expected format")

In [40]:
# Preview the first rows of the frame to check the remaining columns and their names.
dataset.head()

Unnamed: 0,Year,Garda Station,Crime Index,Type of Offence,Occurrences
0,2003,Limerick Division,0,"Attempts/threats to murder, assaults, harassme...",18
1,2003,Limerick Division,1,Dangerous or negligent acts,14
2,2003,Limerick Division,2,Kidnapping and related offences,0
3,2003,Limerick Division,3,"Robbery, extortion and hijacking offences",0
4,2003,Limerick Division,4,Burglary and related offences,27


In [66]:
# Count the distinct counties covered by the Garda divisions, derived from the
# division names instead of hard-coded corrections:
#   * every "D.M.R. ..." division is counted once, as Dublin;
#   * every "Cork ..." division is counted once, as Cork;
#   * combined divisions such as "Cavan/Monaghan" contribute one entry per county.
# NOTE(review): assumes the original `- 7 + 5` adjustment encoded exactly these rules
# (it yields the same value, 26, for the division names listed below) - confirm.
counties = set()
for division in dataset['Garda Station'].unique():
    region = re.sub(r'\s+Division$', '', division)
    if region.startswith('D.M.R.'):
        counties.add('Dublin')
    elif region.startswith('Cork '):
        counties.add('Cork')
    else:
        counties.update(region.split('/'))
total_divisions = len(counties)
total_divisions

26

In [61]:
# List the distinct values found in the `Garda Station` column.
dataset['Garda Station'].unique()

array(['Limerick Division', 'Laois/Offaly Division', 'Waterford Division',
       'Galway Division', 'Donegal Division', 'Kerry Division',
       'Cork City Division', 'Louth Division', 'Tipperary Division',
       'Wicklow Division', 'Cavan/Monaghan Division', 'Meath Division',
       'Roscommon/Longford Division', 'Westmeath Division',
       'Kildare Division', 'Cork West Division',
       'D.M.R. Northern Division', 'Mayo Division',
       'Sligo/Leitrim Division', 'Kilkenny/Carlow Division',
       'Cork North Division', 'Wexford Division',
       'D.M.R. Western Division', 'Clare Division',
       'D.M.R. Eastern Division', 'D.M.R. North Central Division',
       'D.M.R. Southern Division', 'D.M.R. South Central Division'],
      dtype=object)

In [56]:
# Total number of occurrences per `Garda Station` value, summed over all rows of that group.
sums = dataset.groupby('Garda Station')['Occurrences'].agg('sum')

In [64]:
# Display the per-division totals computed in the previous cell.
sums

Garda Station
Cavan/Monaghan Division          102052
Clare Division                    78677
Cork City Division               264864
Cork North Division               89914
Cork West Division                63400
D.M.R. Eastern Division          172159
D.M.R. North Central Division    374731
D.M.R. Northern Division         325433
D.M.R. South Central Division    351324
D.M.R. Southern Division         272861
D.M.R. Western Division          366150
Donegal Division                 114465
Galway Division                  178901
Kerry Division                   106766
Kildare Division                 159499
Kilkenny/Carlow Division         119752
Laois/Offaly Division            118080
Limerick Division                233306
Louth Division                   132946
Mayo Division                     71711
Meath Division                   111354
Roscommon/Longford Division       61205
Sligo/Leitrim Division            68778
Tipperary Division               110682
Waterford Division        

In [58]:
# Persist the cleaned frame and the per-division totals as CSV files.
exports = (
    (dataset, cleaned_dataset_path),
    (sums, total_occurrences_by_division_path),
)
for table, destination in exports:
    table.to_csv(destination)