# Day 1 Laboratory - Familiarise with Jupyter and Pandas

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load dataset and explore it

When in doubt, consult the online documentation, for example
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html?highlight=csv#pandas.read_csv
or type `?<func_name>` in a notebook cell for inline help. For example, `?pd.read_csv`.

In [None]:
# Download location of the LFB animal-rescue CSV, split into path segments
# purely for readability; the concatenated string is the full URL.
url = (
    'https://data.london.gov.uk/download/'
    'animal-rescue-incidents-attended-by-lfb/'
    '01007433-55c2-4b8a-b799-626d9e3bc284/'
    'Animal%20Rescue%20incidents%20attended%20by%20LFB%20from%20Jan%202009.csv'
)
# read_csv accepts a URL directly and fetches the file over the network.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Display the loaded table (the notebook renders the cell's last expression).
df

In [None]:
# Print a per-column summary: dtype and non-null count for every column.
df.info()

## Data preparation

### Null values management

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows where PumpCount and PumpHoursTotal are both missing.
df[df[['PumpCount', 'PumpHoursTotal']].isna().all(axis=1)]

In [None]:
# Rows whose recorded incident cost equals hourly cost x total pump hours.
# The comparison is exact, so rows with a missing operand never match.
df[
    df['IncidentNotionalCost(£)'].eq(
        df['HourlyNotionalCost(£)'].mul(df['PumpHoursTotal'])
    )
]

In [None]:
# Preview of the table without rows lacking either pump figure.
# The result is only displayed, not assigned, so df itself is unchanged.
df.dropna(subset=['PumpCount', 'PumpHoursTotal'], how='any')

In [None]:
# Side-by-side histograms of the two pump columns, to see which values
# are typical.
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
df.plot.hist(y='PumpCount', ax=axs[0])
df.plot.hist(y='PumpHoursTotal', ax=axs[1])
# Render the figure. The original called plt.plot(), which draws nothing
# when given no data and just echoes an empty list as the cell output.
plt.show()

In [None]:
# Replace missing pump figures with 1 in both columns, modifying df in place.
pump_defaults = {'PumpCount': 1, 'PumpHoursTotal': 1}
df.fillna(value=pump_defaults, inplace=True)

In [None]:
# Recompute the incident cost for every row as hourly cost x total pump hours.
df['IncidentNotionalCost(£)'] = df['HourlyNotionalCost(£)'].mul(
    df['PumpHoursTotal']
)

In [None]:
# Re-check the number of missing values per column after the fixes above.
df.isnull().sum()

### Reduction and optimisation

In [None]:
# Distinct-value count per column, smallest first, to spot constant or
# low-cardinality columns.
df.nunique().sort_values(ascending=True)

In [None]:
# List the distinct values found in the TypeOfIncident column.
df['TypeOfIncident'].unique()

In [None]:
# Remove the TypeOfIncident column from df in place (presumably because it
# carries no distinguishing information; check the unique() output first).
df.drop('TypeOfIncident', axis='columns', inplace=True)

In [None]:
# List the distinct animal groups to spot inconsistent spellings.
df.AnimalGroupParent.unique()

In [None]:
# Merge the lowercase spelling 'cat' into the 'Cat' group.
df['AnimalGroupParent'] = df['AnimalGroupParent'].replace('cat', 'Cat')

In [None]:
# Shorten the verbose 'Unknown - Domestic Animal Or Pet' label to 'Domestic'.
df['AnimalGroupParent'] = df['AnimalGroupParent'].replace(
    'Unknown - Domestic Animal Or Pet', 'Domestic'
)

### Type conversion

In [None]:
# Inspect the raw DateTimeOfCall column before converting its type.
df['DateTimeOfCall']

In [None]:
# Preview the parsed timestamps without modifying df.
# dayfirst=True: this London dataset is presumed to use UK-style DD/MM/YYYY
# dates (TODO confirm against the raw strings in df.DateTimeOfCall). Without
# it, an ambiguous value such as 03/04/2009 would be read month-first.
pd.to_datetime(df['DateTimeOfCall'], dayfirst=True).head()

In [None]:
# Plot parsed timestamps against row position as a quick sanity check of the
# conversion.
# dayfirst=True: this London dataset is presumed to use UK-style DD/MM/YYYY
# dates (TODO confirm against the raw strings in df.DateTimeOfCall). Without
# it, ambiguous day/month values would be read month-first.
pd.to_datetime(df['DateTimeOfCall'], dayfirst=True).plot()

In [None]:
# Replace the string column with real datetimes.
# dayfirst=True: this London dataset is presumed to use UK-style DD/MM/YYYY
# dates (TODO confirm against the raw strings in df.DateTimeOfCall). Without
# it, ambiguous day/month values would be read month-first and stored with
# the wrong date.
df['DateTimeOfCall'] = pd.to_datetime(df['DateTimeOfCall'], dayfirst=True)

In [None]:
# Use the call timestamp as the row index (in place) so that rows can be
# selected and resampled by date.
df.set_index(keys='DateTimeOfCall', inplace=True)

In [None]:
# Display the table again to check the result of the preparation steps.
df

In [None]:
# FinalDescription of every call in January 2021, selected by date labels.
# NOTE(review): label slicing presumably relies on a sorted index; confirm.
df.loc[slice('2021-01-01', '2021-01-31'), 'FinalDescription']

## Visualisation

In [None]:
# Number of incidents per calendar month, drawn as a line chart.
# NOTE(review): recent pandas releases reportedly deprecate the 'M' alias in
# favour of 'ME'; check the installed version before changing it.
df['IncidentNumber'].resample('M').count().plot(title='Monthly Calls')

In [None]:
# The 20 months with the most incidents, busiest first.
df['IncidentNumber'].resample('M').count().sort_values(ascending=False).iloc[:20]

In [None]:
# Two yearly charts side by side: total incident cost (left) and mean
# hourly cost (right).
fig, axs = plt.subplots(ncols=2, figsize=(16, 4))
df['IncidentNotionalCost(£)'].resample('Y').sum().plot(
    ax=axs[0], title='Year total cost'
)
df['HourlyNotionalCost(£)'].resample('Y').mean().plot(
    ax=axs[1], title='Average hourly cost'
)

In [None]:
# Incidents per animal group as horizontal bars, smallest group first,
# with a logarithmic x axis.
df.groupby('AnimalGroupParent')['IncidentNumber'].count().sort_values().plot(
    kind='barh', logx=True
)

In [None]:
# Hourly notional cost of every incident, plotted against the frame's index.
df['HourlyNotionalCost(£)'].plot(kind='line')

In [None]:
# Number of incidents recorded for each value of StnGroundName.
df.groupby(by='StnGroundName')['IncidentNumber'].count()

### Install these packages for the following cells: `pip install geopandas contextily`

In [None]:
import geopandas

# Keep only rows with usable coordinates: both values present and a
# non-zero latitude.
has_coordinates = df['Longitude'].notna() & df['Latitude'].notna()
df2 = df.loc[has_coordinates & (df['Latitude'] != 0)]

# Build point geometries from the coordinate columns. EPSG:4326 declares
# them as WGS84 longitude/latitude.
call_points = geopandas.points_from_xy(df2['Longitude'], df2['Latitude'])
gdf = geopandas.GeoDataFrame(df2, geometry=call_points, crs='EPSG:4326')
gdf.head()

In [None]:
# Scatter every geolocated call as a semi-transparent black point.
f, ax = plt.subplots(figsize=(16, 16))
gdf.plot(ax=ax, alpha=0.3, color='black')
ax.set_title('Call locations')
plt.show()

In [None]:
import contextily as cx

# Same scatter of call locations, now drawn over a map of the region.
f, ax = plt.subplots(figsize=(16, 16))
gdf.plot(ax=ax, alpha=0.3, color='black')
# contextily adds the background tiles; passing the frame's CRS lets it
# align them with the plotted points.
cx.add_basemap(ax, crs=gdf.crs)
ax.set_title('Call locations')
ax.set_axis_off()
plt.show()

In [None]:
# Colour for each animal group drawn on the map. Further groups can be
# added here, e.g. 'Cat': 'teal' or 'Dog': 'brown'.
animal_colours = {
    'Cow': 'black',
    'Deer': 'red',
    'Fox': 'blue',
    'Snake': 'yellow',
}

f, ax = plt.subplots(figsize=(16, 16))
# One layer of points per animal group; the label feeds the legend.
for animal, colour in animal_colours.items():
    is_animal = gdf['AnimalGroupParent'] == animal
    gdf[is_animal].plot(ax=ax, color=colour, alpha=0.5, label=animal)
# Add a basemap of the region underneath the points using contextily.
cx.add_basemap(ax, crs=gdf.crs)
ax.set_title('Call locations by animal')
ax.legend()
ax.set_axis_off()
plt.show()