# Importing libraries

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import matplotlib.pyplot as plt
from time import time
import seaborn as sns
#matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
#old names, so prefer the new name when it is available and fall back to the old one otherwise
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
#Register tqdm with pandas so that the progress_* methods are available
tqdm.pandas()

# Loading the data
If pandas emits a mixed-types warning (`DtypeWarning`) here, it is harmless: columns with mixed types are simply read as `object`.
We'll make sure every column has the right type in the next cell.

In [6]:
#Cap on the number of rows read; a smaller sample is handy for prototyping
#and for measuring how long each step takes
nrows = 10_000
df = pd.read_csv(
    'Fire_Department_Calls_for_Service.csv',
    nrows=nrows,
)

# Data cleaning 
Everything not explicitly handled here should have type "object"

In [10]:
#Load the data dictionary; its 'Data Type' column tells us how each field should be typed
data_dict = pd.read_csv('DataDictionary.csv')

#Names of the fields the dictionary marks as 'Date & Time'
date_columns = data_dict.loc[data_dict['Data Type'] == 'Date & Time', 'Field Name'].tolist()

#The dictionary lists this field, but it appears to be missing from the data, so skip it
date_columns.remove('AVL Validated On Scene DtTm')

#Parse every date field with an explicit format: not elegant, but far faster than inference
for column in date_columns:
    df[column] = pd.to_datetime(
        df[column],
        #Fields with "Date" in the name are date-only; the others carry a 12-hour time with AM/PM
        format='%m/%d/%Y' if "Date" in column else '%m/%d/%Y %I:%M:%S %p',
    )
        
#Names of the fields the dictionary marks as 'Numeric'
numeric_columns = data_dict.loc[data_dict['Data Type'] == 'Numeric', 'Field Name'].tolist()

#Conceptually these are integers, but a float column can hold NaN for missing values
df[numeric_columns] = df[numeric_columns].astype('float64')
       
#List of boolean columns
bool_columns = list(data_dict[data_dict['Data Type'] == 'Boolean (True/False)']['Field Name'])


def _parse_bool_column(values):
    """Convert one true/false column to booleans without relying on truthiness.

    ``astype(bool)`` on an object column turns every non-empty string
    (including ``'false'``) and every missing value (NaN) into ``True``.
    Here each value is instead matched, case-insensitively and ignoring
    surrounding whitespace, against the spellings ``'true'`` and ``'false'``.

    Parameters
    ----------
    values : pandas.Series
        The raw column (real booleans, their string spellings, and/or NaN).

    Returns
    -------
    pandas.Series
        Plain ``bool`` dtype when nothing is missing; otherwise the nullable
        ``'boolean'`` dtype with missing entries kept as ``<NA>``.

    Raises
    ------
    ValueError
        If a non-missing value is neither 'true' nor 'false'.
    """
    parsed = values.astype(str).str.strip().str.lower().map({'true': True, 'false': False})

    #Anything that failed to map but was not missing to begin with is an unexpected spelling
    unrecognised = values[parsed.isna() & values.notna()].unique()
    if len(unrecognised) > 0:
        raise ValueError(
            'Column %r has values that are not true/false: %r'
            % (values.name, list(unrecognised[:5]))
        )

    #Keep the plain bool dtype whenever possible so fully populated columns look as before
    if values.notna().all():
        return parsed.astype(bool)
    return parsed.astype('boolean')


for column in bool_columns:
    df[column] = _parse_bool_column(df[column])

#Cache the cleaned dataframe on disk so that reloading it later is quicker than redoing the cleaning
df.to_pickle(path='cleaned_fire_data')
        