In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

# Plot appearance shared by every figure in the notebook:
# the "Set2" colour cycle drawn on the "ticks" axes style.
sns.set_palette(palette="Set2")
sns.set_style(style="ticks")

## Load Data

In [13]:
# Crime records; read_csv infers the zip compression from the file extension.
crimes = pd.read_csv(filepath_or_buffer="../data/train.zip")

In [14]:
# Weather observations table (one "date" column plus the measurements).
weather = pd.read_csv(filepath_or_buffer="../data/sf_weather.csv")

In [15]:
# Football schedule table; it is joined to the crime data on its "Dates" column later on.
football = pd.read_csv(filepath_or_buffer="../data/49schedule.csv")

## Data Preprocessing 
Let's begin by checking each column for missing values.

In [16]:
# Percentage of null entries per column of the crime table, as a one-column frame.
crimes_missing = (
    crimes.isnull()
    .mean()
    .mul(100)
    .to_frame(name="Missing Values %")
)
crimes_missing

Unnamed: 0,Missing Values %
Dates,0.0
Category,0.0
Descript,0.0
DayOfWeek,0.0
PdDistrict,0.0
Resolution,0.0
Address,0.0
X,0.0
Y,0.0


In [17]:
# Full-detail copy of the crime records (all columns kept), indexed by the
# parsed "Dates" timestamp. `assign` returns a new frame, so `crimes` itself
# is left untouched for the trimming step that follows.
crimes_original = (
    crimes
    .assign(Dates=pd.to_datetime(crimes["Dates"]))
    .set_index("Dates")
)

In [18]:
# Keep only the columns needed for the merge (Dates, DayOfWeek, PdDistrict).
# The frame is rebound instead of dropped in place, and errors="ignore" lets the
# cell be re-run without a KeyError once the columns are already gone.
crimes = crimes.drop(
    columns=["Category", "Descript", "Resolution", "Address", "X", "Y"],
    errors="ignore",
)
# Reduce the timestamp to a calendar date so it can be matched against the
# weather table's "Dates" column, which is built the same way.
crimes["Dates"] = pd.to_datetime(crimes["Dates"]).dt.date

In [19]:
# Preview the first five trimmed crime rows.
crimes.head(n=5)

Unnamed: 0,Dates,DayOfWeek,PdDistrict
0,2015-05-13,Wednesday,NORTHERN
1,2015-05-13,Wednesday,NORTHERN
2,2015-05-13,Wednesday,NORTHERN
3,2015-05-13,Wednesday,NORTHERN
4,2015-05-13,Wednesday,PARK


Now let's run the same missing-value check on the weather data and prepare it for merging.

In [20]:
# Percentage of null entries per column of the weather table, as a one-column frame.
weather_missing = (
    weather.isnull()
    .mean()
    .mul(100)
    .to_frame(name="Missing Values %")
)
weather_missing

Unnamed: 0,Missing Values %
date,0.0
avg_temp (celsius),0.0
precipitation (mm),0.0
wind_speed (km/h),0.0
visibility (km),0.0
"moon_illumination, %",0.0


In [21]:
# Add a "Dates" column holding plain calendar dates parsed from "date";
# this is the key the crime table is merged on.
parsed_days = pd.to_datetime(weather["date"])
weather["Dates"] = parsed_days.dt.date

In [22]:
# Full-detail copy of the weather table (all columns kept), indexed by "Dates"
# converted back to pandas timestamps. `assign` returns a new frame, so
# `weather` itself is not modified here.
weather_original = (
    weather
    .assign(Dates=pd.to_datetime(weather["Dates"]))
    .set_index("Dates")
)

In [23]:
# Drop the raw "date" text column (superseded by "Dates") and the moon
# illumination column, which is not carried into the merged data.
# The frame is rebound instead of dropped in place, and errors="ignore" lets the
# cell be re-run without a KeyError once the columns are already gone.
weather = weather.drop(
    columns=["date", "moon_illumination, %"],
    errors="ignore",
)

In [24]:
# Preview the first five trimmed weather rows.
weather.head(n=5)

Unnamed: 0,avg_temp (celsius),precipitation (mm),wind_speed (km/h),visibility (km),Dates
0,9,0.0,3,15.0,2003-01-01
1,10,0.0,5,15.0,2003-01-02
2,10,0.0,5,15.0,2003-01-03
3,11,0.0,5,15.0,2003-01-04
4,11,0.25,5,6.0,2003-01-05


Now let's merge the datasets into one final DataFrame.

In [28]:
# Step 1: attach the day's weather to every crime record. Both tables carry
# "Dates" as plain calendar dates at this point, so the keys compare equal.
# NOTE(review): a duplicated date in `weather` or `football` would multiply crime
# rows in these left joins; consider validate="many_to_one" once the inputs
# are confirmed to have unique dates.
data = pd.merge(crimes, weather, how="left", on="Dates")
data["Dates"] = pd.to_datetime(data["Dates"])

# Step 2: attach the football schedule. `football` is loaded with a plain
# read_csv (no parse_dates), so its "Dates" must be parsed before joining on a
# datetime64 key: merging datetime64 against text either raises or matches
# nothing. Parsing on a copy leaves `football` unchanged, and is a no-op if the
# column is already datetime.
football_by_day = football.assign(Dates=pd.to_datetime(football["Dates"]))
data = pd.merge(data, football_by_day, how="left", on="Dates")

# Step 3: index the merged frame by date; set_index removes the "Dates" column.
data = data.set_index("Dates")

In [29]:
# Strip the unit suffixes so the weather columns are easier to reference.
unitless_labels = {
    "avg_temp (celsius)": "avg_temp",
    "precipitation (mm)": "precipitation",
    "wind_speed (km/h)": "wind_speed",
    "visibility (km)": "visibility",
}
data = data.rename(columns=unitless_labels)

In [30]:
# Make precipitation numeric: the text marker "T" becomes NaN, then the column
# is cast to float. The result is assigned back to the column because an
# in-place replace on `data["precipitation"]` acts on a temporary selection and
# does not update `data` under pandas Copy-on-Write.
# NOTE(review): "T" presumably marks trace precipitation; it is treated as
# missing here rather than as 0 - confirm that is intended.
data["precipitation"] = (
    data["precipitation"]
    .replace({"T": np.nan})
    .astype("float")
)

In [31]:
# Make wind speed numeric: the stray text value "Waning Crescent, 14" becomes
# NaN, then the column is cast to float. The result is assigned back to the
# column because an in-place replace on `data["wind_speed"]` acts on a temporary
# selection and does not update `data` under pandas Copy-on-Write.
# NOTE(review): the stray value looks like a moon-phase entry that leaked into
# this column - TODO confirm against the raw weather CSV.
data["wind_speed"] = (
    data["wind_speed"]
    .replace({"Waning Crescent, 14": np.nan})
    .astype("float")
)

In [32]:
# Rows with no matching schedule entry come out of the left merge as NaN in the
# NFL columns; give them explicit "no game" values. Each filled column is
# assigned back because an in-place fillna on `data[col]` acts on a temporary
# selection and does not update `data` under pandas Copy-on-Write.
no_game_defaults = {
    "NFL_Output": "No Game",
    "NFL_Home": 3,  # 3 indicates no game for that day
    "NFL_Game_Day": 0,
}
for nfl_column, default_value in no_game_defaults.items():
    data[nfl_column] = data[nfl_column].fillna(default_value)

In [33]:
# Map the text spellings of the home flag ("0", "1", "0.0", "1.0") to integers.
# Values not listed (including the 3 used for "no game") pass through
# unchanged. The result is assigned back because an in-place replace on
# `data["NFL_Home"]` acts on a temporary selection and does not update `data`
# under pandas Copy-on-Write.
home_flag_from_text = {
    "0": 0,
    "1": 1,
    "0.0": 0,
    "1.0": 1,
}
data["NFL_Home"] = data["NFL_Home"].replace(home_flag_from_text)

In [34]:
# Store the game-day flag as an integer column.
data = data.astype({"NFL_Game_Day": "int"})

In [35]:
# Preview the first five rows of the merged, cleaned frame.
data.head(n=5)

Unnamed: 0_level_0,DayOfWeek,PdDistrict,avg_temp,precipitation,wind_speed,visibility,NFL_Output,NFL_Home,NFL_Game_Day
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-05-13,Wednesday,NORTHERN,15,0.0,19.0,40.0,No Game,3,0
2015-05-13,Wednesday,NORTHERN,15,0.0,19.0,40.0,No Game,3,0
2015-05-13,Wednesday,NORTHERN,15,0.0,19.0,40.0,No Game,3,0
2015-05-13,Wednesday,NORTHERN,15,0.0,19.0,40.0,No Game,3,0
2015-05-13,Wednesday,PARK,15,0.0,19.0,40.0,No Game,3,0
