## NFL Arrests Visualization Project
Allen Butt

## Dataset 1

In [None]:
import pandas as pd
import re
import numpy as np
# Load the per-game arrest records that the rest of the notebook cleans up.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file; confirm it is the intended codec (e.g. versus 'latin-1').
arrests_csv = "nfl_arrests_2011-2015.csv"
data = pd.read_csv(arrests_csv, encoding="unicode_escape")

In [None]:
#OT_flag: a blank means "no overtime" (0) and the marker 'OT' means overtime (1).
ot_flag = data["OT_flag"].fillna(0).replace({"OT": 1})
data["OT_flag"] = pd.to_numeric(ot_flag)

#division_game: recode 'n' -> 0 and 'y' -> 1 so it is numeric as well.
division_flag = data["division_game"].replace({"n": 0, "y": 1})
data["division_game"] = pd.to_numeric(division_flag)
data.head()

In [None]:
#Some observations have no arrest count--drop them from the dataframe.
#Resetting the index afterwards keeps the row labels contiguous (0..n-1); with gaps
#left in place, adding a row under the label len(data) could overwrite a surviving row.
data = data.dropna(subset=["arrests"]).reset_index(drop=True)

In [None]:
#Some games were played in London and so have missing data.
#We can impute the missing values taking the mean of the arrests of the same year for that team.

#Create a function to help with this process
def imputeLondon(year, home, away, homescore, awayscore, OT, division):
    """Append one game to the global ``data`` frame with an imputed arrest count.

    The arrest count is the mean of ``arrests`` over the rows already in
    ``data`` whose ``home_team`` equals ``home`` and whose ``season`` equals
    ``year``; it is NaN when no such rows exist.

    The new row is written positionally as
    ``[year, 0, 0, 0, home, away, homescore, awayscore, OT, arrests, division]``,
    so ``data`` must have exactly 11 columns in that order (the three zeros
    are placeholders for the columns between the season and the home team).
    Row labels are assumed to be integers.

    Returns None; ``data`` is modified in place.
    """
    same_team_and_year = (data["home_team"] == home) & (data["season"] == year)
    imputed_arrests = data.loc[same_team_and_year, "arrests"].mean()

    # A plain list keeps every value's own type. Wrapping the mixed
    # ints/strings/floats in np.array() would coerce all of them to strings
    # and silently turn the numeric columns into text.
    new_row = [year, 0, 0, 0, home, away, homescore, awayscore, OT,
               imputed_arrests, division]

    # Pick a label past the current maximum instead of len(data): once rows
    # have been filtered out, len(data) can be the label of an existing row,
    # and assigning to it would overwrite that row rather than append.
    next_label = 0 if data.empty else data.index.max() + 1
    data.loc[next_label] = new_row

In [None]:
#Use the function to fill in the missing data with imputed values.
#Each entry is (year, home, away, homescore, awayscore, OT, division).
#NOTE(review): the scores were entered by hand; the Arizona-Houston (30-9) and
#Jacksonville-Dallas (17-32) lines may not match the published results--verify.
london_games = [
    (2013, "Arizona", "Houston", 30, 9, 0, 0),
    (2013, "Jacksonville", "San Francisco", 10, 42, 0, 0),
    (2014, "Jacksonville", "Dallas", 17, 32, 0, 0),
    (2015, "Jacksonville", "Buffalo", 34, 31, 0, 0),
    (2015, "Kansas City", "Detroit", 45, 10, 0, 0),
    (2015, "Miami", "New York Jets", 14, 27, 0, 1),
    (2014, "Oakland", "Miami", 14, 38, 0, 0),
    (2014, "Oakland", "Denver", 17, 41, 0, 1),
    #Oakland and Kansas City are both AFC West teams, so this is a division game.
    (2014, "Oakland", "Kansas City", 24, 20, 0, 1),
    (2011, "Tampa Bay", "Chicago", 18, 24, 0, 0),
]
for game in london_games:
    imputeLondon(*game)

In [None]:
#Three teams had a missing year of data--we can impute this data by taking the mean of the existing years.
def imputeYear(year, home, away, homescore, awayscore, OT, division):
    """Append one game to the global ``data`` frame for a team's missing season.

    The arrest count is the mean of ``arrests`` over every row already in
    ``data`` whose ``home_team`` equals ``home`` (all seasons); it is NaN when
    no such rows exist.

    The new row is written positionally as
    ``[year, 0, 0, 0, home, away, homescore, awayscore, OT, arrests, division]``,
    so ``data`` must have exactly 11 columns in that order (the three zeros
    are placeholders for the columns between the season and the home team).
    Row labels are assumed to be integers.

    Returns None; ``data`` is modified in place and its ``arrests`` column is
    numeric afterwards.
    """
    # Coerce before averaging so the mean is taken over numbers even when
    # earlier appends left text values in the column.
    data["arrests"] = pd.to_numeric(data["arrests"])
    imputed_arrests = data.loc[data["home_team"] == home, "arrests"].mean()

    # A plain list keeps every value's own type. Wrapping the mixed
    # ints/strings/floats in np.array() would coerce all of them to strings,
    # which is what made the numeric columns turn into text.
    new_row = [year, 0, 0, 0, home, away, homescore, awayscore, OT,
               imputed_arrests, division]

    # Pick a label past the current maximum instead of len(data): once rows
    # have been filtered out, len(data) can be the label of an existing row,
    # and assigning to it would overwrite that row rather than append.
    next_label = 0 if data.empty else data.index.max() + 1
    data.loc[next_label] = new_row

#Each entry is (year, home, away, homescore, awayscore, OT, division).
missing_season_games = [
    (2012, "Baltimore", "Cincinnati", 44, 13, 0, 1),
    (2012, "Baltimore", "New England", 31, 30, 0, 0),
    (2012, "Baltimore", "Cleveland", 23, 16, 0, 1),
    #Dallas plays in the NFC East and Baltimore in the AFC North: not a division game.
    (2012, "Baltimore", "Dallas", 31, 29, 0, 0),
    (2012, "Baltimore", "Oakland", 55, 20, 0, 0),
    (2012, "Baltimore", "Pittsburgh", 20, 23, 0, 1),
    (2012, "Baltimore", "Denver", 17, 34, 0, 0),
    (2012, "Baltimore", "New York Giants", 33, 14, 0, 0),

    (2015, "Chicago", "Green Bay", 23, 31, 0, 1),
    (2015, "Chicago", "Arizona", 23, 48, 0, 0),
    (2015, "Chicago", "Oakland", 22, 20, 0, 0),
    (2015, "Chicago", "Minnesota", 20, 23, 0, 1),
    (2015, "Chicago", "Denver", 15, 17, 0, 0),
    (2015, "Chicago", "San Francisco", 20, 26, 1, 0),
    (2015, "Chicago", "Washington", 21, 24, 0, 0),
    (2015, "Chicago", "Detroit", 20, 24, 0, 1),

    (2011, "Miami", "New England", 24, 38, 0, 1),
    (2011, "Miami", "Houston", 13, 23, 0, 0),
    (2011, "Miami", "Denver", 15, 18, 1, 0),
    #Home score corrected from 2 to 20 (Miami won this game 20-9).
    #NOTE(review): confirm against the official box score.
    (2011, "Miami", "Washington", 20, 9, 0, 0),
    (2011, "Miami", "Buffalo", 35, 8, 0, 1),
    (2011, "Miami", "Oakland", 34, 14, 0, 0),
    (2011, "Miami", "Philadelphia", 10, 26, 0, 0),
    (2011, "Miami", "New York Jets", 19, 17, 0, 1),
]
for game in missing_season_games:
    imputeYear(*game)

In [None]:
#For one of the visualizations, we need to sort the dataset according to team, then year, then week--then an index will need
#to be added to keep things properly sorted.
data = data.sort_values(by=["home_team", "season", "week_num"])

#Number the rows 0..n-1 in their sorted order. Assigning by column name (rather than
#writing cell by cell through column position 11) keeps this correct if the number of
#columns ever changes, and avoids a Python-level loop over every row.
data["Index_num"] = range(len(data))

In [None]:
#Write the cleaned per-game arrests table to CSV, leaving out the row index.
arrests_out = "nfl_arrests.csv"
data.to_csv(arrests_out, index=False)

## Dataset 2

In [None]:
#Second dataset: NFL player arrests.
#NOTE(review): 'unicode_escape' also interprets backslash sequences found in the
#file; confirm it is the intended codec (e.g. versus 'latin-1').
player_arrests_csv = "nfl_player_arrests.csv"
data2 = pd.read_csv(player_arrests_csv, encoding="unicode_escape")
#Preview the first rows and look for missing data.
data2.head()

In [None]:
#Team names have to be standardized. This helper CSV pairs each team name with the
#city name used in the first dataset, which makes the conversion a simple lookup.
names_conversion_csv = "nfl_names_conversion.csv"
data3 = pd.read_csv(names_conversion_csv, encoding="unicode_escape")
data3.head()

In [None]:
#Standardize team names: wherever TEAM in data2 equals a "Team_Name" in data3, replace it
#with the matching "Team_City". A new "Found" column on data2 is set to 1 for matched rows
#and 0 otherwise, which lets us delete all observations with teams outside of our dataset.

#Build a Team_Name -> Team_City lookup. Blank names never match anything, and if a name
#is listed more than once its first entry is used.
conversion = data3.dropna(subset=["Team_Name"]).drop_duplicates(subset="Team_Name")
city_by_name = conversion.set_index("Team_Name")["Team_City"]

#Columns are addressed by name, so this does not depend on TEAM or Found sitting at a
#particular column position, and the whole table is handled in one vectorized pass.
matched = data2["TEAM"].isin(city_by_name.index)
data2["Found"] = matched.astype(int)
data2.loc[matched, "TEAM"] = data2.loc[matched, "TEAM"].map(city_by_name)

data2.drop(data2[data2['Found'] == 0].index, inplace=True)
data2.head(20)

In [None]:
#Write the filtered player-arrests table to CSV, leaving out the row index.
players_out = "nfl_players.csv"
data2.to_csv(players_out, index=False)