In [312]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import re

# Raw inputs. NOTE: despite its name, `nhl_df` holds the table read from
# nfl.csv; the name is kept because the following cells refer to it.
nhl_df = pd.read_csv("assets/nfl.csv")

# read_html returns a list of tables; the one at position 1 is used here.
cities = pd.read_html("assets/wikipedia_data.html")[1]
# Drop the last row and keep six columns by position. The explicit copy makes
# `cities` an independent frame, so the column assignments and other
# modifications in later cells do not act on a slice of the parsed table.
cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]].copy()

# League analysed in this notebook; later used as a column name of `cities`.
big4 = 'NFL'

In [313]:
# New headers for the six retained columns of `cities`. The list is called
# `sports` although its first two entries are descriptive columns, not leagues.
sports = [
    'Metropolitan area',
    'Population',
    'NFL',
    'MLB',
    'NBA',
    'NHL',
]
cities.columns = sports  # positional rename of the cities columns
def cleaning(item):
    """Insert a space at every lowercase-to-uppercase letter boundary.

    Used to split names that were glued together, e.g. ``'ColdCase'``
    becomes ``'Cold Case'``.

    Parameters
    ----------
    item : str
        Text to process.

    Returns
    -------
    str
        ``item`` with a single space inserted between each lowercase ASCII
        letter and the uppercase ASCII letter that directly follows it.
    """
    camel_boundary = re.compile(r'([a-z])([A-Z])')
    return camel_boundary.sub(r'\1 \2', item)

def cleaning_2(item):
    """Remove every square-bracketed annotation (e.g. ``[note 1]``) from text.

    Each ``[...]`` group is removed on its own. The match is non-greedy so
    that text sitting between two annotations is preserved; a greedy
    ``.*`` would delete everything from the first ``[`` to the last ``]``.

    Parameters
    ----------
    item : str
        Text to process.

    Returns
    -------
    str
        ``item`` without its bracketed annotations.
    """
    # Raw string: ``\[`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\[.*?\]', '', item)

def cleaning_3(item): #clean the ends
    """Trim surrounding whitespace and drop one trailing ``*`` or ``+`` marker.

    Parameters
    ----------
    item : str
        Text to process.

    Returns
    -------
    str
        The stripped text, without a final ``*`` or ``+`` if one was present.
    """
    # str.strip returns a new string, so its result must be kept. Stripping
    # first also lets the end-anchored pattern see a marker that was followed
    # by whitespace.
    trimmed = item.strip()
    return re.sub(r'[*+]$', '', trimmed)


def apply_cleaning(clen_fun, dataframe, columns):
    """Run each cleaning function over each listed column of a DataFrame.

    The functions are applied in the order given; every function is run over
    all listed columns before the next function starts. Each column is cast
    to pandas' ``'string'`` dtype before the function is applied element-wise,
    and the result is written back into ``dataframe``.

    Parameters
    ----------
    clen_fun : iterable of callables
        Cleaning functions, each taking one value and returning the cleaned
        value.
    dataframe : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable
        Labels of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    for cleaner in clen_fun:
        for label in columns:
            as_text = dataframe[label].astype('string')
            dataframe[label] = as_text.apply(cleaner)
    return dataframe

# Cleaning steps, in the order they are applied to each column.
functions = [cleaning, cleaning_2, cleaning_3]

# apply_cleaning writes the cleaned columns back into `cities`.
apply_cleaning(functions, cities, sports)

# Order the rows alphabetically by metropolitan area and renumber them from 0.
cities = cities.sort_values('Metropolitan area').reset_index(drop=True)
cities.head()

Unnamed: 0,Metropolitan area,Population,NFL,MLB,NBA,NHL
0,Atlanta,5789700,Falcons,Braves,Hawks,
1,Baltimore,2798886,Ravens,Orioles,,—
2,Boston,4794447,Patriots,Red Sox,Celtics,Bruins
3,Buffalo,1132804,Bills,,,Sabres
4,Calgary,1392609,—,—,—,Flames


In [314]:
# Shorten the metropolitan-area names to plain city names and turn the
# "no team" placeholders (empty string, dash, dash plus space) into the
# string 'None', which later cells test for.
city_replace = {
    'Dallas–Fort Worth': 'Dallas',
    'Miami–Fort Lauderdale': 'Miami',
    'Minneapolis–Saint Paul': 'Minneapolis',
    'New York City': 'New York',
    'San Francisco Bay Area': 'San Francisco',
    'Tampa Bay Area': 'Tampa Bay',
    'Washington, D.C.': 'Washington',
    '': 'None',
    '—': 'None',
    '— ': 'None',
}
# Pass the mapping on its own. On recent pandas releases an explicit
# `value=None` is not the same as omitting `value`: the dict keys are then
# read as column names and no cell is replaced.
cities = cities.replace(city_replace)
cities #this is the cleaned database for cities and their teams

Unnamed: 0,Metropolitan area,Population,NFL,MLB,NBA,NHL
0,Atlanta,5789700,Falcons,Braves,Hawks,
1,Baltimore,2798886,Ravens,Orioles,,
2,Boston,4794447,Patriots,Red Sox,Celtics,Bruins
3,Buffalo,1132804,Bills,,,Sabres
4,Calgary,1392609,,,,Flames
5,Charlotte,2474314,Panthers,,Hornets,
6,Chicago,9512999,Bears,Cubs White Sox,Bulls,Blackhawks
7,Cincinnati,2165139,Bengals,Reds,,
8,Cleveland,2055612,Browns,Indians,Cavaliers,
9,Columbus,2041520,,,,Blue Jackets


In [315]:
# Keep only the rows of the 2018 season.
nhl_df = nhl_df.loc[nhl_df['year'].eq(2018)]

In [316]:
# Remove eight rows by index label.
# NOTE(review): labels 0, 5, ..., 35 are presumably the division header rows
# of the 2018 table -- confirm against the CSV.
nhl_df = nhl_df.drop(index=[0, 5, 10, 15, 20, 25, 30, 35])

# Keep only the columns used below. The copy makes the following column
# assignments act on an independent frame instead of a slice of the filtered
# table, which avoids pandas' chained-assignment warning.
nhl_df = nhl_df[['team', 'W', 'L']].copy()

# Team-location names that differ from the city names used in `cities`.
replace_nhl = {
    'New England': 'Boston',
    'Carolina': 'Charlotte',
    'Minnesota': 'Minneapolis',
    'Tennessee': 'Nashville',
    'Arizona': 'Phoenix',
    'Oakland': 'San Francisco',
}

# Trim surrounding whitespace in every kept column (all are accessed as text).
for column in nhl_df.columns:
    nhl_df[column] = nhl_df[column].str.strip()

In [317]:
# Run the cleaning functions over every column of the team table; the frame
# is modified in place by apply_cleaning.
apply_cleaning(functions, nhl_df, nhl_df.columns.tolist())
def splitname(row):
    """Split ``row['team']`` at its last space into a city and a team name.

    The text after the last space is stored under ``'Team'`` and everything
    before it under ``'City'``. When there is no space, ``'Team'`` receives
    the whole value and ``'City'`` an empty string.

    Parameters
    ----------
    row : mapping with a ``'team'`` entry (e.g. a DataFrame row)
        Record to extend. It is modified in place.

    Returns
    -------
    The same ``row`` object with ``'Team'`` and ``'City'`` set.
    """
    city, _sep, nickname = row['team'].rpartition(" ")
    # 'Team' is assigned before 'City' to keep the resulting column order.
    row['Team'] = nickname
    row['City'] = city
    return row

# Derive the 'Team' and 'City' columns row by row, then discard the combined
# 'team' column they were built from.
nhl_df = nhl_df.apply(splitname, axis=1).drop(columns='team')
nhl_df

Unnamed: 0,W,L,Team,City
1,11,5,Patriots,New England
2,7,9,Dolphins,Miami
3,6,10,Bills,Buffalo
4,4,12,Jets,New York
6,10,6,Ravens,Baltimore
7,9,6,Steelers,Pittsburgh
8,7,8,Browns,Cleveland
9,6,10,Bengals,Cincinnati
11,11,5,Texans,Houston
12,10,6,Colts,Indianapolis


In [318]:
# Map the "special" team locations (e.g. 'New England') onto the city names
# used in `cities`. The mapping is passed on its own: on recent pandas
# releases an explicit `value=None` is not the same as omitting `value`, and
# the dict keys would be read as column names so nothing would be replaced.
nhl_df = nhl_df.replace(replace_nhl)

# Index the teams by city and order them alphabetically by that city name.
nhl_df = nhl_df.set_index('City').sort_values('City')
nhl_df

Unnamed: 0_level_0,W,L,Team
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlanta,7,9,Falcons
Baltimore,10,6,Ravens
Boston,11,5,Patriots
Buffalo,6,10,Bills
Charlotte,7,9,Panthers
Chicago,12,4,Bears
Cincinnati,6,10,Bengals
Cleveland,7,8,Browns
Dallas,10,6,Cowboys
Denver,6,10,Broncos


In [319]:
# Keep the metro areas whose `big4` column is not the 'None' placeholder,
# i.e. those with a team in the league being analysed. Retain only the name,
# the population and the league column, and index the result by city name
# under the label 'City'.
cities_nhl = (
    cities.loc[cities[big4] != 'None', ['Metropolitan area', 'Population', big4]]
    .set_index('Metropolitan area')
    .rename_axis('City')
)
print('nr of cities: {}, nr of {} teams: {}'.format(len(cities_nhl), big4, len(nhl_df)))

nr of cities: 29, nr of NFL teams: 32


### Check whether any city names still need manual cleaning

In [320]:
# Team rows whose City label has no counterpart in cities_nhl, i.e. names
# that still need an entry in the replacement mapping. The reverse check
# (cities without a matching team row) would be:
#   cities_nhl[~cities_nhl.index.isin(nhl_df.index)]
nhl_df.loc[~nhl_df.index.isin(cities_nhl.index)]

Unnamed: 0_level_0,W,L,Team
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [321]:
# Attach the team records to their city (join on the shared City index); a
# city with several teams yields one row per team. The league column is no
# longer needed once the teams are joined.
cities_nhl = cities_nhl.merge(nhl_df, left_index=True, right_index=True).drop(columns=big4)

# Share of wins among wins plus losses (W and L are stored as text).
wins = cities_nhl['W'].astype('float64')
losses = cities_nhl['L'].astype('float64')
cities_nhl['W/L Ratio'] = wins / (losses + wins)
print(cities_nhl.shape)
cities_nhl


(32, 5)


Unnamed: 0_level_0,Population,W,L,Team,W/L Ratio
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Atlanta,5789700,7,9,Falcons,0.4375
Baltimore,2798886,10,6,Ravens,0.625
Boston,4794447,11,5,Patriots,0.6875
Buffalo,1132804,6,10,Bills,0.375
Charlotte,2474314,7,9,Panthers,0.4375
Chicago,9512999,12,4,Bears,0.75
Cincinnati,2165139,6,10,Bengals,0.375
Cleveland,2055612,7,8,Browns,0.466667
Dallas,7233323,10,6,Cowboys,0.625
Denver,2853077,6,10,Broncos,0.375


In [322]:
# Per-city mean of the win ratio, broadcast back to every team row of that
# city. Only the numeric 'W/L Ratio' column is aggregated: averaging the
# whole frame relied on pandas silently dropping the text columns, which
# recent releases no longer do.
cities_nhl['Avg. Ratio'] = cities_nhl.groupby(level=0)['W/L Ratio'].transform('mean')
cities_nhl['Population'] = cities_nhl['Population'].astype('int64')

# NOTE(review): this correlation is taken over one row per team, so a city
# with several teams contributes several identical (Population, Avg. Ratio)
# points. The per-city figure is computed from de-duplicated rows further down.
corr1 = cities_nhl['Population'].corr(cities_nhl['Avg. Ratio'], method='pearson')
print('answer to question 1: {}'.format(corr1))

answer to question 1: -0.048530396034932394


In [323]:
# Collapse repeated (Population, Avg. Ratio) pairs so each pair appears once.
# Chaining returns a new frame instead of de-duplicating a column subset in
# place, which avoids pandas' chained-assignment warning.
ans_df = cities_nhl[['Population', 'Avg. Ratio']].drop_duplicates()

In [324]:
# Plain lists of the two series to correlate, in matching row order.
population_by_region, win_loss_by_region = (
    list(ans_df[column]) for column in ('Population', 'Avg. Ratio')
)

In [325]:
# Pearson correlation between population and average win ratio; only the
# coefficient (first element of the result) is displayed.
correlation, p_value = stats.pearsonr(population_by_region, win_loss_by_region)
correlation

0.004922112149349409

In [326]:
# Persist the merged per-team table, including its City index, as CSV.
cities_nhl.to_csv(path_or_buf='assets/cleaned_NFL.csv')