In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import re

nhl_df=pd.read_csv("assets/nhl.csv")
cities=pd.read_html("assets/wikipedia_data.html")[1]
cities=cities.iloc[:-1,[0,3,5,6,7,8]]
cities.head()

Unnamed: 0,Metropolitan area,Population (2016 est.)[8],NFL,MLB,NBA,NHL
0,New York City,20153634,GiantsJets[note 1],YankeesMets[note 2],KnicksNets,RangersIslandersDevils[note 3]
1,Los Angeles,13310447,RamsChargers[note 4],DodgersAngels,LakersClippers,KingsDucks
2,San Francisco Bay Area,6657982,49ersRaiders[note 6],GiantsAthletics,Warriors,Sharks[note 7]
3,Chicago,9512999,Bears[note 8],CubsWhite Sox,Bulls[note 9],Blackhawks
4,Dallas–Fort Worth,7233323,Cowboys,Rangers,Mavericks,Stars


In [2]:
sports = ['Metropolitan area', 'Population', 'NFL','MLB','NBA','NHL']
cities.columns = sports 
def cleaning(item):
    #s = re.sub('\[.*\]', '', item)
    s = re.sub(r'([a-z])([A-Z])', r'\1 \2', item)
    return s

def cleaning_2(item):
    s = re.sub('\[.*\]', '', item)
    #s = re.sub(r'([a-z])([A-Z])', r'\1 \2', item)
    return s

def cleaning_3(item):
    s = re.sub('\*$', '', item)
    return s


def apply_cleaning(clen_fun, dataframe, columns):
    for fun in clen_fun:
        for col in columns:
            dataframe[col]  = dataframe[col].apply(fun)
    return dataframe

functions = [cleaning, cleaning_2, cleaning_3]

apply_cleaning(functions, cities, sports)
cities.sort_values('Metropolitan area', axis = 0, inplace=True)
cities.reset_index(drop=True,inplace=True)
cities.head()

Unnamed: 0,Metropolitan area,Population,NFL,MLB,NBA,NHL
0,Atlanta,5789700,Falcons,Braves,Hawks,
1,Baltimore,2798886,Ravens,Orioles,,—
2,Boston,4794447,Patriots,Red Sox,Celtics,Bruins
3,Buffalo,1132804,Bills,,,Sabres
4,Calgary,1392609,—,—,—,Flames


In [3]:
city_replace = {'Dallas–Fort Worth':'Dallas','Miami–Fort Lauderdale':'Miami','Minneapolis–Saint Paul':'Minneapolis','New York City':'New York','San Francisco Bay Area':'San Francisco','Tampa Bay Area':'Tampa Bay','Washington, D.C.':'Washington','':'None','—':'None','— ':'None'}
cities.replace(to_replace=city_replace,value = None, inplace=True)


In [4]:
nhl_df = nhl_df[nhl_df['year'] == 2018]
nhl_df.drop([0,9,18,26], axis = 0, inplace= True)
nhl_df = nhl_df[['team','W','L',]]
replace_nhl = {'Toronto Maple':'Toronto','Leafs':'Maple Leafs','Detroit Red':'Detroit','Wings':'Red Wings','Columbus Blue':'Columbus','Jackets':'Blue Jackets','Vegas Golden':'Las Vegas','Knights':'Golden Knights','Florida':'Miami','San Jose':'San Francisco','Colorado':'Denver','Minnesota':'Minneapolis','Arizona':'Phoenix','Carolina':'Raleigh','New Jersey':'New York','Anaheim':'Los Angeles'}

In [5]:
apply_cleaning(functions, nhl_df, list(nhl_df.columns))
nhl_df.head()

Unnamed: 0,team,W,L
1,Tampa Bay Lightning,54,23
2,Boston Bruins,50,20
3,Toronto Maple Leafs,49,26
4,Florida Panthers,44,30
5,Detroit Red Wings,30,39


In [6]:
def splitname(row):
    row['Team']=row['team'].split(" ")[-1]
    s=row['team'].split(" ")[:-1]
    row['City']=' '.join(map(str, s)) 
    return row

nhl_df=nhl_df.apply(splitname, axis='columns')
del(nhl_df['team'])
nhl_df.sort_values('City', axis=0, inplace=True)
nhl_df.replace(to_replace=replace_nhl,value=None,inplace=True)
nhl_df = nhl_df.set_index('City')
nhl_df.head()

Unnamed: 0_level_0,W,L,Team
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Los Angeles,44,25,Ducks
Phoenix,29,41,Coyotes
Boston,50,20,Bruins
Buffalo,25,45,Sabres
Calgary,37,35,Flames


In [7]:
cities_nhl = cities[cities['NHL'] != 'None']
cities_nhl = cities_nhl[['Metropolitan area','Population','NHL']]
cities_nhl = cities_nhl.set_index('Metropolitan area')
cities_nhl.index.rename('City', inplace=True)
print('nr of cities: {}, nr of NHL teams: {}'.format(cities_nhl.shape[0],nhl_df.shape[0]))

nr of cities: 28, nr of NHL teams: 31


In [8]:
#cities_nhl[~cities_nhl['Metropolitan area'].isin(nhl_df['City'])] 
#nhl_df[~nhl_df.index.isin(cities_nhl.index)]
#Check what cities are not matching the names

In [9]:
cities_nhl = cities_nhl.merge(nhl_df, left_index=True, right_index=True)
cities_nhl.drop(labels='NHL', axis = 1, inplace=True)
cities_nhl['W/L Ratio'] = cities_nhl['W'].astype('float64')/(cities_nhl['L'].astype('float64') + cities_nhl['W'].astype('float64'))
cities_nhl.head()

Unnamed: 0_level_0,Population,W,L,Team,W/L Ratio
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boston,4794447,50,20,Bruins,0.714286
Buffalo,1132804,25,45,Sabres,0.357143
Calgary,1392609,37,35,Flames,0.513889
Chicago,9512999,33,39,Blackhawks,0.458333
Columbus,2041520,45,30,Blue Jackets,0.6


In [10]:
cities_nhl['Avg. Ratio'] = cities_nhl.groupby(cities_nhl.index).agg(np.mean)
cities_nhl['Population'] = cities_nhl['Population'].astype('int64')
corr1 = cities_nhl['Population'].corr(cities_nhl['Avg. Ratio'],method='pearson')
print('answer to question 1: {}'.format(corr1))

answer to question 1: -0.04358264468600224


In [11]:
ans_df = cities_nhl[['Population','Avg. Ratio']]
ans_df.drop_duplicates(inplace=True)

In [12]:
population_by_region = list(ans_df['Population'])
win_loss_by_region = list(ans_df['Avg. Ratio'])

In [13]:
stats.pearsonr(population_by_region, win_loss_by_region)[0]

0.012486162921209923