# Analysis of NHL Data
 ## *Question: Is there a correlation between the population of the city the team is from and how many games they win?*

In [1]:
# Import usual libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

# Read and clean the city data
#
# Produces two notebook-level frames:
#   nhl_df - the raw contents of nhl.csv (filtered in a later cell)
#   cities - one row per metropolitan area that has an NHL entry, with the
#            columns 'Metropolitan area', 'Population' and 'team'

nhl_df = pd.read_csv("nhl.csv")

cities = (
    # The second table parsed from the saved Wikipedia page holds the city data.
    pd.read_html('wikipedia_data.html')[1]
    # Drop the last row and keep six columns selected by position.
    .iloc[:-1, [0, 3, 5, 6, 7, 8]]
    # Only the NHL column is needed for this analysis.
    .drop(['NFL', 'MLB', 'NBA'], axis=1)
    # Strip bracketed tokens of the form "[word digits]" from every cell.
    # Raw string: '\[', '\w', '\s', '\d' are invalid escapes in a normal literal.
    .replace(r'(\[)(\w)*(\s)(\d)*(\])', '', regex=True)
    # A lone dash or an empty string means "no value"; convert both to NaN so
    # that dropna() removes those rows.
    .replace('—', np.nan)
    .replace('', np.nan)
    .dropna()
    .rename(columns={'NHL': 'team', 'Population (2016 est.)[8]': 'Population'})
)

In [2]:
# Read and clean the NHL data
#
# Narrows nhl_df to the 2018 season and reduces it to the columns
# 'team', 'W' and 'L', with team names normalised for the later merge.

nhl_df_mask = nhl_df['year'] == 2018

nhl_df = (
    # Select the 2018 rows directly, then discard any row with a missing value.
    nhl_df.loc[nhl_df_mask]
    .dropna()
    .loc[:, ['team', 'W', 'L']]
    # NOTE(review): rows are removed by hard-coded index label; presumably these
    # are non-team rows (e.g. division headers) - confirm against nhl.csv.
    .drop([0, 9, 18, 26])
    # Remove literal asterisks from every string cell.
    .replace(r'(\*)', '', regex=True)
    # Remove every word that is followed by whitespace, leaving only the final
    # word of each string.
    # NOTE(review): a multi-word team nickname is therefore cut down to its last
    # word; confirm the 'team' values in `cities` use the same form, otherwise
    # those teams will not match when the two tables are merged.
    .replace(r'((\w)*(\s))+', '', regex=True)
)

In [6]:
# Adding in some missing data
#
# Adds rows for teams that share a metropolitan area with another team, so
# that each team has its own row carrying that area's population.
df = pd.DataFrame({'Metropolitan area':['New York City','New York City'],'Population':['20153634','20153634'],'team':['Islanders','Devils']})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True renumbers the rows 0..n-1.
cities = pd.concat([cities, df], ignore_index=True)

# Assign through a single .loc[row, column] call. The chained form
# cities.iloc[i]['team'] = ... can write to a temporary copy and leave the
# frame unchanged. Labels equal positions here because of the renumbering above.
# NOTE(review): assumes rows 1 and 0 are the Los Angeles and New York City
# entries respectively - confirm against the cleaned table.
cities.loc[1, 'team'] = 'Kings'
cities.loc[0, 'team'] = 'Rangers'

# Append one more row at the next free integer label (values in column order).
cities.loc[len(cities.index)] = ['Los Angeles','13310447','Ducks']

In [8]:
# Merge data sets
#
# Builds nhldata: one row per team, indexed by metropolitan area, with the
# win ratio and the win ratio per head of population, sorted by the latter.
nhldata = (
    # merge() defaults to an inner join, so only teams whose name appears in
    # both tables are kept.
    cities.merge(nhl_df, on='team')
    .set_index('Metropolitan area')
    # Counts and population must be integers before any arithmetic.
    .astype({'W': int, 'L': int, 'Population': int})
    .assign(
        games=lambda frame: frame['W'] + frame['L'],
        win_ratio=lambda frame: frame['W'] / frame['games'],
        # The column name contains '/', so it has to be passed via a dict.
        **{'win_ratio/population': lambda frame: frame['win_ratio'] / frame['Population']},
    )
    .sort_values(by=['win_ratio/population'], ascending=False)
)

<!-- Is there a significant correlation between population and win ratio? -->

In [11]:
# Pearson correlation between population and win ratio; the cell displays the
# correlation coefficient together with its p-value.
stats.pearsonr(
    x=nhldata['Population'],
    y=nhldata['win_ratio'],
)

(-0.019431769879490496, 0.9249355293652931)

### The correlation between population and win ratio is *r* = -0.02 with a *p*-value of 0.92, indicating no significant linear correlation between population and win ratio for NHL teams in 2018.