In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from bs4 import BeautifulSoup

# Question 1

In [2]:
# Load the NHL standings CSV into a DataFrame.
nhl_df = pd.read_csv("datasets/nhl.csv")

# pd.read_html parses every table found in the saved page and returns them as a list;
# index [1] selects the second parsed table.
cities = pd.read_html("datasets/List of American and Canadian cities by number of major professional sports franchises - Wikipedia.html")[1]

# Keep only the 2018 season, then remove the rows labelled 0, 9, 18 and 26
# (the rows the original author marked as unwanted — presumably non-team rows; TODO confirm).
# The non-inplace drop returns a new frame instead of mutating a slice of nhl_df,
# which is what previously triggered pandas' SettingWithCopyWarning.
nhl_df2 = nhl_df[nhl_df["year"] == 2018].drop([0, 9, 18, 26], axis=0)

# Cast the win and loss columns to int32 so they can be used in arithmetic.
nhl_df2 = nhl_df2.astype({"W": "int32", "L": "int32"})

# calculating win/loss ratio
    ## defining function
def w_l_ratio(frame):
    """Add a "W/L Ratio" entry to ``frame`` and return it.

    The ratio is wins divided by the sum of wins and losses, read from the
    "W" and "L" entries. ``frame`` may be a single row (Series) or a whole
    DataFrame; it is modified in place and the same object is returned.
    """
    wins = frame["W"]
    decided_games = wins + frame["L"]
    frame["W/L Ratio"] = wins / decided_games
    return frame
    
    ## applying
# w_l_ratio only does column arithmetic on "W" and "L", so it can be called once
# on the whole frame instead of once per row through apply(axis=1). The values are
# the same, but no row is routed through a Python-level call. Working on a copy
# leaves the previously built frame object untouched.
nhl_df2 = w_l_ratio(nhl_df2.copy())
nhl_df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,team,GP,W,L,OL,PTS,PTS%,GF,GA,SRS,SOS,RPt%,ROW,year,League,W/L Ratio
1,Tampa Bay Lightning*,82,54,23,5,113,0.689,296,236,0.66,-0.07,0.634,48,2018,NHL,0.701299
2,Boston Bruins*,82,50,20,12,112,0.683,270,214,0.62,-0.07,0.61,47,2018,NHL,0.714286
3,Toronto Maple Leafs*,82,49,26,7,105,0.64,277,232,0.49,-0.06,0.567,42,2018,NHL,0.653333
4,Florida Panthers,82,44,30,8,96,0.585,248,246,-0.01,-0.04,0.537,41,2018,NHL,0.594595
5,Detroit Red Wings,82,30,39,13,73,0.445,217,255,-0.48,-0.01,0.341,25,2018,NHL,0.434783
6,Montreal Canadiens,82,29,40,13,71,0.433,209,264,-0.68,0.0,0.378,27,2018,NHL,0.42029
7,Ottawa Senators,82,28,43,11,67,0.409,221,291,-0.85,0.0,0.372,26,2018,NHL,0.394366
8,Buffalo Sabres,82,25,45,12,62,0.378,199,280,-0.98,0.01,0.311,24,2018,NHL,0.357143
10,Washington Capitals*,82,49,26,7,105,0.64,259,239,0.21,-0.04,0.585,46,2018,NHL,0.653333
11,Pittsburgh Penguins*,82,47,29,6,100,0.61,272,250,0.23,-0.04,0.573,45,2018,NHL,0.618421


In [3]:
# Mapping from each team name, spelled exactly as it appears in the "team" column
# (including the trailing "*" that some names carry), to its metropolitan area.
# Written as a single literal: the earlier version first seeded every team with
# the placeholder "0", which meant an unmapped team would silently end up as "0"
# and the NaN fallback below could never trigger.
team_to_area = {
    'Tampa Bay Lightning*': 'Tampa Bay Area',
    'Boston Bruins*': 'Boston',
    'Toronto Maple Leafs*': 'Toronto',
    'Florida Panthers': 'Miami–Fort Lauderdale',
    'Detroit Red Wings': 'Detroit',
    'Montreal Canadiens': 'Montreal',
    'Ottawa Senators': 'Ottawa',
    'Buffalo Sabres': 'Buffalo',
    'Washington Capitals*': 'Washington, D.C.',
    'Pittsburgh Penguins*': 'Pittsburgh',
    'Philadelphia Flyers*': 'Philadelphia',
    'Columbus Blue Jackets*': 'Columbus',
    'New Jersey Devils*': 'New York City',
    'Carolina Hurricanes': 'Raleigh',
    'New York Islanders': 'New York City',
    'Nashville Predators*': 'Nashville',
    'New York Rangers': 'New York City',
    'Winnipeg Jets*': 'Winnipeg',
    'Minnesota Wild*': 'Minneapolis–Saint Paul',
    'Colorado Avalanche*': 'Denver',
    'St. Louis Blues': 'St. Louis',
    'Dallas Stars': 'Dallas–Fort Worth',
    'Chicago Blackhawks': 'Chicago',
    'Vegas Golden Knights*': 'Las Vegas',
    'Anaheim Ducks*': 'Los Angeles',
    'San Jose Sharks*': 'San Francisco Bay Area',
    'Los Angeles Kings*': 'Los Angeles',
    'Calgary Flames': 'Calgary',
    'Edmonton Oilers': 'Edmonton',
    'Vancouver Canucks': 'Vancouver',
    'Arizona Coyotes': 'Phoenix',
}

# Add the metropolitan area for every team. Series.map looks each team up in the
# dict and yields NaN for any team that has no entry.
nhl_df2["Metropolitan area"] = nhl_df2["team"].map(team_to_area)
nhl_df2

Unnamed: 0,team,GP,W,L,OL,PTS,PTS%,GF,GA,SRS,SOS,RPt%,ROW,year,League,W/L Ratio,Metropolitan area
1,Tampa Bay Lightning*,82,54,23,5,113,0.689,296,236,0.66,-0.07,0.634,48,2018,NHL,0.701299,Tampa Bay Area
2,Boston Bruins*,82,50,20,12,112,0.683,270,214,0.62,-0.07,0.61,47,2018,NHL,0.714286,Boston
3,Toronto Maple Leafs*,82,49,26,7,105,0.64,277,232,0.49,-0.06,0.567,42,2018,NHL,0.653333,Toronto
4,Florida Panthers,82,44,30,8,96,0.585,248,246,-0.01,-0.04,0.537,41,2018,NHL,0.594595,Miami–Fort Lauderdale
5,Detroit Red Wings,82,30,39,13,73,0.445,217,255,-0.48,-0.01,0.341,25,2018,NHL,0.434783,Detroit
6,Montreal Canadiens,82,29,40,13,71,0.433,209,264,-0.68,0.0,0.378,27,2018,NHL,0.42029,Montreal
7,Ottawa Senators,82,28,43,11,67,0.409,221,291,-0.85,0.0,0.372,26,2018,NHL,0.394366,Ottawa
8,Buffalo Sabres,82,25,45,12,62,0.378,199,280,-0.98,0.01,0.311,24,2018,NHL,0.357143,Buffalo
10,Washington Capitals*,82,49,26,7,105,0.64,259,239,0.21,-0.04,0.585,46,2018,NHL,0.653333,"Washington, D.C."
11,Pittsburgh Penguins*,82,47,29,6,100,0.61,272,250,0.23,-0.04,0.573,45,2018,NHL,0.618421,Pittsburgh


In [10]:
# Build the combined table: inner-join the city table with the NHL frame on the
# shared "Metropolitan area" column, then use that column as the index.
t1 = cities.merge(nhl_df2, how="inner", on="Metropolitan area")
t1 = t1.set_index("Metropolitan area")

# defining dummy function
def keep(val):
    """Return the first element of ``val``, for use as an aggregation function.

    ``val`` is expected to support ``len()`` and ``.iloc`` (e.g. a pandas
    Series holding the values of one group).

    Returns:
        The first element for any non-empty input. Previously a single-element
        input was returned as the Series itself, so the result type depended on
        the group size; now it is always the element. Empty input is returned
        unchanged, as before.
    """
    if len(val) == 0:
        # Nothing to pick from; hand the empty container back as-is.
        return val
    return val.iloc[0]


In [11]:
from scipy import stats

# Cast the population column to int64, then collapse to one row per metropolitan
# area: W/L Ratio is averaged across the area's teams and the population is
# reduced with keep().
t1 = (
    t1.astype({"Population (2016 est.)[8]": "int64"})
    .pivot_table(
        index="Metropolitan area",
        aggfunc={
            "W/L Ratio": np.average,
            "Population (2016 est.)[8]": keep,
        },
    )
)

# Pearson correlation between area population and win/loss ratio.
results = stats.pearsonr(
    t1["Population (2016 est.)[8]"],
    t1["W/L Ratio"],
)
t1

Unnamed: 0_level_0,Population (2016 est.)[8],W/L Ratio
Metropolitan area,Unnamed: 1_level_1,Unnamed: 2_level_1
Boston,4794447,0.714286
Buffalo,1132804,0.357143
Calgary,1392609,0.513889
Chicago,9512999,0.458333
Columbus,2041520,0.6
Dallas–Fort Worth,7233323,0.567568
Denver,2853077,0.589041
Detroit,4297617,0.434783
Edmonton,1321426,0.473684
Las Vegas,2155664,0.68


# Question 2

# Question 3

# Question 4