In this notebook, we manipulate and study several sports datasets: `mlb.csv` (Major League Baseball teams), `nba.csv` (National Basketball Association teams), `nhl.csv` (National Hockey League teams), and `nfl.csv` (National Football League teams).

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

To start, we first load the metropolitan-area table from `wikipedia_data.html`.

In [4]:
# Load the second table found in the saved Wikipedia page.
cities_df = pd.read_html('datasets/wikipedia_data.html')[1]
# Drop the last row and keep only the six columns that are renamed below.
cities_df = cities_df.iloc[:-1, [0, 3, 5, 6, 7, 8]]
# Strip bracketed annotations (anything between "[" and "]") and turn the
# em-dash placeholder into a missing value. The pattern is a raw string so
# that "\[" is not parsed as an (invalid) string escape sequence.
cities_df = (cities_df.replace(r'\[.*\]', '', regex=True)
                      .replace('—', np.nan, regex=True))
cities_df.columns = ['Metropolitan area', 'Population',
                     'NFL', 'MLB', 'NBA', 'NHL']

<br> </br>

In [33]:
# Show the dtype of every column of nhl_df.
nhl_df.dtypes

team      object
GP        object
W         object
L         object
OL        object
PTS       object
PTS%      object
GF        object
GA        object
SRS       object
SOS       object
RPt%      object
ROW       object
year       int64
League    object
dtype: object

As you can see, every column except `year` has the `object` dtype. We therefore have to convert the values with `pd.to_numeric()` before applying any mathematical operation to them.

In [8]:
def win_loss_ratio(record):
    """Store the share of wins in ``record['Win/Loss']`` and return the record.

    ``record['W']`` and ``record['L']`` are converted with ``pd.to_numeric``.
    Despite the key name, the stored value is W / (W + L), i.e. the fraction
    of wins among wins plus losses. ``record`` is modified in place.
    """
    wins = pd.to_numeric(record['W'])
    losses = pd.to_numeric(record['L'])
    record['Win/Loss'] = wins / (wins + losses)
    return record

In [9]:
def team_cities(record):
    """Attach the metropolitan area of a team to its record.

    The team name in ``record['team']`` is looked up in the module-level
    ``cityDict`` mapping (team name -> metropolitan area). When the team is
    known, ``record['Metropolitan area']`` is set; otherwise the record is
    returned untouched.
    """
    team_name = record['team']
    if team_name not in cityDict:
        # Unknown team: leave the record without a metropolitan area.
        return record

    record['Metropolitan area'] = cityDict[team_name]
    return record

In [10]:
def calc_correlation(v1: list, v2: list):
    """Return the Pearson correlation coefficient between ``v1`` and ``v2``.

    Only the first element of the ``stats.pearsonr`` result (the coefficient)
    is returned; the second element (the p-value) is discarded.
    """
    pearson_result = stats.pearsonr(v1, v2)
    return pearson_result[0]

<br> </br>
In the following cell, we calculate the correlation between the win/loss ratio and the population of the metropolitan areas that have an **NHL** team, using **2018** data.

In [34]:
# Keep only the metropolitan areas that list an NHL team (neither missing nor
# an empty string), with the area name, population and NHL team columns.
has_nhl_team = cities_df['NHL'].notna() & (cities_df['NHL'] != '')
cities_nhl = (cities_df[has_nhl_team]
              .reset_index(drop=True)
              [['Metropolitan area', 'Population', 'NHL']])
cities_nhl.set_index('Metropolitan area', inplace=True)

# Load the NHL standings and keep the 2018 season only.
nhl_df = pd.read_csv('datasets/nhl.csv')
# Rows labelled 0, 9, 18 and 26 are dropped by index label.
# NOTE(review): presumably these are the division header rows of the 2018
# season - confirm against nhl.csv before reusing with another year.
nhl_df = (nhl_df[nhl_df['year'] == 2018]
          .drop([0, 9, 18, 26])
          .reset_index(drop=True))

# Remove the "*" marker from team names. Raw string: "\*" is a regex escape,
# not a valid Python string escape.
nhl_df['team'] = nhl_df['team'].replace(r'\*', '', regex=True)

# Team name -> metropolitan area (as spelled in cities_df). team_cities()
# reads this module-level mapping.
cityDict = {"Tampa Bay Lightning" : "Tampa Bay Area",
            "Boston Bruins" : 'Boston',
            "Toronto Maple Leafs" : 'Toronto',
            "Florida Panthers" : 'Miami–Fort Lauderdale',
            "Detroit Red Wings" : 'Detroit',
            "Montreal Canadiens" : 'Montreal',
            "Ottawa Senators" : 'Ottawa',
            "Buffalo Sabres" : 'Buffalo',
            "Washington Capitals" : 'Washington, D.C.',
            "Pittsburgh Penguins" : 'Pittsburgh',
            "Philadelphia Flyers" : 'Philadelphia',
            "Columbus Blue Jackets" : 'Columbus',
            "New Jersey Devils" : 'New York City',
            "Carolina Hurricanes" : 'Raleigh',
            "New York Islanders" : 'New York City',
            "New York Rangers" : 'New York City',
            "Nashville Predators" : 'Nashville',
            "Winnipeg Jets" : 'Winnipeg',
            "Minnesota Wild" : 'Minneapolis–Saint Paul',
            "Colorado Avalanche" : 'Denver',
            "St. Louis Blues" : 'St. Louis',
            "Dallas Stars" : 'Dallas–Fort Worth',
            "Chicago Blackhawks" : 'Chicago',
            "Vegas Golden Knights" : 'Las Vegas',
            "Anaheim Ducks" : 'Los Angeles',
            "San Jose Sharks" : 'San Francisco Bay Area',
            "Los Angeles Kings" : 'Los Angeles',
            "Calgary Flames" : 'Calgary',
            "Edmonton Oilers" : 'Edmonton',
            'Vancouver Canucks' : 'Vancouver',
            'Arizona Coyotes' : 'Phoenix',
            }

# Per team: compute the win ratio and attach the metropolitan area; then
# average the ratios of teams sharing the same area.
nhl_df = nhl_df.apply(win_loss_ratio, axis=1)
nhl_df = nhl_df.apply(team_cities, axis=1)
nhl_df = nhl_df.groupby('Metropolitan area').agg({'Win/Loss': np.average})

# Outer join on the area name: an area present on only one side is kept and
# gets missing values in the other side's columns.
merged_nhl = cities_nhl.merge(right=nhl_df, how='outer', on='Metropolitan area')
merged_nhl.sort_values('Win/Loss', ascending=False, inplace=True)

# Metropolitan area population (from cities_nhl) for calc_correlation().
population_by_region = list(merged_nhl['Population'].astype(np.float64))
# Win/loss ratio per area (from nhl_df) for calc_correlation().
win_loss_by_region = list(merged_nhl['Win/Loss'])

calc_correlation(population_by_region, win_loss_by_region)

0.012486162921209902

In [7]:
# Inspect the type of the value returned by calc_correlation().
type(calc_correlation(population_by_region, win_loss_by_region))

numpy.float64

In [8]:
# Number of rows and columns of the merged NHL table.
merged_nhl.shape

(28, 3)

In [9]:
# Display the merged NHL table (sorted by 'Win/Loss', descending).
merged_nhl

Unnamed: 0_level_0,Population,NHL,Win/Loss
Metropolitan area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nashville,1865298,Predators,0.746479
Winnipeg,778489,Jets,0.722222
Boston,4794447,Bruins,0.714286
Tampa Bay Area,3032171,Lightning,0.701299
Las Vegas,2155664,Golden Knights,0.68
Toronto,5928040,Maple Leafs,0.653333
"Washington, D.C.",6131977,Capitals,0.653333
Minneapolis–Saint Paul,3551036,Wild,0.633803
San Francisco Bay Area,6657982,Sharks,0.625
Los Angeles,13310447,KingsDucks,0.622895


<br> </br>
In the following cell, we calculate the correlation between the win/loss ratio and the population of the metropolitan areas that have an **NBA** team, using **2018** data.

In [35]:
# Keep only the metropolitan areas that list an NBA team (neither missing nor
# an empty string), with the area name, population and NBA team columns.
has_nba_team = cities_df['NBA'].notna() & (cities_df['NBA'] != '')
cities_nba = (cities_df[has_nba_team]
              .reset_index(drop=True)
              [['Metropolitan area', 'Population', 'NBA']])
cities_nba.set_index('Metropolitan area', inplace=True)

nba_df = pd.read_csv('datasets/nba.csv')

# Clean team names: remove a trailing " (N)" suffix (one or two digits in
# parentheses) and the "*" marker. Raw strings keep the regex escapes from
# being parsed as (invalid) Python string escapes.
nba_df['team'] = (nba_df['team'].replace(r'\s\(\d{1,2}\)', '', regex=True)
                                .replace(r'\*', '', regex=True))

# 2018 season only; just the columns needed for the win ratio.
nba_df = nba_df[nba_df['year'] == 2018][['team', 'W', 'L']]

# Team name -> metropolitan area (as spelled in cities_df). team_cities()
# reads this module-level mapping.
cityDict = {'Toronto Raptors' : 'Toronto',
            'Boston Celtics' : 'Boston',
            'Philadelphia 76ers' : 'Philadelphia',
            'Cleveland Cavaliers' : 'Cleveland',
            'Indiana Pacers' : 'Indianapolis',
            'Miami Heat' : 'Miami–Fort Lauderdale',
            'Milwaukee Bucks' : 'Milwaukee',
            'Washington Wizards' : 'Washington, D.C.',
            'Detroit Pistons' : 'Detroit',
            'Charlotte Hornets' : 'Charlotte',
            'New York Knicks' : 'New York City',
            'Brooklyn Nets' : 'New York City',
            'Chicago Bulls' : 'Chicago',
            'Orlando Magic' : 'Orlando',
            'Atlanta Hawks' : 'Atlanta',
            'Houston Rockets' : 'Houston',
            'Golden State Warriors' : 'San Francisco Bay Area',
            'Portland Trail Blazers' : 'Portland',
            'Oklahoma City Thunder' : 'Oklahoma City',
            'Utah Jazz' : 'Salt Lake City',
            'New Orleans Pelicans' : 'New Orleans',
            'San Antonio Spurs' : 'San Antonio',
            'Minnesota Timberwolves' : 'Minneapolis–Saint Paul',
            'Denver Nuggets' : 'Denver',
            'Los Angeles Clippers' : 'Los Angeles',
            'Los Angeles Lakers' : 'Los Angeles',
            'Sacramento Kings' : 'Sacramento',
            'Dallas Mavericks' : 'Dallas–Fort Worth',
            'Memphis Grizzlies' : 'Memphis',
            'Phoenix Suns' : 'Phoenix'}

# Per team: compute the win ratio and attach the metropolitan area; then
# average the ratios of teams sharing the same area.
nba_df = nba_df.apply(win_loss_ratio, axis=1)
nba_df = nba_df.apply(team_cities, axis=1)
nba_df = nba_df.groupby('Metropolitan area').agg({'Win/Loss': np.average})

# Outer join on the area name: an area present on only one side is kept and
# gets missing values in the other side's columns.
merged_nba = cities_nba.merge(right=nba_df, how='outer', on='Metropolitan area')
merged_nba.sort_values('Win/Loss', ascending=False, inplace=True)

# Metropolitan area population (from cities_nba) for calc_correlation().
population_by_region = list(merged_nba['Population'].astype(np.float64))
# Win/loss ratio per area (from nba_df) for calc_correlation().
win_loss_by_region = list(merged_nba['Win/Loss'])

calc_correlation(population_by_region, win_loss_by_region)

-0.1765716025284462

In [11]:
# Number of rows and columns of the merged NBA table.
merged_nba.shape

(28, 3)

In [12]:
# Display the merged NBA table (sorted by 'Win/Loss', descending).
merged_nba

Unnamed: 0_level_0,Population,NBA,Win/Loss
Metropolitan area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Houston,6772470,Rockets,0.792683
Toronto,5928040,Raptors,0.719512
San Francisco Bay Area,6657982,Warriors,0.707317
Boston,4794447,Celtics,0.670732
Philadelphia,6070500,76ers,0.634146
Cleveland,2055612,Cavaliers,0.609756
Portland,2424955,Trail Blazers,0.597561
Oklahoma City,1373211,Thunder,0.585366
Indianapolis,2004230,Pacers,0.585366
New Orleans,1268883,Pelicans,0.585366


<br> </br>
In the following cell, we calculate the correlation between the win/loss ratio and the population of the metropolitan areas that have an **MLB** team, using **2018** data.


In [36]:
# Keep only the metropolitan areas that list an MLB team (neither missing nor
# an empty string), with the area name, population and MLB team columns.
has_mlb_team = cities_df['MLB'].notna() & (cities_df['MLB'] != '')
cities_mlb = (cities_df[has_mlb_team]
              .reset_index(drop=True)
              [['Metropolitan area', 'Population', 'MLB']])
cities_mlb.set_index('Metropolitan area', inplace=True)

# 2018 season only; just the columns needed for the win ratio.
mlb_df = pd.read_csv('datasets/mlb.csv')
is_2018_season = mlb_df['year'] == 2018
mlb_df = mlb_df[is_2018_season][['team', 'W', 'L']]

# Team name -> metropolitan area (as spelled in cities_df). team_cities()
# reads this module-level mapping.
cityDict = {'Boston Red Sox' : 'Boston',
            'New York Yankees' : 'New York City',
            'Tampa Bay Rays' : 'Tampa Bay Area',
            'Toronto Blue Jays' : 'Toronto',
            'Baltimore Orioles' : 'Baltimore',
            'Cleveland Indians' : 'Cleveland',
            'Minnesota Twins' : 'Minneapolis–Saint Paul',
            'Detroit Tigers' : 'Detroit',
            'Chicago White Sox' : 'Chicago',
            'Kansas City Royals' : 'Kansas City',
            'Houston Astros' : 'Houston',
            'Oakland Athletics' : 'San Francisco Bay Area',
            'Seattle Mariners' : 'Seattle',
            'Los Angeles Angels' : 'Los Angeles',
            'Texas Rangers' : 'Dallas–Fort Worth',
            'Atlanta Braves' : 'Atlanta',
            'Washington Nationals' : 'Washington, D.C.',
            'Philadelphia Phillies' : 'Philadelphia',
            'New York Mets' : 'New York City',
            'Miami Marlins' : 'Miami–Fort Lauderdale',
            'Milwaukee Brewers' : 'Milwaukee',
            'Chicago Cubs' : 'Chicago',
            'St. Louis Cardinals' : 'St. Louis',
            'Pittsburgh Pirates' : 'Pittsburgh',
            'Cincinnati Reds' : 'Cincinnati',
            'Los Angeles Dodgers' : 'Los Angeles',
            'Colorado Rockies' : 'Denver',
            'Arizona Diamondbacks' : 'Phoenix',
            'San Francisco Giants' : 'San Francisco Bay Area',
            'San Diego Padres' : 'San Diego'}

# Per team: compute the win ratio and attach the metropolitan area; then
# average the ratios of teams sharing the same area.
mlb_df = mlb_df.apply(win_loss_ratio, axis=1)
mlb_df = mlb_df.apply(team_cities, axis=1)
mlb_df = mlb_df.groupby('Metropolitan area').agg({'Win/Loss': np.average})

# Outer join on the area name: an area present on only one side is kept and
# gets missing values in the other side's columns.
merged_mlb = cities_mlb.merge(right=mlb_df, how='outer', on='Metropolitan area')
merged_mlb.sort_values('Win/Loss', ascending=False, inplace=True)

# Metropolitan area population (from cities_mlb) for calc_correlation().
population_by_region = list(merged_mlb['Population'].astype(np.float64))
# Win/loss ratio per area (from mlb_df) for calc_correlation().
win_loss_by_region = list(merged_mlb['Win/Loss'])

calc_correlation(population_by_region, win_loss_by_region)

0.15027698302669307

In [14]:
# Number of rows and columns of the merged MLB table.
merged_mlb.shape

(26, 3)

In [37]:
# Display the merged MLB table (sorted by 'Win/Loss', descending).
merged_mlb

Unnamed: 0_level_0,Population,MLB,Win/Loss
Metropolitan area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Boston,4794447,Red Sox,0.666667
Houston,6772470,Astros,0.635802
Milwaukee,1572482,Brewers,0.588957
Cleveland,2055612,Indians,0.561728
Denver,2853077,Rockies,0.558282
Tampa Bay Area,3032171,Rays,0.555556
Atlanta,5789700,Braves,0.555556
Seattle,3798902,Mariners,0.549383
New York City,20153634,Yankees Mets,0.546296
St. Louis,2807002,Cardinals,0.54321


<br> </br>
In the following cell, we calculate the correlation between the win/loss ratio and the population of the metropolitan areas that have an **NFL** team, using **2018** data.

In [38]:
# Keep only the metropolitan areas that list an NFL team (neither missing nor
# an empty string), indexed by area name.
has_nfl_team = cities_df['NFL'].notna() & (cities_df['NFL'] != '')
cities_nfl = cities_df[has_nfl_team]

cities_nfl = cities_nfl.set_index('Metropolitan area')[['Population', 'NFL']]

nfl_df = pd.read_csv('datasets/nfl.csv')

# 2018 season only; just the columns needed for the win ratio.
nfl_df = nfl_df[nfl_df['year'] == 2018][['team', 'W', 'L']]
# Remove the "*" and "+" markers from team names. Raw string: "\*" and "\+"
# are regex escapes, not valid Python string escapes.
nfl_df['team'] = nfl_df['team'].replace(r'\*|\+', '', regex=True)
# Rows labelled 0, 5, ..., 35 are dropped by index label.
# NOTE(review): presumably these are the division header rows of the 2018
# season - confirm against nfl.csv before reusing with another year.
nfl_df = (nfl_df.drop([0, 5, 10, 15, 20, 25, 30, 35])
                .reset_index(drop=True))

# Team name -> metropolitan area (as spelled in cities_df). team_cities()
# reads this module-level mapping.
cityDict= {'New England Patriots' : 'Boston',
        'Miami Dolphins' : 'Miami–Fort Lauderdale',
        'Buffalo Bills' : 'Buffalo',
        'New York Jets' : 'New York City',
        'Baltimore Ravens' : 'Baltimore',
        'Pittsburgh Steelers' : 'Pittsburgh',
        'Cleveland Browns' : 'Cleveland',
        'Cincinnati Bengals' : 'Cincinnati',
        'Houston Texans' : 'Houston',
        'Indianapolis Colts' : 'Indianapolis',
        'Tennessee Titans' : 'Nashville',
        'Jacksonville Jaguars' : 'Jacksonville',
        'Kansas City Chiefs' : 'Kansas City',
        'Los Angeles Chargers' : 'Los Angeles',
        'Denver Broncos' : 'Denver',
        'Oakland Raiders' : 'San Francisco Bay Area',
        'Dallas Cowboys' : 'Dallas–Fort Worth',
        'Philadelphia Eagles' : 'Philadelphia',
        'Washington Redskins' : 'Washington, D.C.',
        'New York Giants' : 'New York City',
        'Chicago Bears' : 'Chicago',
        'Minnesota Vikings' : 'Minneapolis–Saint Paul',
        'Green Bay Packers' : 'Green Bay',
        'Detroit Lions' : 'Detroit',
        'New Orleans Saints' : 'New Orleans',
        'Carolina Panthers' : 'Charlotte',
        'Atlanta Falcons' : 'Atlanta',
        'Tampa Bay Buccaneers' : 'Tampa Bay Area',
        'Los Angeles Rams' : 'Los Angeles',
        'Seattle Seahawks' : 'Seattle',
        'San Francisco 49ers' : 'San Francisco Bay Area',
        'Arizona Cardinals' : 'Phoenix'}

# Per team: compute the win ratio and attach the metropolitan area; then
# average the ratios of teams sharing the same area.
nfl_df = nfl_df.apply(win_loss_ratio, axis=1)
nfl_df = nfl_df.apply(team_cities, axis=1)
nfl_df = nfl_df.groupby('Metropolitan area').agg({'Win/Loss': np.average})

# Outer join on the area name: an area present on only one side is kept and
# gets missing values in the other side's columns.
merged_nfl = pd.merge(cities_nfl, nfl_df, on="Metropolitan area", how='outer')
merged_nfl.sort_values('Win/Loss', ascending=False, inplace=True)

# Metropolitan area population (from cities_nfl) for calc_correlation().
population_by_region = list(merged_nfl['Population'].astype(np.float64))
# Win/loss ratio per area (from nfl_df) for calc_correlation().
win_loss_by_region = list(merged_nfl['Win/Loss'])

calc_correlation(population_by_region, win_loss_by_region)

0.0049221121493494314

In [17]:
# Number of rows and columns of the merged NFL table.
merged_nfl.shape

(29, 3)

In [18]:
# Display the merged NFL table (sorted by 'Win/Loss', descending).
merged_nfl

Unnamed: 0_level_0,Population,NFL,Win/Loss
Metropolitan area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
New Orleans,1268883,Saints,0.8125
Los Angeles,13310447,RamsChargers,0.78125
Chicago,9512999,Bears,0.75
Kansas City,2104509,Chiefs,0.75
Boston,4794447,Patriots,0.6875
Houston,6772470,Texans,0.6875
Seattle,3798902,Seahawks,0.625
Indianapolis,2004230,Colts,0.625
Dallas–Fort Worth,7233323,Cowboys,0.625
Baltimore,2798886,Ravens,0.625


<br> </br>

In the following cell, we use a paired t-test ([`ttest_rel`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html)) to check whether there is a statistically significant difference between the win/loss ratios of two sports, restricted to the metropolitan areas that have teams in both sports. If a sport (the NBA, for example) has several teams in one area, the win/loss ratios of those teams are averaged first.

In [41]:
from scipy.stats import ttest_rel

In [42]:
def test_columns():
    """Return the p-value of a paired t-test on the 'Win/Loss' columns.

    Reads the module-level data frames ``df1`` and ``df2`` (they must be
    assigned before each call), casts their 'Win/Loss' columns to float64 and
    returns the second element of the ``ttest_rel`` result, i.e. the p-value.
    """
    first_ratios = df1['Win/Loss'].astype(np.float64)
    second_ratios = df2['Win/Loss'].astype(np.float64)
    paired_result = ttest_rel(a=first_ratios, b=second_ratios)
    return paired_result[1]

# For every pair of leagues: collect the metropolitan areas present in both
# merged tables, select those areas from each table in the same order, and
# run the paired t-test through test_columns(), which reads the module-level
# df1 and df2.
#
# The shared areas are found with a single pass and an index membership test
# instead of a nested loop over both indexes; the resulting order (that of the
# first index iterated) is the same as long as the index labels are unique.

nhl_nba_li = [area for area in merged_nba.index if area in merged_nhl.index]
df1 = merged_nhl.loc[nhl_nba_li].drop(columns='NHL')
df2 = merged_nba.loc[nhl_nba_li].drop(columns='NBA')
pval_nhl_nba = test_columns()

nhl_mlb_li = [area for area in merged_mlb.index if area in merged_nhl.index]
df1 = merged_nhl.loc[nhl_mlb_li].drop(columns='NHL')
df2 = merged_mlb.loc[nhl_mlb_li].drop(columns='MLB')
pval_nhl_mlb = test_columns()

nhl_nfl_li = [area for area in merged_nfl.index if area in merged_nhl.index]
df1 = merged_nhl.loc[nhl_nfl_li].drop(columns='NHL')
df2 = merged_nfl.loc[nhl_nfl_li].drop(columns='NFL')
pval_nhl_nfl = test_columns()

nba_mlb_li = [area for area in merged_mlb.index if area in merged_nba.index]
df1 = merged_nba.loc[nba_mlb_li].drop(columns='NBA')
df2 = merged_mlb.loc[nba_mlb_li].drop(columns='MLB')
pval_nba_mlb = test_columns()

nba_nfl_li = [area for area in merged_nfl.index if area in merged_nba.index]
df1 = merged_nba.loc[nba_nfl_li].drop(columns='NBA')
df2 = merged_nfl.loc[nba_nfl_li].drop(columns='NFL')
pval_nba_nfl = test_columns()

mlb_nfl_li = [area for area in merged_nfl.index if area in merged_mlb.index]
df1 = merged_mlb.loc[mlb_nfl_li].drop(columns='MLB')
df2 = merged_nfl.loc[mlb_nfl_li].drop(columns='NFL')
pval_mlb_nfl = test_columns()


# Note: p_values is a full (symmetric) dataframe, so p_values.loc["NFL", "NBA"]
# equals p_values.loc["NBA", "NFL"], and the diagonal (e.g.
# p_values.loc["NFL", "NFL"]) is np.nan.
pvalDict = {'NFL' : {'NFL' : np.nan, 'NBA' : pval_nba_nfl, 'NHL' : pval_nhl_nfl, 'MLB' : pval_mlb_nfl},
           'NBA' : {'NFL' : pval_nba_nfl, 'NBA' : np.nan, 'NHL' : pval_nhl_nba, 'MLB' : pval_nba_mlb},
           'NHL' : {'NFL' : pval_nhl_nfl, 'NBA' : pval_nhl_nba, 'NHL' : np.nan, 'MLB' : pval_nhl_mlb},
           'MLB' : {'NFL' : pval_mlb_nfl, 'NBA' : pval_nba_mlb, 'NHL' : pval_nhl_mlb, 'MLB' : np.nan}
           }

p_values = pd.DataFrame(pvalDict)

In [21]:
# Display the table of pairwise p-values between the four leagues.
p_values

Unnamed: 0,NFL,NBA,NHL,MLB
NFL,,0.941792,0.030883,0.802069
NBA,0.941792,,0.022297,0.95054
NHL,0.030883,0.022297,,0.000708
MLB,0.802069,0.95054,0.000708,
