In [None]:
import pandas as pd
import numpy as np
# import scipy.stats as stats
import re

In [None]:
# season of interest: used further down to keep only the nhl.csv rows whose 'year' column equals it
year = 2018

In [None]:
# untouched copy of the NHL standings file
nhl_df_org = pd.read_csv("utility_files/nhl.csv")

# the saved wikipedia page is parsed into a list of tables; the cities table sits at position 1
wiki_tables = pd.read_html("utility_files/wiki_data.html")
cities = wiki_tables[1]

# data of interest: every row but the last one, restricted to six columns picked by position
wanted_positions = [0, 3, 5, 6, 7, 8]
cities_org = cities.iloc[:-1, wanted_positions]

print(cities.columns)

In [None]:
# The raw headers printed by the previous cell are awkward to type, so the two
# columns used most often get short names first.
short_names = {'Metropolitan area': "area", "Population (2016 est.)[8]": "pop"}
# rename() hands back a new frame: `cities` becomes the working copy, `cities_org` stays as loaded
cities = cities_org.rename(columns=short_names)
# normalise every header: lower case, no surrounding spaces
cities.columns = [str(label).lower().strip() for label in cities.columns]
print(cities.columns)

In [None]:
print(cities['nhl'])

# The printed team names still carry characters that must be removed before
# they can be matched against the standings.

# cities_nhl: an independent copy limited to the NHL-related columns
# (area name, population and NHL team names)
nhl_cols = ['area', 'pop', 'nhl']
cities_nhl = cities.loc[:, nhl_cols].copy()

In [None]:
def clean_team_name(name):
    """Normalise a raw team cell taken from the wikipedia cities table.

    Every bracketed annotation (e.g. ``[note 1]``) is removed, then the text is
    lower-cased and stripped of surrounding whitespace.

    Args:
        name: raw string value of a team cell.

    Returns:
        The cleaned, lower-case string (empty when the cell held only notes).
    """
    # Match each [...] group on its own: a greedy '.*' would run from the first
    # '[' to the last ']' and swallow any team name sitting between two notes.
    without_notes = re.sub(r'\[[^\]]*\]', "", name)
    # convert to lower case and remove surrounding spaces
    return without_notes.lower().strip()

# strip the bracketed notes and normalise the case of every team cell
cities_nhl['nhl'] = cities_nhl['nhl'].apply(clean_team_name)
# remove every character outside the Latin-1 range (code points above 0xFF)
cities_nhl['nhl'] = cities_nhl['nhl'].apply(lambda x: re.sub("[^\x00-\xFF]", "", x)) 
# final cleaning step: keep only the letters A-Z / a-z, digits and whitespace.
# Inside a character class parentheses are literal characters, so they must not
# be listed; the raw string keeps \d and \s from being read as string escapes.
cities_nhl['nhl'] = cities_nhl['nhl'].apply(lambda x: re.sub(r"[^A-Za-z\d\s]", "", x))

# at this point cities with no nhl team are assigned the empty string in the "nhl" column
# keep the cities with nhl teams
cities_nhl = cities_nhl[cities_nhl['nhl'] != ''] 
print(cities_nhl)
# set the index to a numerical series from 0 to the size of the dataframe
custom_index = pd.Index(range(len(cities_nhl)))
cities_nhl = cities_nhl.set_index(custom_index)

## after indexing
print()
print("after indexing")
print("##" * 40)

print(cities_nhl)

In [None]:
# in order to map each team with its area, a new column should be added 
# that groups both the area/city name as well as the team's name

def area_team(row):
    """Build the merge key of a row: the area name followed by the team name(s).

    Args:
        row: mapping-like row exposing string values under 'area' and 'nhl'.

    Returns:
        Both values concatenated, lower-cased, with all whitespace removed.
    """
    # raw string: "\s" inside a plain literal is an invalid escape sequence
    # that recent Python versions warn about
    whitespace = r"\s"
    # no strip() needed afterwards: nothing is left to strip once every
    # whitespace character has been removed
    area_no_space = re.sub(whitespace, "", row['area']).lower()
    team_no_space = re.sub(whitespace, "", row['nhl']).lower()
    return area_no_space + team_no_space

# attach the key to every city row, then show it next to the two columns it is built from
cities_nhl['area_team'] = cities_nhl.apply(area_team, axis=1)
print(cities_nhl[["area", "nhl", "area_team"]])

In [None]:
# switch to the standings: load the NHL file and keep only the rows of the chosen season
nhl_org = pd.read_csv("utility_files/nhl.csv")
in_season = nhl_org['year'] == year
nhl = nhl_org[in_season]
print(nhl.columns)

In [None]:
# only three columns matter from here on: the team name, the wins (W) and the losses (L)
cols = ["team", "W", "L"]
nhl = nhl.loc[:, cols]
print(nhl.loc[:, 'team'])


In [None]:
# at first glance we can detect at least 2 main issues with the team column:
# 1. the need for reformatting the names
# 2. removing the rows declaring the teams' divisions

def clean_team_name_nhl(name):
    """Normalise a team name coming from the NHL standings file.

    Every character that is not a letter A-Z / a-z, a digit or whitespace is
    removed; the result is stripped of surrounding whitespace and lower-cased.

    Args:
        name: raw string value of the 'team' column.

    Returns:
        The cleaned, lower-case team name.
    """
    # 'A-Za-z' rather than 'A-z': the latter range also covers [ \ ] ^ _ and `,
    # and parentheses inside a character class are literal characters.
    return re.sub(r"[^A-Za-z\d\s]", "", name).strip().lower()

# problem 1: bring every team name to its cleaned form
nhl['team'] = nhl['team'].map(clean_team_name_nhl)

# problem 2: rows whose cleaned name contains "division" are section headers, not teams
is_division_row = nhl['team'].str.contains("division")
nhl = nhl[~is_division_row]

# renumber the remaining rows from 0 upwards
nhl_custom_index = pd.Index(range(len(nhl)))
nhl = nhl.set_index(nhl_custom_index)

print(nhl)

In [None]:
# time to add the area_team name column to the nhl DataFrame:
# the cleaned team name with every whitespace character removed.
# The pattern is a raw string because "\s" in a plain literal is an invalid escape sequence.
nhl['area_team'] = nhl['team'].apply(lambda x: re.sub(r"\s", "", x).strip().lower())
print(nhl)

The objective is to map every team in the nhl dataframe to an area/city in the cities_nhl dataframe. The first step is to merge the two dataframes on the area_team column, which assigns an area to most of the teams. The remaining teams are processed separately.

In [None]:
# both frames now share the area_team key, so they can be joined on it;
# a left join from the cities side keeps every area, matched or not

merge_areas = cities_nhl.merge(nhl, how='left', on=['area_team'])
print(merge_areas[["area", "team"]])

In [None]:
# same join seen from the standings side: every team is kept, matched or not
merge_teams = nhl.merge(cities_nhl, how='left', on=['area_team'])
print(merge_teams[["team", "area", "area_team"]])

In [None]:
# teams whose key found no city: their 'area' is missing after the left join
team_without_area = merge_teams['area'].isna()
teams_no_clear_area = merge_teams[team_without_area]
print(teams_no_clear_area)

In [None]:
# areas whose key found no team: their 'team' value is missing after the left join from the cities side
areas_no_clear_team = merge_areas[merge_areas["team"].isna()]

In [None]:
# The unmatched teams have to be handled by hand. To spot the correspondence,
# put the unmatched [area_team] keys of the nhl DF next to the unmatched
# [area_team] keys of the cities_nhl DF.

# both columns are renumbered from 0 so that they line up row by row
area_team_no_match_nhl_DF = teams_no_clear_area.reset_index(drop=True)['area_team']
area_team_no_match_cities_nhl_DF = areas_no_clear_team.reset_index(drop=True)['area_team']

side_by_side = {
    "team_no_area": area_team_no_match_nhl_DF,
    "area_no_name": area_team_no_match_cities_nhl_DF,
}
no_match = pd.DataFrame(side_by_side)
print(no_match)

In [None]:
# The side-by-side frame printed above makes the correspondence between the two
# columns easy to read; this dictionary records it
# (key built from the cities table -> key built from the standings).

mapping = {
    "newyorkcityrangersislandersdevils": "newyorkislanders",
    "losangeleskingsducks": "losangeleskings",
    "dallas–fortworthstars": "dallasstars",
    'washington,d.c.capitals': "washingtoncapitals",
    "minneapolis–saintpaulwild": "minnesotawild",
    "denveravalanche": "coloradoavalanche",
    "miami–fortlauderdalepanthers": "floridapanthers",
    "tampabayarealightning": "tampabaylightning",
    "st.louisblues": "stlouisblues",
    "lasvegasgoldenknights": "vegasgoldenknights",
    "phoenixcoyotes": "arizonacoyotes",
    "raleighhurricanes": "carolinahurricanes",
    "sanfranciscobayareasharks": "sanjosesharks",
}

# replace the mapped keys in the cities_nhl DF; keys without an entry stay as they are
cities_nhl['area_team'] = [
    mapping[key].strip() if key in mapping else key
    for key in cities_nhl['area_team']
]

# redo the join from the standings side with the corrected keys
merge_teams = nhl.merge(cities_nhl, how='left', on=['area_team'])
print(merge_teams[['area', 'team', 'area_team']])

In [None]:
# As expected, 3 teams still have no associated area. The next step is to attach
# each of them to one of the listed areas, which takes some knowledge of the
# data and a little research.

still_unassigned = merge_teams['area'].isna()

print("LEFT OUT TEAMS")
print(merge_teams.loc[still_unassigned, 'team'])
print()
print("AREAS:")
print(cities_nhl['area'])

### Research Results
According to the following [link](https://en.wikipedia.org/wiki/New_Jersey_Devils), the ***new jersey devils*** can be assigned to the ***New York City*** area. The ***new york rangers*** are assigned to the same area as well. As for the ***anaheim ducks***, they belong to the ***Los Angeles*** area according to the following links:
* [link1](https://en.wikipedia.org/wiki/Anaheim_Ducks#:~:text=Anaheim%20Ducks%20The%20Anaheim%20Ducks%20are%20a%20professional,and%20play%20their%20home%20games%20at%20Honda%20Center.)
* [link2](https://en.wikipedia.org/wiki/Anaheim,_California)

In [None]:
# manual assignments: cleaned team name -> area name
team_area = {"new jersey devils": "New York City", "new york rangers": "New York City", "anaheim ducks": "Los Angeles"}

def set_areas(row):
    """Return the row with its 'area' filled in when the team is listed in ``team_area``.

    Args:
        row: a row exposing at least the 'team' and 'area' entries.

    Returns:
        The row itself when its team is not listed in ``team_area``; otherwise
        a copy whose 'area' holds the manually assigned area.
    """
    if row['team'] not in team_area:
        return row
    # work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply
    updated = row.copy()
    updated['area'] = team_area[row['team']]
    return updated
# run the manual assignment over every row of the merged frame
merge_teams = merge_teams.apply(set_areas, axis=1)

print(merge_teams[['area', 'team', 'area_team']])

## Final Step
Now that the dataframe is cleaned, merged and filled with the necessary values, we can proceed to the final step:
compute the win-loss ratio for each team, then group the teams by area and examine the correlation between the win-loss values and the area's population.

In [None]:
# one row per team, limited to the columns needed for the final figures
final_df = merge_teams.loc[:, ['area', 'team', 'W', 'L', 'pop']]

# share of wins among wins plus losses, W / (W + L), stored under the name 'win_loss_ratio'
wins = final_df['W'].astype(float)
losses = final_df['L'].astype(float)
final_df['win_loss_ratio'] = wins / (wins + losses)

# average the ratio and the population over the teams of each area
per_team = final_df.loc[:, ['area', 'win_loss_ratio', 'pop']]
final_df = (
    per_team
    .set_index('area')
    .astype(float)
    .groupby('area')
    .agg({"win_loss_ratio": 'mean', 'pop': 'mean'})
)
print(final_df)

In [None]:
# pairwise correlation between the per-area mean win_loss_ratio and the per-area mean population
print(final_df.corr())

In [23]:
# Save the per-area table. The path is relative to the working directory (like the
# utility_files/ inputs) rather than an absolute home-directory path, so the
# cell also runs on other machines.
final_df.to_csv("nhl_final.csv")