## Overview
This notebook is part of a data analysis task (mini-project) to answer the following question:  

What is the correlation between a team's win/loss ratio and the population of the city it is in? Do these correlations differ between the four major leagues?
* NHL: Hockey league
* NBA: Basketball league
* MLB: Baseball League
* NFL: Football (American football) League

This notebook considers the National Basketball Association (NBA).

In [1]:
import pandas as pd
import numpy as np
import re
import os
import cleaning_data as c
# Season to analyse: the standings table is filtered to the rows whose `year` column equals this value
year = 2018
from cleaning_data import cities_original_path, cities_clean_path, nbl_path, final_nbl_path

In [2]:
# cities_original_path = "utility_files/wiki_data.html"
# cities_clean_path = "utility_files/cities.csv" 
# nbl_path = "utility_files/nbl.csv"
# final_nbl_path = "nbl_final.csv"

In [3]:
# Run the city-cleaning helper only when the cleaned file is missing
# (presumably the helper writes `cities_clean_path` -- confirm in cleaning_data)
cities_file_ready = os.path.exists(cities_clean_path)
if not cities_file_ready:
    c.clean_city_data(cities_original_path, cities_clean_path)


### Load and clean (the NBA-related parts of) the cities dataset

In [4]:
# Read the cleaned cities table from disk
cities = pd.read_csv(cities_clean_path)
# Keep only the three columns used in this notebook: area name, population and the NBA team cell
cities_nba = cities.loc[:, ['area', 'pop', 'nba']]

def clean_team_name(name):
    """Normalise a raw team cell taken from the cities table.

    Every square-bracketed annotation (for example a footnote marker such as
    "[note 1]") is removed, then the text is lower-cased and surrounding
    whitespace is trimmed.

    Parameters
    ----------
    name : str
        Raw cell content.

    Returns
    -------
    str
        The cleaned, lower-cased name.
    """
    # Each bracketed group is removed on its own. A greedy ".*" between the
    # brackets would also swallow any team name sitting between the first "["
    # and the last "]" of the cell.
    without_notes = re.sub(r"\[[^\]]*\]", "", name)
    # convert to lower case and remove surrounding spaces
    return without_notes.lower().strip()

# Normalise the raw team cell: drop bracketed notes, lower-case, trim
cities_nba['nba'] = cities_nba['nba'].apply(clean_team_name)
# removing non-ASCII characters (ASCII ends at \x7F; an upper bound of \xFF would keep Latin-1 symbols)
cities_nba['nba'] = cities_nba['nba'].apply(lambda x: re.sub(r"[^\x00-\x7F]", "", x))
# final cleaning step: keep only letters, digits and whitespace
# (parentheses inside a character class are literal characters, so they are not listed)
cities_nba['nba'] = cities_nba['nba'].apply(lambda x: re.sub(r"[^A-Za-z\d\s]", "", x))

# at this point cities with no nba team are assigned the empty string in the "nba" column
# keep the cities with nba teams
cities_nba = cities_nba[cities_nba['nba'] != '']
print(cities_nba)
# renumber the rows 0..n-1 (the filter above leaves gaps in the index)
cities_nba = cities_nba.reset_index(drop=True)

## after indexing
print()
print("after indexing")
print("##" * 40)

print(cities_nba)

                      area       pop             nba
0            New York City  20153634      knicksnets
1              Los Angeles  13310447  lakersclippers
2   San Francisco Bay Area   6657982        warriors
3                  Chicago   9512999           bulls
4        Dallas–Fort Worth   7233323       mavericks
5         Washington, D.C.   6131977         wizards
6             Philadelphia   6070500           76ers
7                   Boston   4794447         celtics
8   Minneapolis–Saint Paul   3551036    timberwolves
9                   Denver   2853077         nuggets
10   Miami–Fort Lauderdale   6066387            heat
11                 Phoenix   4661537            suns
12                 Detroit   4297617         pistons
13                 Toronto   5928040         raptors
14                 Houston   6772470         rockets
15                 Atlanta   5789700           hawks
18               Cleveland   2055612       cavaliers
24               Charlotte   2474314         h

In [5]:
# in order to map each team with its area, a new column should be added 
# that groups both the area/city name as well as the team's name

def area_team(row):
    """Build the join key for one cities row: area name followed by team name(s).

    Both parts have all whitespace removed and are lower-cased before being
    concatenated, e.g. area "New York City" with nba "knicks nets" gives
    "newyorkcityknicksnets". Punctuation and dashes are left untouched.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'area' and 'nba' (a DataFrame row when
        used through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The concatenated key.
    """
    def _squash(text):
        # Raw-string pattern: "\s" in a normal string literal is an invalid escape sequence
        return re.sub(r"\s", "", text).lower()

    return _squash(row['area']) + _squash(row['nba'])

# Compute the join key for every city row and store it as a new column
area_team_keys = cities_nba.apply(area_team, axis=1)
cities_nba['area_team'] = area_team_keys
# Show the key next to the two columns it is derived from
preview_columns = ["area",  "nba", "area_team"]
print(cities_nba.loc[:, preview_columns])

                      area             nba                          area_team
0            New York City      knicksnets              newyorkcityknicksnets
1              Los Angeles  lakersclippers           losangeleslakersclippers
2   San Francisco Bay Area        warriors        sanfranciscobayareawarriors
3                  Chicago           bulls                       chicagobulls
4        Dallas–Fort Worth       mavericks          dallas–fortworthmavericks
5         Washington, D.C.         wizards             washington,d.c.wizards
6             Philadelphia           76ers                  philadelphia76ers
7                   Boston         celtics                      bostonceltics
8   Minneapolis–Saint Paul    timberwolves  minneapolis–saintpaultimberwolves
9                   Denver         nuggets                      denvernuggets
10   Miami–Fort Lauderdale            heat           miami–fortlauderdaleheat
11                 Phoenix            suns                      

### Clean and explore the NBA dataset

In [6]:
# it is time to consider the nba DataFrame
# Read the full standings table, then keep only the rows of the configured season
nba_org = pd.read_csv("utility_files/nba.csv")
season_mask = nba_org['year'].eq(year)
nba = nba_org[season_mask]
print(nba.columns)

Index(['team', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'year',
       'League'],
      dtype='object')


In [7]:
# Only the team label and its win / loss counts are used from here on
cols = ["team", "W", "L"]
nba = nba.loc[:, cols]
# Inspect the raw team labels before cleaning them
print(nba['team'])

0            Toronto Raptors* (1)
1             Boston Celtics* (2)
2         Philadelphia 76ers* (3)
3        Cleveland Cavaliers* (4)
4             Indiana Pacers* (5)
5                 Miami Heat* (6)
6            Milwaukee Bucks* (7)
7         Washington Wizards* (8)
8             Detroit Pistons (9)
9          Charlotte Hornets (10)
10           New York Knicks (11)
11             Brooklyn Nets (12)
12             Chicago Bulls (13)
13             Orlando Magic (14)
14             Atlanta Hawks (15)
15           Houston Rockets* (1)
16     Golden State Warriors* (2)
17    Portland Trail Blazers* (3)
18     Oklahoma City Thunder* (4)
19                 Utah Jazz* (5)
20      New Orleans Pelicans* (6)
21         San Antonio Spurs* (7)
22    Minnesota Timberwolves* (8)
23             Denver Nuggets (9)
24      Los Angeles Clippers (10)
25        Los Angeles Lakers (11)
26          Sacramento Kings (12)
27          Dallas Mavericks (13)
28         Memphis Grizzlies (14)
29            

In [8]:
# at first glance we can detect at least 2 main issues with the team column:
# 1. the need for reformatting the names
# 2. removing the rows declaring the teams' divisions

def clean_team_name_nba(name):
    """Normalise a team label from the standings table.

    Parenthesised groups (such as the "(1)" rank suffix) are removed, the text
    is lower-cased, every character that is not an ASCII letter, a digit or
    whitespace is dropped (e.g. the "*" marker), and surrounding whitespace is
    trimmed.

    Parameters
    ----------
    name : str
        Raw team label, e.g. "Toronto Raptors* (1)".

    Returns
    -------
    str
        Cleaned label, e.g. "toronto raptors".
    """
    # Remove each "(...)" group on its own instead of everything between the
    # first "(" and the last ")".
    without_parens = re.sub(r"\([^)]*\)", "", name).strip().lower()
    # A-Za-z rather than A-z: the A-z range also covers "[", "\", "]", "^", "_" and "`".
    # Parentheses are not listed because inside a character class they are literal characters.
    return re.sub(r"[^A-Za-z\d\s]", "", without_parens).strip()

# problem 1: bring every team label into its normalised form
nba['team'] = nba['team'].map(clean_team_name_nba)

# renumber the rows 0..n-1
nba = nba.reset_index(drop=True)
print(nba)


                      team   W   L
0          toronto raptors  59  23
1           boston celtics  55  27
2       philadelphia 76ers  52  30
3      cleveland cavaliers  50  32
4           indiana pacers  48  34
5               miami heat  44  38
6          milwaukee bucks  44  38
7       washington wizards  43  39
8          detroit pistons  39  43
9        charlotte hornets  36  46
10         new york knicks  29  53
11           brooklyn nets  28  54
12           chicago bulls  27  55
13           orlando magic  25  57
14           atlanta hawks  24  58
15         houston rockets  65  17
16   golden state warriors  58  24
17  portland trail blazers  49  33
18   oklahoma city thunder  48  34
19               utah jazz  48  34
20    new orleans pelicans  48  34
21       san antonio spurs  47  35
22  minnesota timberwolves  47  35
23          denver nuggets  46  36
24    los angeles clippers  42  40
25      los angeles lakers  35  47
26        sacramento kings  27  55
27        dallas mav

### Merge the Cities and NBA datasets
Each NBA team is associated with its area (if the area is included in the cities dataset).

In [9]:
# time to add the area_team name column to the nba DataFrame
# Join key on the standings side: the team label with all whitespace removed, lower-cased.
# The raw-string pattern avoids the invalid "\s" escape of a normal string literal.
nba['area_team'] = nba['team'].str.replace(r"\s", "", regex=True).str.lower()
print(nba)

                      team   W   L              area_team
0          toronto raptors  59  23         torontoraptors
1           boston celtics  55  27          bostonceltics
2       philadelphia 76ers  52  30      philadelphia76ers
3      cleveland cavaliers  50  32     clevelandcavaliers
4           indiana pacers  48  34          indianapacers
5               miami heat  44  38              miamiheat
6          milwaukee bucks  44  38         milwaukeebucks
7       washington wizards  43  39      washingtonwizards
8          detroit pistons  39  43         detroitpistons
9        charlotte hornets  36  46       charlottehornets
10         new york knicks  29  53          newyorkknicks
11           brooklyn nets  28  54           brooklynnets
12           chicago bulls  27  55           chicagobulls
13           orlando magic  25  57           orlandomagic
14           atlanta hawks  24  58           atlantahawks
15         houston rockets  65  17         houstonrockets
16   golden st

In [10]:
# having the area_team column  in common between the two DataFrames we can merge them

# City-centred left join: every city row is kept, and the team columns are NaN where the key has no counterpart
merge_areas = cities_nba.merge(nba, how='left', on='area_team')
print(merge_areas[["area", "team"]])

                      area                    team
0            New York City                     NaN
1              Los Angeles                     NaN
2   San Francisco Bay Area                     NaN
3                  Chicago           chicago bulls
4        Dallas–Fort Worth                     NaN
5         Washington, D.C.                     NaN
6             Philadelphia      philadelphia 76ers
7                   Boston          boston celtics
8   Minneapolis–Saint Paul                     NaN
9                   Denver          denver nuggets
10   Miami–Fort Lauderdale                     NaN
11                 Phoenix            phoenix suns
12                 Detroit         detroit pistons
13                 Toronto         toronto raptors
14                 Houston         houston rockets
15                 Atlanta           atlanta hawks
16               Cleveland     cleveland cavaliers
17               Charlotte       charlotte hornets
18            Indianapolis     

In [11]:
# Team-centred left join: every team row is kept, and the city columns are NaN where the key has no counterpart
merge_teams = nba.merge(cities_nba, how='left', on='area_team')
print(merge_teams[["team", "area", "area_team"]])

                      team           area              area_team
0          toronto raptors        Toronto         torontoraptors
1           boston celtics         Boston          bostonceltics
2       philadelphia 76ers   Philadelphia      philadelphia76ers
3      cleveland cavaliers      Cleveland     clevelandcavaliers
4           indiana pacers            NaN          indianapacers
5               miami heat            NaN              miamiheat
6          milwaukee bucks      Milwaukee         milwaukeebucks
7       washington wizards            NaN      washingtonwizards
8          detroit pistons        Detroit         detroitpistons
9        charlotte hornets      Charlotte       charlottehornets
10         new york knicks            NaN          newyorkknicks
11           brooklyn nets            NaN           brooklynnets
12           chicago bulls        Chicago           chicagobulls
13           orlando magic        Orlando           orlandomagic
14           atlanta hawk

In [12]:
# Teams whose key matched no city row (their `area` is missing after the left join)
has_no_area = merge_teams['area'].isna()
teams_no_clear_area = merge_teams[has_no_area]
print(teams_no_clear_area)

                      team   W   L              area_team area  pop  nba
4           indiana pacers  48  34          indianapacers  NaN  NaN  NaN
5               miami heat  44  38              miamiheat  NaN  NaN  NaN
7       washington wizards  43  39      washingtonwizards  NaN  NaN  NaN
10         new york knicks  29  53          newyorkknicks  NaN  NaN  NaN
11           brooklyn nets  28  54           brooklynnets  NaN  NaN  NaN
16   golden state warriors  58  24    goldenstatewarriors  NaN  NaN  NaN
19               utah jazz  48  34               utahjazz  NaN  NaN  NaN
22  minnesota timberwolves  47  35  minnesotatimberwolves  NaN  NaN  NaN
24    los angeles clippers  42  40     losangelesclippers  NaN  NaN  NaN
25      los angeles lakers  35  47       losangeleslakers  NaN  NaN  NaN
27        dallas mavericks  24  58        dallasmavericks  NaN  NaN  NaN


In [13]:
# Cities whose key matched no team row (their `team` is missing after the left join)
has_no_team = merge_areas["team"].isna()
areas_no_clear_team = merge_areas[has_no_team]

### Processing the no-matches manually
Because the `area_team` key can take different values depending on the DataFrame it comes from, certain teams end up with no match in the merged DataFrame. Some of those teams can be matched easily by visual inspection, while others might require further searching.

In [14]:
# the teams left out with no clear area name are to be processed manually
# first let's consider the possibility of a mapping between the column [area_team] in the nba DF
# and the column [area_team] in the nba_cities DF

# Unmatched keys from each side, renumbered from 0 so the two columns line up row by row
area_team_no_match_nba_DF = teams_no_clear_area['area_team'].reset_index(drop=True)
area_team_no_match_cities_nba_DF = areas_no_clear_team['area_team'].reset_index(drop=True)

# Side-by-side view; the shorter column is padded with NaN
unmatched_columns = {
    "team_no_area": area_team_no_match_nba_DF,
    "area_no_name": area_team_no_match_cities_nba_DF,
}
no_match = pd.DataFrame(unmatched_columns)
print(no_match)

             team_no_area                       area_no_name
0           indianapacers              newyorkcityknicksnets
1               miamiheat           losangeleslakersclippers
2       washingtonwizards        sanfranciscobayareawarriors
3           newyorkknicks          dallas–fortworthmavericks
4            brooklynnets             washington,d.c.wizards
5     goldenstatewarriors  minneapolis–saintpaultimberwolves
6                utahjazz           miami–fortlauderdaleheat
7   minnesotatimberwolves                 indianapolispacers
8      losangelesclippers                   saltlakecityjazz
9        losangeleslakers                                NaN
10        dallasmavericks                                NaN


In [15]:
# the last dataframe made it easy to see the mapping betweent the two columns
# the following dictionary reflects the mapping

# Hand-built translation from city-side join keys to standings-side join keys
mapping = {
    "newyorkcityknicksnets": "newyorkknicks",
    "losangeleslakersclippers": "losangeleslakers",
    "sanfranciscobayareawarriors": "goldenstatewarriors",
    'dallas–fortworthmavericks': "dallasmavericks",
    "washington,d.c.wizards": "washingtonwizards",
    "minneapolis–saintpaultimberwolves": "minnesotatimberwolves",
    "miami–fortlauderdaleheat": "miamiheat",
    "indianapolispacers": "indianapacers",
    "saltlakecityjazz": "utahjazz",
}

# Rewrite the city-side keys that have a translation; every other key is kept as it is
cities_nba['area_team'] = cities_nba['area_team'].map(
    lambda key: mapping[key].strip() if key in mapping else key
)

# Redo the team-centred left join with the corrected keys
merge_teams = nba.merge(cities_nba, how='left', on='area_team')
print(merge_teams[['area', 'team', 'area_team']])

                      area                    team              area_team
0                  Toronto         toronto raptors         torontoraptors
1                   Boston          boston celtics          bostonceltics
2             Philadelphia      philadelphia 76ers      philadelphia76ers
3                Cleveland     cleveland cavaliers     clevelandcavaliers
4             Indianapolis          indiana pacers          indianapacers
5    Miami–Fort Lauderdale              miami heat              miamiheat
6                Milwaukee         milwaukee bucks         milwaukeebucks
7         Washington, D.C.      washington wizards      washingtonwizards
8                  Detroit         detroit pistons         detroitpistons
9                Charlotte       charlotte hornets       charlottehornets
10           New York City         new york knicks          newyorkknicks
11                     NaN           brooklyn nets           brooklynnets
12                 Chicago           c

In [16]:
# Teams that still have no area after the key translation
print("LEFT OUT TEAMS")
still_unmatched = merge_teams['area'].isna()
print(merge_teams.loc[still_unmatched, 'team'])
print()
# All area names, to pick the right one for each remaining team by eye
print("AREAS:")
print(cities_nba.loc[:, 'area'])

LEFT OUT TEAMS
11           brooklyn nets
24    los angeles clippers
Name: team, dtype: object

AREAS:
0              New York City
1                Los Angeles
2     San Francisco Bay Area
3                    Chicago
4          Dallas–Fort Worth
5           Washington, D.C.
6               Philadelphia
7                     Boston
8     Minneapolis–Saint Paul
9                     Denver
10     Miami–Fort Lauderdale
11                   Phoenix
12                   Detroit
13                   Toronto
14                   Houston
15                   Atlanta
16                 Cleveland
17                 Charlotte
18              Indianapolis
19                 Milwaukee
20               New Orleans
21                   Orlando
22                  Portland
23            Salt Lake City
24               San Antonio
25                Sacramento
26             Oklahoma City
27                   Memphis
Name: area, dtype: object


In [17]:
# Manual area assignment for the teams the key-based merge could not place
team_area = {"brooklyn nets": "New York City", "los angeles clippers": "Los Angeles"}

def set_areas(row):
    """Return `row` with its 'area' taken from `team_area` when the team is listed there.

    The input is never modified: pandas documents that functions passed to
    ``DataFrame.apply`` must not mutate the object they receive, so a patched
    copy is returned instead. Only 'area' is filled in; every other field
    (e.g. 'pop') is left as it was.

    Parameters
    ----------
    row : pandas.Series or dict
        Record with at least the keys 'team' and 'area'.

    Returns
    -------
    same type as `row`
        The original object when the team has no manual override, otherwise a
        copy carrying the overridden 'area'.
    """
    override = team_area.get(row['team'])
    if override is None:
        return row
    patched = row.copy()
    patched['area'] = override
    return patched
# Apply the manual area overrides row by row (axis=1 hands each row to set_areas)
merge_teams = merge_teams.apply(set_areas, axis=1)

# Show the area now attached to each team
print(merge_teams.loc[: , ['area', 'team', 'area_team']])

                      area                    team              area_team
0                  Toronto         toronto raptors         torontoraptors
1                   Boston          boston celtics          bostonceltics
2             Philadelphia      philadelphia 76ers      philadelphia76ers
3                Cleveland     cleveland cavaliers     clevelandcavaliers
4             Indianapolis          indiana pacers          indianapacers
5    Miami–Fort Lauderdale              miami heat              miamiheat
6                Milwaukee         milwaukee bucks         milwaukeebucks
7         Washington, D.C.      washington wizards      washingtonwizards
8                  Detroit         detroit pistons         detroitpistons
9                Charlotte       charlotte hornets       charlottehornets
10           New York City         new york knicks          newyorkknicks
11           New York City           brooklyn nets           brooklynnets
12                 Chicago           c

### Merge done: Time for statistics

In [18]:
# Per-team win share W / (W + L), computed on float copies of the two count columns
wins = merge_teams['W'].astype(float)
losses = merge_teams['L'].astype(float)
per_team = merge_teams.loc[:, ['area', 'team', 'W', 'L', 'pop']].assign(
    win_loss_ratio=wins / (wins + losses)
)
per_team = per_team.loc[:, ['area', 'win_loss_ratio', 'pop']]

# One row per area: teams sharing an area are averaged, and so is the population column
final_df = (
    per_team
    .set_index('area')
    .astype(float)
    .groupby('area')
    .agg({"win_loss_ratio": 'mean', 'pop': 'mean'})
)
print(final_df)

                        win_loss_ratio         pop
area                                              
Atlanta                       0.292683   5789700.0
Boston                        0.670732   4794447.0
Charlotte                     0.439024   2474314.0
Chicago                       0.329268   9512999.0
Cleveland                     0.609756   2055612.0
Dallas–Fort Worth             0.292683   7233323.0
Denver                        0.560976   2853077.0
Detroit                       0.475610   4297617.0
Houston                       0.792683   6772470.0
Indianapolis                  0.585366   2004230.0
Los Angeles                   0.469512  13310447.0
Memphis                       0.268293   1342842.0
Miami–Fort Lauderdale         0.536585   6066387.0
Milwaukee                     0.536585   1572482.0
Minneapolis–Saint Paul        0.573171   3551036.0
New Orleans                   0.585366   1268883.0
New York City                 0.347561  20153634.0
Oklahoma City                 0

In [19]:
# Pairwise correlation between the per-area win ratio and population (DataFrame.corr with its default method)
ratio_pop_corr = final_df.corr()
print(ratio_pop_corr)

                win_loss_ratio       pop
win_loss_ratio        1.000000 -0.176572
pop                  -0.176572  1.000000


In [20]:
# Write the per-area table (win ratio and population) to the output path imported from cleaning_data
final_df.to_csv(final_nbl_path)