In [40]:
import pandas as pd
import numpy as np
# import scipy.stats as stats
import re

In [41]:
# season to analyse; compared against the 'year' column when the MLB standings are filtered
year = 2018

In [42]:
# load the metropolitan-area table and keep only the columns relevant to the MLB analysis
cities = pd.read_csv("cities.csv")
mlb_related_columns = ['area', 'pop', 'mlb']
cities_mlb = cities.loc[:, mlb_related_columns]

def clean_team_name(name):
    """Normalise a raw team-name cell.

    Every bracketed annotation (for example ``[note 1]``) is removed, then the
    remaining text is lower-cased and stripped of leading/trailing whitespace.

    Parameters
    ----------
    name : str
        Raw cell content.

    Returns
    -------
    str
        The cleaned name; an empty string when nothing but annotations was present.
    """
    # Match each [...] group on its own. A greedy '.*' would run from the first '['
    # to the last ']' and silently delete any team name sitting between two notes.
    without_notes = re.sub(r'\[[^\]]*\]', "", name)
    # convert to lower case and drop surrounding whitespace
    return without_notes.lower().strip()

cities_mlb['mlb'] = cities_mlb['mlb'].apply(clean_team_name)
# drop every character outside the Latin-1 range (code points above U+00FF)
cities_mlb['mlb'] = cities_mlb['mlb'].apply(lambda x: re.sub(r"[^\x00-\xFF]", "", x))
# final cleaning step: keep only ASCII letters, digits and whitespace.
# Parentheses inside a character class are literal characters, not grouping,
# so they must not appear in the class or they would be preserved.
cities_mlb['mlb'] = cities_mlb['mlb'].apply(lambda x: re.sub(r"[^A-Za-z\d\s]", "", x))

# at this point cities with no mlb team are assigned the empty string in the "mlb" column
# keep the cities with mlb teams
cities_mlb = cities_mlb[cities_mlb['mlb'] != '']
print(cities_mlb)
# renumber the remaining rows 0..n-1, discarding the old index
cities_mlb = cities_mlb.reset_index(drop=True)


                      area       pop              mlb
0            New York City  20153634      yankeesmets
1              Los Angeles  13310447    dodgersangels
2   San Francisco Bay Area   6657982  giantsathletics
3                  Chicago   9512999    cubswhite sox
4        Dallas–Fort Worth   7233323          rangers
5         Washington, D.C.   6131977        nationals
6             Philadelphia   6070500         phillies
7                   Boston   4794447          red sox
8   Minneapolis–Saint Paul   3551036            twins
9                   Denver   2853077          rockies
10   Miami–Fort Lauderdale   6066387          marlins
11                 Phoenix   4661537     diamondbacks
12                 Detroit   4297617           tigers
13                 Toronto   5928040        blue jays
14                 Houston   6772470           astros
15                 Atlanta   5789700           braves
16          Tampa Bay Area   3032171             rays
17              Pittsburgh  

In [43]:
# in order to map each team with its area, a new column should be added 
# that groups both the area/city name as well as the team's name

def area_team(row):
    """Build the join key for one row: area name followed by team name.

    Both parts are lower-cased and have all whitespace removed before being
    concatenated.

    Parameters
    ----------
    row : mapping
        Must provide string values under the keys 'area' and 'mlb'.

    Returns
    -------
    str
        The whitespace-free, lower-case concatenation ``area + mlb``.
    """
    # raw strings keep '\s' a regex escape instead of an invalid Python string escape
    area_no_space = re.sub(r"\s", "", row['area']).lower()
    team_no_space = re.sub(r"\s", "", row['mlb']).lower()
    return area_no_space + team_no_space

# compute the combined area+team key row by row and store it as a new column
combined_keys = cities_mlb.apply(area_team, axis=1)
cities_mlb['area_team'] = combined_keys
print(cities_mlb[["area", "mlb", "area_team"]])

                      area              mlb  \
0            New York City      yankeesmets   
1              Los Angeles    dodgersangels   
2   San Francisco Bay Area  giantsathletics   
3                  Chicago    cubswhite sox   
4        Dallas–Fort Worth          rangers   
5         Washington, D.C.        nationals   
6             Philadelphia         phillies   
7                   Boston          red sox   
8   Minneapolis–Saint Paul            twins   
9                   Denver          rockies   
10   Miami–Fort Lauderdale          marlins   
11                 Phoenix     diamondbacks   
12                 Detroit           tigers   
13                 Toronto        blue jays   
14                 Houston           astros   
15                 Atlanta           braves   
16          Tampa Bay Area             rays   
17              Pittsburgh          pirates   
18               Cleveland          indians   
19                 Seattle         mariners   
20           

In [44]:
# read the full standings table, then keep only the rows of the selected season
mlb_org = pd.read_csv("utility_files/mlb.csv")
selected_season = mlb_org['year'] == year
mlb = mlb_org[selected_season]
print(mlb.columns)

Index(['team', 'W', 'L', 'W-L%', 'GB', 'year', 'League'], dtype='object')


In [45]:
# only the team name and its win / loss counts are needed from here on
cols = ["team", "W", "L"]
mlb = mlb.loc[:, cols]
print(mlb.loc[:, 'team'])

0            Boston Red Sox
1          New York Yankees
2            Tampa Bay Rays
3         Toronto Blue Jays
4         Baltimore Orioles
5         Cleveland Indians
6           Minnesota Twins
7            Detroit Tigers
8         Chicago White Sox
9        Kansas City Royals
10           Houston Astros
11        Oakland Athletics
12         Seattle Mariners
13       Los Angeles Angels
14            Texas Rangers
15           Atlanta Braves
16     Washington Nationals
17    Philadelphia Phillies
18            New York Mets
19            Miami Marlins
20        Milwaukee Brewers
21             Chicago Cubs
22      St. Louis Cardinals
23       Pittsburgh Pirates
24          Cincinnati Reds
25      Los Angeles Dodgers
26         Colorado Rockies
27     Arizona Diamondbacks
28     San Francisco Giants
29         San Diego Padres
Name: team, dtype: object


In [46]:
# add the area_team key to the mlb DataFrame: the full team name, lower-cased and
# with all whitespace removed (raw string so '\s' is a regex escape, not a string escape)
mlb['area_team'] = mlb['team'].apply(lambda x: re.sub(r"\s", "", x).lower())
print(mlb)

                     team    W    L             area_team
0          Boston Red Sox  108   54          bostonredsox
1        New York Yankees  100   62        newyorkyankees
2          Tampa Bay Rays   90   72          tampabayrays
3       Toronto Blue Jays   73   89       torontobluejays
4       Baltimore Orioles   47  115      baltimoreorioles
5       Cleveland Indians   91   71      clevelandindians
6         Minnesota Twins   78   84        minnesotatwins
7          Detroit Tigers   64   98         detroittigers
8       Chicago White Sox   62  100       chicagowhitesox
9      Kansas City Royals   58  104      kansascityroyals
10         Houston Astros  103   59         houstonastros
11      Oakland Athletics   97   65      oaklandathletics
12       Seattle Mariners   89   73       seattlemariners
13     Los Angeles Angels   80   82      losangelesangels
14          Texas Rangers   67   95          texasrangers
15         Atlanta Braves   90   72         atlantabraves
16   Washingto

In [47]:
# both DataFrames now share the area_team column, so they can be joined on it;
# a left join keeps every area, with NaN where no team key matched

merge_areas = pd.merge(
    cities_mlb,
    mlb,
    how='left',
    on=['area_team'],
)
print(merge_areas[["area", "team"]])

                      area                   team
0            New York City                    NaN
1              Los Angeles                    NaN
2   San Francisco Bay Area                    NaN
3                  Chicago                    NaN
4        Dallas–Fort Worth                    NaN
5         Washington, D.C.                    NaN
6             Philadelphia  Philadelphia Phillies
7                   Boston         Boston Red Sox
8   Minneapolis–Saint Paul                    NaN
9                   Denver                    NaN
10   Miami–Fort Lauderdale                    NaN
11                 Phoenix                    NaN
12                 Detroit         Detroit Tigers
13                 Toronto      Toronto Blue Jays
14                 Houston         Houston Astros
15                 Atlanta         Atlanta Braves
16          Tampa Bay Area                    NaN
17              Pittsburgh     Pittsburgh Pirates
18               Cleveland      Cleveland Indians


In [48]:

# NOTE(review): this cell repeats the merge of the previous cell verbatim and
# produces the same result; it looks like a candidate for removal.
merge_areas = pd.merge(
    cities_mlb,
    mlb,
    how='left',
    on=['area_team'],
)
print(merge_areas[["area", "team"]])

                      area                   team
0            New York City                    NaN
1              Los Angeles                    NaN
2   San Francisco Bay Area                    NaN
3                  Chicago                    NaN
4        Dallas–Fort Worth                    NaN
5         Washington, D.C.                    NaN
6             Philadelphia  Philadelphia Phillies
7                   Boston         Boston Red Sox
8   Minneapolis–Saint Paul                    NaN
9                   Denver                    NaN
10   Miami–Fort Lauderdale                    NaN
11                 Phoenix                    NaN
12                 Detroit         Detroit Tigers
13                 Toronto      Toronto Blue Jays
14                 Houston         Houston Astros
15                 Atlanta         Atlanta Braves
16          Tampa Bay Area                    NaN
17              Pittsburgh     Pittsburgh Pirates
18               Cleveland      Cleveland Indians


In [49]:
# same join seen from the teams' side: every team is kept, NaN area where no key matched
merge_teams = pd.merge(
    mlb,
    cities_mlb,
    how='left',
    on=['area_team'],
)
print(merge_teams[["team", "area", "area_team"]])

                     team          area             area_team
0          Boston Red Sox        Boston          bostonredsox
1        New York Yankees           NaN        newyorkyankees
2          Tampa Bay Rays           NaN          tampabayrays
3       Toronto Blue Jays       Toronto       torontobluejays
4       Baltimore Orioles     Baltimore      baltimoreorioles
5       Cleveland Indians     Cleveland      clevelandindians
6         Minnesota Twins           NaN        minnesotatwins
7          Detroit Tigers       Detroit         detroittigers
8       Chicago White Sox           NaN       chicagowhitesox
9      Kansas City Royals   Kansas City      kansascityroyals
10         Houston Astros       Houston         houstonastros
11      Oakland Athletics           NaN      oaklandathletics
12       Seattle Mariners       Seattle       seattlemariners
13     Los Angeles Angels           NaN      losangelesangels
14          Texas Rangers           NaN          texasrangers
15      

In [50]:
# teams whose key did not match any area in the left join
team_without_area = merge_teams['area'].isna()
teams_no_clear_area = merge_teams[team_without_area]
print(teams_no_clear_area)

                    team    W    L            area_team area  pop  mlb
1       New York Yankees  100   62       newyorkyankees  NaN  NaN  NaN
2         Tampa Bay Rays   90   72         tampabayrays  NaN  NaN  NaN
6        Minnesota Twins   78   84       minnesotatwins  NaN  NaN  NaN
8      Chicago White Sox   62  100      chicagowhitesox  NaN  NaN  NaN
11     Oakland Athletics   97   65     oaklandathletics  NaN  NaN  NaN
13    Los Angeles Angels   80   82     losangelesangels  NaN  NaN  NaN
14         Texas Rangers   67   95         texasrangers  NaN  NaN  NaN
16  Washington Nationals   82   80  washingtonnationals  NaN  NaN  NaN
18         New York Mets   77   85          newyorkmets  NaN  NaN  NaN
19         Miami Marlins   63   98         miamimarlins  NaN  NaN  NaN
21          Chicago Cubs   95   68          chicagocubs  NaN  NaN  NaN
25   Los Angeles Dodgers   92   71    losangelesdodgers  NaN  NaN  NaN
26      Colorado Rockies   91   72      coloradorockies  NaN  NaN  NaN
27  Ar

In [51]:
# areas whose key did not match any team in the left join
area_without_team = merge_areas["team"].isna()
areas_no_clear_team = merge_areas[area_without_team]

In [52]:
# the teams left out with no clear area name are to be processed manually
# first, put the unmatched [area_team] keys of the mlb DF next to the unmatched
# [area_team] keys of the cities_mlb DF to look for a possible mapping.
# Both columns are renumbered from 0, so rows are paired by position only.

area_team_no_match_mlb_DF = teams_no_clear_area.reset_index(drop=True)['area_team']
area_team_no_match_cities_mlb_DF = areas_no_clear_team.reset_index(drop=True)['area_team']

no_match = pd.DataFrame(
    {
        "team_no_area": area_team_no_match_mlb_DF,
        "area_no_name": area_team_no_match_cities_mlb_DF,
    }
)
print(no_match)

           team_no_area                        area_no_name
0        newyorkyankees              newyorkcityyankeesmets
1          tampabayrays             losangelesdodgersangels
2        minnesotatwins  sanfranciscobayareagiantsathletics
3       chicagowhitesox                 chicagocubswhitesox
4      oaklandathletics             dallas–fortworthrangers
5      losangelesangels            washington,d.c.nationals
6          texasrangers          minneapolis–saintpaultwins
7   washingtonnationals                       denverrockies
8           newyorkmets         miami–fortlauderdalemarlins
9          miamimarlins                 phoenixdiamondbacks
10          chicagocubs                    tampabayarearays
11    losangelesdodgers                                 NaN
12      coloradorockies                                 NaN
13  arizonadiamondbacks                                 NaN
14   sanfranciscogiants                                 NaN


In [53]:
# the last dataframe made it easy to see the mapping between the two columns
# the following dictionary reflects the mapping
# (key: area_team value in cities_mlb, value: area_team value in mlb)

mapping = {
    "newyorkcityyankeesmets": "newyorkyankees",
    "losangelesdodgersangels": "losangelesdodgers",
    "sanfranciscobayareagiantsathletics": "sanfranciscogiants",
    'chicagocubswhitesox': "chicagowhitesox",
    "washington,d.c.nationals": "washingtonnationals",
    "minneapolis–saintpaultwins": "minnesotatwins",
    "denverrockies": "coloradorockies",
    "miami–fortlauderdalemarlins": "miamimarlins",
    "phoenixdiamondbacks": "arizonadiamondbacks",
    "tampabayarearays": "tampabayrays",
    "dallas–fortworthrangers": "texasrangers",
}

# the next step is to replace the old area_team names in the cities_mlb DF by their
# mapped value; names without an entry in the mapping are left unchanged


def _mapped_area_team(old_name):
    if old_name not in mapping:
        return old_name
    return mapping[old_name].strip()


cities_mlb['area_team'] = cities_mlb['area_team'].apply(_mapped_area_team)

merge_teams = pd.merge(mlb, cities_mlb, how='left', on=['area_team'])
print(merge_teams[['area', 'team', 'area_team']])

                      area                   team             area_team
0                   Boston         Boston Red Sox          bostonredsox
1            New York City       New York Yankees        newyorkyankees
2           Tampa Bay Area         Tampa Bay Rays          tampabayrays
3                  Toronto      Toronto Blue Jays       torontobluejays
4                Baltimore      Baltimore Orioles      baltimoreorioles
5                Cleveland      Cleveland Indians      clevelandindians
6   Minneapolis–Saint Paul        Minnesota Twins        minnesotatwins
7                  Detroit         Detroit Tigers         detroittigers
8                  Chicago      Chicago White Sox       chicagowhitesox
9              Kansas City     Kansas City Royals      kansascityroyals
10                 Houston         Houston Astros         houstonastros
11                     NaN      Oakland Athletics      oaklandathletics
12                 Seattle       Seattle Mariners       seattlem

In [54]:
# as expected there are 4 teams with no associated area; the next step is to associate
# each of them with one of the areas provided. This task requires more human
# understanding of the data and a little bit of research

print("LEFT OUT TEAMS")
still_without_area = merge_teams['area'].isna()
print(merge_teams.loc[still_without_area, 'team'])
print()

print("AREAS:")
print(cities_mlb['area'])


LEFT OUT TEAMS
11     Oakland Athletics
13    Los Angeles Angels
18         New York Mets
21          Chicago Cubs
Name: team, dtype: object

AREAS:
0              New York City
1                Los Angeles
2     San Francisco Bay Area
3                    Chicago
4          Dallas–Fort Worth
5           Washington, D.C.
6               Philadelphia
7                     Boston
8     Minneapolis–Saint Paul
9                     Denver
10     Miami–Fort Lauderdale
11                   Phoenix
12                   Detroit
13                   Toronto
14                   Houston
15                   Atlanta
16            Tampa Bay Area
17                Pittsburgh
18                 Cleveland
19                   Seattle
20                Cincinnati
21               Kansas City
22                 St. Louis
23                 Baltimore
24                 Milwaukee
25                 San Diego
Name: area, dtype: object


In [55]:
# manual assignment (team name -> area) for the teams still lacking an area
team_area = {
    "Oakland Athletics": "San Francisco Bay Area",
    "Los Angeles Angels": "Los Angeles",
    "New York Mets": "New York City",
    "Chicago Cubs": "Chicago",
}

def set_areas(row, areas=None):
    """Return ``row`` with its 'area' filled in from a team -> area lookup.

    Parameters
    ----------
    row : mapping
        One record providing at least the keys 'team' and 'area'
        (for example a row passed by ``DataFrame.apply(..., axis=1)``).
    areas : dict, optional
        Team name -> area name lookup. When omitted, the module-level
        ``team_area`` dictionary is used.

    Returns
    -------
    The unchanged ``row`` when its team is not in the lookup; otherwise a copy
    of ``row`` whose 'area' entry is replaced by the looked-up area.
    """
    lookup = team_area if areas is None else areas
    if row['team'] not in lookup:
        return row
    # work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas and can give unexpected results
    patched = row.copy()
    patched['area'] = lookup[row['team']]
    return patched
# fill in the manually assigned areas row by row, then show the result
merge_teams = merge_teams.apply(set_areas, axis="columns")
print(merge_teams[['area', 'team', 'area_team']])


                      area                   team             area_team
0                   Boston         Boston Red Sox          bostonredsox
1            New York City       New York Yankees        newyorkyankees
2           Tampa Bay Area         Tampa Bay Rays          tampabayrays
3                  Toronto      Toronto Blue Jays       torontobluejays
4                Baltimore      Baltimore Orioles      baltimoreorioles
5                Cleveland      Cleveland Indians      clevelandindians
6   Minneapolis–Saint Paul        Minnesota Twins        minnesotatwins
7                  Detroit         Detroit Tigers         detroittigers
8                  Chicago      Chicago White Sox       chicagowhitesox
9              Kansas City     Kansas City Royals      kansascityroyals
10                 Houston         Houston Astros         houstonastros
11  San Francisco Bay Area      Oakland Athletics      oaklandathletics
12                 Seattle       Seattle Mariners       seattlem

In [56]:
# per-team share of games won, W / (W + L), stored under the column name 'win_loss_ratio'
final_df = merge_teams.loc[:, ['area', 'team', 'W', 'L', 'pop']]
games_won = final_df['W'].astype(float)
games_lost = final_df['L'].astype(float)
final_df['win_loss_ratio'] = games_won / (games_won + games_lost)
final_df = final_df.loc[:, ['area', 'win_loss_ratio', 'pop']]

# one row per area: average the ratio (and the population) over the area's teams
final_df = (
    final_df
    .set_index('area')
    .astype(float)
    .groupby('area')
    .agg({"win_loss_ratio": 'mean', 'pop': 'mean'})
)
print(final_df)

                        win_loss_ratio         pop
area                                              
Atlanta                       0.555556   5789700.0
Baltimore                     0.290123   2798886.0
Boston                        0.666667   4794447.0
Chicago                       0.482769   9512999.0
Cincinnati                    0.413580   2165139.0
Cleveland                     0.561728   2055612.0
Dallas–Fort Worth             0.413580   7233323.0
Denver                        0.558282   2853077.0
Detroit                       0.395062   4297617.0
Houston                       0.635802   6772470.0
Kansas City                   0.358025   2104509.0
Los Angeles                   0.529122  13310447.0
Miami–Fort Lauderdale         0.391304   6066387.0
Milwaukee                     0.588957   1572482.0
Minneapolis–Saint Paul        0.481481   3551036.0
New York City                 0.546296  20153634.0
Philadelphia                  0.493827   6070500.0
Phoenix                       0

In [57]:
# pairwise correlation between the averaged win ratio and the population of each area
correlation_table = final_df.corr()
print(correlation_table)

                win_loss_ratio       pop
win_loss_ratio        1.000000  0.150277
pop                   0.150277  1.000000


In [58]:
# Persist the per-area summary. A path relative to the working directory is used,
# like for the input CSV files, so the cell does not depend on one machine's
# home-directory layout.
final_df.to_csv("mlb_final.csv")