In [1]:
import pandas as pd
import numpy as np
# import scipy.stats as stats
import re

In [2]:
# Season to analyse; used further down to keep only the rows of nfl.csv whose 'year' column equals it.
year = 2018

In [3]:
# Read the cities table and keep an independent frame holding only the
# area name, the population and the NFL team column.
cities = pd.read_csv("cities.csv")
cities_nfl = cities[['area', 'pop', 'nfl']].copy()
# Show the raw team strings so the cleaning they need is visible.
print(cities_nfl['nfl'])

0       GiantsJets[note 1]
1     RamsChargers[note 4]
2     49ersRaiders[note 6]
3            Bears[note 8]
4                  Cowboys
5                 Redskins
6                   Eagles
7        Patriots[note 14]
8                  Vikings
9                  Broncos
10                Dolphins
11               Cardinals
12                   Lions
13             — [note 22]
14         Texans[note 24]
15                 Falcons
16              Buccaneers
17                Steelers
18         Browns[note 29]
19                Seahawks
20                 Bengals
21                  Chiefs
22               [note 40]
23         Ravens[note 45]
24                Panthers
25                   Colts
26                  Titans
27               [note 53]
28                  Saints
29          Bills[note 56]
30                       —
31                       —
32                       —
33                       —
34                       —
35                       —
36                       —
3

In [4]:
def clean_team_name(name):
    """Normalise one raw team string from the cities table.

    Every bracketed footnote marker (e.g. ``[note 1]``) is removed, then the
    text is lower-cased and surrounding whitespace is trimmed.

    Parameters
    ----------
    name : str
        Raw cell value, e.g. ``"GiantsJets[note 1]"``.

    Returns
    -------
    str
        The cleaned name; may be the empty string when the cell held only a
        footnote marker.
    """
    # Match each [...] group on its own. A greedy '.*' would also delete any
    # text sitting between the first '[' and the last ']' of the string.
    without_notes = re.sub(r"\[[^\]]*\]", "", name)
    return without_notes.lower().strip()

# Strip footnote markers, lower-case and trim every team entry.
cities_nfl['nfl'] = cities_nfl['nfl'].apply(clean_team_name)

# Keep only ASCII letters, digits and whitespace. This removes the "—"
# placeholder used for areas without a team as well as any stray punctuation.
# re.ASCII makes \d and \s ASCII-only, and the character class no longer
# contains literal parentheses, so "(" and ")" are removed too.
# The final strip guarantees that an entry reduced to blanks becomes ''.
cities_nfl['nfl'] = (
    cities_nfl['nfl']
    .str.replace(r"[^A-Za-z\d\s]", "", regex=True, flags=re.ASCII)
    .str.strip()
)

# At this point areas with no NFL team hold the empty string in the "nfl"
# column; keep only the areas that do have a team.
cities_nfl = cities_nfl[cities_nfl['nfl'] != '']
print(cities_nfl)

# Renumber the remaining rows 0..n-1.
cities_nfl = cities_nfl.reset_index(drop=True)



                      area       pop           nfl
0            New York City  20153634    giantsjets
1              Los Angeles  13310447  ramschargers
2   San Francisco Bay Area   6657982  49ersraiders
3                  Chicago   9512999         bears
4        Dallas–Fort Worth   7233323       cowboys
5         Washington, D.C.   6131977      redskins
6             Philadelphia   6070500        eagles
7                   Boston   4794447      patriots
8   Minneapolis–Saint Paul   3551036       vikings
9                   Denver   2853077       broncos
10   Miami–Fort Lauderdale   6066387      dolphins
11                 Phoenix   4661537     cardinals
12                 Detroit   4297617         lions
14                 Houston   6772470        texans
15                 Atlanta   5789700       falcons
16          Tampa Bay Area   3032171    buccaneers
17              Pittsburgh   2342299      steelers
18               Cleveland   2055612        browns
19                 Seattle   37

In [5]:
# in order to map each team with its area, a new column should be added 
# that groups both the area/city name as well as the team's name

def area_team(row):
    """Build the join key for one cities row.

    The key is the area name followed by the team name, both lower-cased and
    with every whitespace character removed.

    Parameters
    ----------
    row : mapping with string values under 'area' and 'nfl'
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The concatenated key, e.g. ``"bostonpatriots"``.
    """
    # Raw-string patterns: a plain "\s" literal is an invalid escape sequence.
    area_key = re.sub(r"\s", "", row['area']).lower()
    team_key = re.sub(r"\s", "", row['nfl']).lower()
    return area_key + team_key

# Compute the area+team join key for every row and attach it as a new column.
cities_nfl = cities_nfl.assign(area_team=cities_nfl.apply(area_team, axis=1))


In [6]:
# Load the full standings file, then keep only the rows of the selected season.
nfl_org = pd.read_csv("utility_files/nfl.csv")
season_mask = nfl_org['year'] == year
nfl = nfl_org.loc[season_mask]


In [7]:
# Only the team name and the W / L columns are used from here on.
cols = ["team", "W", "L"]
nfl = nfl[cols].copy()
print(nfl['team'])

0                  AFC East
1     New England Patriots*
2            Miami Dolphins
3             Buffalo Bills
4             New York Jets
5                 AFC North
6         Baltimore Ravens*
7       Pittsburgh Steelers
8          Cleveland Browns
9        Cincinnati Bengals
10                AFC South
11          Houston Texans*
12      Indianapolis Colts+
13         Tennessee Titans
14     Jacksonville Jaguars
15                 AFC West
16      Kansas City Chiefs*
17    Los Angeles Chargers+
18           Denver Broncos
19          Oakland Raiders
20                 NFC East
21          Dallas Cowboys*
22     Philadelphia Eagles+
23      Washington Redskins
24          New York Giants
25                NFC North
26           Chicago Bears*
27        Minnesota Vikings
28        Green Bay Packers
29            Detroit Lions
30                NFC South
31      New Orleans Saints*
32        Carolina Panthers
33          Atlanta Falcons
34     Tampa Bay Buccaneers
35                 N

In [8]:
def clean_team_name_nfl(name):
    """Normalise one raw team string from the NFL standings table.

    Parenthesised groups are removed first, then every character that is not
    a letter, a digit or whitespace (e.g. the trailing ``*`` / ``+`` markers)
    is dropped. The result is trimmed and lower-cased.

    Parameters
    ----------
    name : str
        Raw cell value, e.g. ``"New England Patriots*"``.

    Returns
    -------
    str
        The cleaned, lower-case team name.
    """
    # Remove each (...) group on its own rather than greedily from the first
    # '(' to the last ')'.
    without_parens = re.sub(r"\([^)]*\)", "", name)
    # 'A-Za-z' instead of 'A-z': the latter range also contains [ \ ] ^ _ `,
    # and parentheses inside a character class are literal, so the old class
    # kept all of those characters.
    alnum_only = re.sub(r"[^A-Za-z\d\s]", "", without_parens)
    return alnum_only.strip().lower()

# Normalise every team name with clean_team_name_nfl.
nfl = nfl.assign(team=nfl['team'].map(clean_team_name_nfl))

# Rows whose name mentions "afc" or "nfc" are division headers, not teams; drop them.
mentions_afc = nfl['team'].str.contains("afc")
mentions_nfc = nfl['team'].str.contains("nfc")
nfl = nfl[~(mentions_afc | mentions_nfc)]

# Renumber the remaining rows 0..n-1.
nfl = nfl.reset_index(drop=True)
print(nfl)


                    team   W   L
0   new england patriots  11   5
1         miami dolphins   7   9
2          buffalo bills   6  10
3          new york jets   4  12
4       baltimore ravens  10   6
5    pittsburgh steelers   9   6
6       cleveland browns   7   8
7     cincinnati bengals   6  10
8         houston texans  11   5
9     indianapolis colts  10   6
10      tennessee titans   9   7
11  jacksonville jaguars   5  11
12    kansas city chiefs  12   4
13  los angeles chargers  12   4
14        denver broncos   6  10
15       oakland raiders   4  12
16        dallas cowboys  10   6
17   philadelphia eagles   9   7
18   washington redskins   7   9
19       new york giants   5  11
20         chicago bears  12   4
21     minnesota vikings   8   7
22     green bay packers   6   9
23         detroit lions   6  10
24    new orleans saints  13   3
25     carolina panthers   7   9
26       atlanta falcons   7   9
27  tampa bay buccaneers   5  11
28      los angeles rams  13   3
29      se

In [9]:
# Add the area_team join key to the nfl DataFrame: the team name, lower-cased,
# with every whitespace character removed. The vectorized .str accessor and a
# raw-string pattern replace the per-element lambda and the invalid "\s"
# escape. No strip is needed once all whitespace is gone.
nfl['area_team'] = nfl['team'].str.replace(r"\s", "", regex=True).str.lower()


In [10]:
# Left join from the areas' side: every area is kept, and its team columns are NaN when no key matches.
merge_areas = cities_nfl.merge(nfl, how='left', on='area_team')

In [11]:
# Left join from the teams' side: every team is kept, and its area columns are NaN when no key matches.
merge_teams = nfl.merge(cities_nfl, how='left', on='area_team')


In [12]:
# Rows that found no partner in the opposite table during the two left joins.
teams_no_clear_area = merge_teams.loc[merge_teams['area'].isnull()]
areas_no_clear_team = merge_areas.loc[merge_areas["team"].isnull()]

In [13]:
# The teams left without an area have to be matched by hand.
# To make that easier, put the unmatched [area_team] keys of the nfl DF
# side by side with the unmatched [area_team] keys of the cities_nfl DF.

# Both columns are renumbered from 0 so they line up row by row in the table.
area_team_no_match_nfl_DF = teams_no_clear_area['area_team'].reset_index(drop=True)
area_team_no_match_cities_nfl_DF = areas_no_clear_team['area_team'].reset_index(drop=True)

no_match = pd.DataFrame(
    {
        "team_no_area": area_team_no_match_nfl_DF,
        "area_no_name": area_team_no_match_cities_nfl_DF,
    }
)
print(no_match)

          team_no_area                     area_no_name
0   newenglandpatriots            newyorkcitygiantsjets
1        miamidolphins           losangelesramschargers
2          newyorkjets  sanfranciscobayarea49ersraiders
3      tennesseetitans          dallas–fortworthcowboys
4   losangeleschargers          washington,d.c.redskins
5       oaklandraiders                   bostonpatriots
6        dallascowboys     minneapolis–saintpaulvikings
7   washingtonredskins     miami–fortlauderdaledolphins
8        newyorkgiants                 phoenixcardinals
9     minnesotavikings           tampabayareabuccaneers
10    carolinapanthers                charlottepanthers
11  tampabaybuccaneers                  nashvilletitans
12      losangelesrams                              NaN
13   sanfrancisco49ers                              NaN
14    arizonacardinals                              NaN


In [14]:
# The last dataframe made it easy to see the mapping between the two columns.
# The dictionary below records it: cities_nfl key -> nfl key.
mapping = {
    "newyorkcitygiantsjets": "newyorkjets",
    "losangelesramschargers": "losangelesrams",
    "sanfranciscobayarea49ersraiders": "sanfrancisco49ers",
    'dallas–fortworthcowboys': "dallascowboys",
    "washington,d.c.redskins": "washingtonredskins",
    "bostonpatriots": "newenglandpatriots",
    "minneapolis–saintpaulvikings": "minnesotavikings",
    "miami–fortlauderdaledolphins": "miamidolphins",
    "phoenixcardinals": "arizonacardinals",
    "tampabayareabuccaneers": "tampabaybuccaneers",
    "charlottepanthers": "carolinapanthers",
    "nashvilletitans": "tennesseetitans",
}

# Replace each old area_team key in cities_nfl by its mapped (stripped) value;
# keys that are not in the mapping are left as they are.
stripped_mapping = {old_key: new_key.strip() for old_key, new_key in mapping.items()}
cities_nfl['area_team'] = cities_nfl['area_team'].map(
    lambda key: stripped_mapping.get(key, key)
)

# Redo the team-side left join with the corrected keys and show the outcome.
merge_teams = nfl.merge(cities_nfl, how='left', on='area_team')
print(merge_teams[['area', 'team', 'area_team']])

                      area                  team            area_team
0                   Boston  new england patriots   newenglandpatriots
1    Miami–Fort Lauderdale        miami dolphins        miamidolphins
2                  Buffalo         buffalo bills         buffalobills
3            New York City         new york jets          newyorkjets
4                Baltimore      baltimore ravens      baltimoreravens
5               Pittsburgh   pittsburgh steelers   pittsburghsteelers
6                Cleveland      cleveland browns      clevelandbrowns
7               Cincinnati    cincinnati bengals    cincinnatibengals
8                  Houston        houston texans        houstontexans
9             Indianapolis    indianapolis colts    indianapoliscolts
10               Nashville      tennessee titans      tennesseetitans
11            Jacksonville  jacksonville jaguars  jacksonvillejaguars
12             Kansas City    kansas city chiefs     kansascitychiefs
13                  

In [15]:
# As expected, there are 3 teams with no associated area. The next step is to
# associate each of them with one of the areas provided. This task requires
# more human understanding of the data and a little bit of research.

# Teams whose area is still missing after the corrected merge.
unplaced_teams = merge_teams.loc[merge_teams['area'].isna(), 'team']

print("LEFT OUT TEAMS")
print(unplaced_teams)
print()
print("AREAS:")
print(cities_nfl['area'])


LEFT OUT TEAMS
13    los angeles chargers
15         oakland raiders
19         new york giants
Name: team, dtype: object

AREAS:
0              New York City
1                Los Angeles
2     San Francisco Bay Area
3                    Chicago
4          Dallas–Fort Worth
5           Washington, D.C.
6               Philadelphia
7                     Boston
8     Minneapolis–Saint Paul
9                     Denver
10     Miami–Fort Lauderdale
11                   Phoenix
12                   Detroit
13                   Houston
14                   Atlanta
15            Tampa Bay Area
16                Pittsburgh
17                 Cleveland
18                   Seattle
19                Cincinnati
20               Kansas City
21                 Baltimore
22                 Charlotte
23              Indianapolis
24                 Nashville
25               New Orleans
26                   Buffalo
27              Jacksonville
28                 Green Bay
Name: area, dtype: object


In [16]:
# Manual area assignment for the teams that share an area with another team
# and therefore found no match through the area_team key.
team_area = {"los angeles chargers": "Los Angeles", "oakland raiders": "San Francisco Bay Area", "new york giants": "New York City"}


def set_areas(row):
    """Return `row` with its 'area' filled in from `team_area` when its team is listed there.

    Parameters
    ----------
    row : mapping with a 'team' entry
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The row itself when its team is not in `team_area`; otherwise a copy of
    the row whose 'area' entry is the manually assigned area.
    """
    if row['team'] not in team_area:
        return row
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    fixed_row = row.copy()
    fixed_row['area'] = team_area[row['team']]
    return fixed_row
# Apply the manual area assignments to every row.
merge_teams = merge_teams.apply(set_areas, axis=1)

# Show the columns that matter for checking the team -> area mapping.
print(merge_teams[['area', 'team', 'area_team']])



                      area                  team            area_team
0                   Boston  new england patriots   newenglandpatriots
1    Miami–Fort Lauderdale        miami dolphins        miamidolphins
2                  Buffalo         buffalo bills         buffalobills
3            New York City         new york jets          newyorkjets
4                Baltimore      baltimore ravens      baltimoreravens
5               Pittsburgh   pittsburgh steelers   pittsburghsteelers
6                Cleveland      cleveland browns      clevelandbrowns
7               Cincinnati    cincinnati bengals    cincinnatibengals
8                  Houston        houston texans        houstontexans
9             Indianapolis    indianapolis colts    indianapoliscolts
10               Nashville      tennessee titans      tennesseetitans
11            Jacksonville  jacksonville jaguars  jacksonvillejaguars
12             Kansas City    kansas city chiefs     kansascitychiefs
13             Los A

In [17]:
# Per-team figures: W / (W + L), alongside the area and its population.
wins = merge_teams['W'].astype(float)
losses = merge_teams['L'].astype(float)
per_team = pd.DataFrame(
    {
        'area': merge_teams['area'],
        'win_loss_ratio': wins / (wins + losses),
        'pop': merge_teams['pop'].astype(float),
    }
)

# One row per area: the mean ratio of its teams and the mean of the
# population values attached to them.
final_df = per_team.groupby('area')[['win_loss_ratio', 'pop']].mean()
print(final_df)

                        win_loss_ratio         pop
area                                              
Atlanta                       0.437500   5789700.0
Baltimore                     0.625000   2798886.0
Boston                        0.687500   4794447.0
Buffalo                       0.375000   1132804.0
Charlotte                     0.437500   2474314.0
Chicago                       0.750000   9512999.0
Cincinnati                    0.375000   2165139.0
Cleveland                     0.466667   2055612.0
Dallas–Fort Worth             0.625000   7233323.0
Denver                        0.375000   2853077.0
Detroit                       0.375000   4297617.0
Green Bay                     0.400000    318236.0
Houston                       0.687500   6772470.0
Indianapolis                  0.625000   2004230.0
Jacksonville                  0.312500   1478212.0
Kansas City                   0.750000   2104509.0
Los Angeles                   0.781250  13310447.0
Miami–Fort Lauderdale         0

In [18]:
# Pairwise Pearson correlation between the per-area win ratio and the population.
print(final_df.corr(method="pearson"))

                win_loss_ratio       pop
win_loss_ratio        1.000000  0.004922
pop                   0.004922  1.000000


In [20]:
# Save the per-area summary. The path is relative, like the input files read
# earlier in the notebook, so the notebook does not depend on one user's
# home directory layout.
final_df.to_csv("nfl_final.csv")