# Phase 2 - Summary Statistics and Visualizations

Since we have concluded using the Dark Sky API in Part 1 and we don't want to rerun that script,
we saved the results into the football_weather.csv file. 

In phase 2, we are creating our summary statistics and visualizations.   Our project goals were as follows.

Goals: Create a record of each unique team in the "database.sqlite" dataset including:

- the name of the team
- total goals scored during 2011 season
- total number of wins earned in 2011 season
- visualization of the w/l for 2011
- win % in rain for 2011 season

In [1]:
# Import our libraries and read in our data file.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pymongo
%matplotlib inline

df = pd.read_csv("football_weather.csv")

In [2]:
# Clean up.

# drop columns that are not useful
matches = df.drop(["Match_ID","Div","Season", "Team", "City", "Stadium", "Capacity", "new", "FDCOUK", "Latitude", "Longitude", "Country", "unixtime"], axis = 1)

# we had 36 nan's in the rain lookup, we could either drop it, but it seems the majority of our contest are not rain
# I'm making the decision to fill those with 0's so we have the complete schedule intact.

matches["rain"].fillna(0, inplace = True) 

#sanity check on the data.
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 7 columns):
Date        992 non-null object
HomeTeam    992 non-null object
AwayTeam    992 non-null object
FTHG        992 non-null int64
FTAG        992 non-null int64
FTR         992 non-null object
rain        992 non-null float64
dtypes: float64(1), int64(2), object(4)
memory usage: 38.8+ KB


In [3]:
# We are sorting our resulating DF by date so our visuals will be chronological.
matches = matches.sort_values(by='Date')

In [4]:
# Taking a team by team approach to fill our data, means creating a team-based dataframe.   

# The df should be the entire set of matches in which the team is involved as either the Home or Away teams.
# we used the .copy() method to make sure we didn't slice a bit of our code, but rather created a df for that team.

# this static representation of our team-based dataframe helps us develop the code to systematically create them in our class.

Nurnberg = matches[ matches['HomeTeam'].str.match("Nurnberg") | matches['AwayTeam'].str.match("Nurnberg") ].copy()
Nurnberg.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,rain
200,2011-08-06,Hertha,Nurnberg,0,1,A,0.5
279,2011-08-13,Nurnberg,Hannover,1,2,A,1.0
203,2011-08-20,Dortmund,Nurnberg,2,0,H,0.0
527,2011-08-27,Nurnberg,Augsburg,1,0,H,1.0
198,2011-09-11,FC Koln,Nurnberg,1,2,A,1.0


In [5]:
# the purpose of this class is to calculate team-based statistics required to be uploaded to our mongoDB.

class team():
    
    def __init__ (self, name, df):
        
        self.name = name
        self.df = df

        # calculating goal differential needed us to adjust for when they were home vs away.
        df["gd"] = df.apply(lambda x: (x["FTHG"] - x["FTAG"]) 
            if (x['HomeTeam'] == name) else (x["FTAG"] - x["FTHG"]), axis=1)

        self.games = team.getGamesPlayed(df, name)
        self.wins = team.getWins(df, name)
        self.goals = team.getGoals(df, name)
        self.rain = team.getRain(df, name)
    
    # return the number of games played
    def getGamesPlayed(df, name):
        return len(df)
    
    # return total wins in the season
    def getWins(df, name):
        temp = df.apply(lambda df: True if (
            ((df["HomeTeam"] == name) & (df["gd"] > 0)) |
            ((df["AwayTeam"] == name) & (df["gd"] < 0))) else False, axis =1)
        return len(temp[temp == True].index)
    
    # return win% in rain in form of a decimal, with precipation percentage ("Rain") being over 0.5
    # 
    # the sum(temp) was an ingenious way to get the count of wins meeting our rain condition.
    
    def getRain(df, name):
        temp = df.apply(lambda df: True if ( 
            (df["HomeTeam"] == name) & (df["gd"] > 0) & (df["rain"] >= 0.5) |
            (df["AwayTeam"] == name) & (df["gd"] > 0) & (df["rain"] >= 0.5)
            )
            else False, axis =1)
        return (sum(temp) / len(df[df["rain"] >= 0.5]))
 
    # return total goals scored in the season
    def getGoals(df, name):
        hg = df[df["HomeTeam"]==name]["FTHG"].sum()
        ag = df[df["AwayTeam"]==name]["FTAG"].sum()
        return hg+ag
    
    # return a histogram of goal differentials experienced during the season.
    def getPlot(self, df, name):
    # create a new figure
        fig = plt.hist(df.gd, bins=(df.gd.max() - df.gd.min()), 
            histtype='stepfilled', color='steelblue', edgecolor='black')
        plt.xlabel('Goals')
        plt.ylabel('Count')
        plt.title(name + ' Goal Differential')
        plt.close()
        return fig
    
    # return a time-series plot of goal differential per game.
    def getGdPlot(self,df,name):
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(Nurnberg.Date, Nurnberg.gd, align='center', alpha=0.5)
        plt.xticks(Nurnberg.Date, rotation = "vertical")
        plt.ylabel('Goal Differential')
        plt.title("Goal Differential Time Series of " + name + ' Through the 2011 Season')
        plt.close()
        return fig

# First, a static run to make sure everything works.
we will continue to use Nurnberg from above.

In [6]:
# Instantiate the Nurnberg class by passing in String TeamName and DF team_df
N = team("Nurnberg", Nurnberg)

In [7]:
# checking if everything is still kosher
N.df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,rain,gd
200,2011-08-06,Hertha,Nurnberg,0,1,A,0.5,1
279,2011-08-13,Nurnberg,Hannover,1,2,A,1.0,-1
203,2011-08-20,Dortmund,Nurnberg,2,0,H,0.0,-2
527,2011-08-27,Nurnberg,Augsburg,1,0,H,1.0,1
198,2011-09-11,FC Koln,Nurnberg,1,2,A,1.0,1


In [8]:
type(N)

__main__.team

In [9]:
# Let's take a look at the plots we're generating.  The first is a histogram to satisfy the requirements of the project.
# The second is much more interesting, since it's depicting the time series of goal differential through the entire season.
team_plot = N.getPlot(N.df, N.name)
team_gd = N.getGdPlot(N.df, N.name)

# Phase 3 - Upload to MongoDB

Our static test look good, so lets spin up our MongoDB server and start loading our statistics.

In [10]:
# creating the server
team_db = "football_team_db"
mongo_server = "mongodb://localhost:27017"

In [11]:
# instantiating our db
myclient = pymongo.MongoClient(mongo_server)
mydb = myclient[team_db]

In [12]:
# creating our football collection
fb_collection = mydb['collection']
type(fb_collection)

pymongo.collection.Collection

# Uploading our summary statistics.

In [13]:
# get list of teams,
# iterate through each
# insert summary statistics for each team

teamNames = df["HomeTeam"].unique()
for i in teamNames:
    j = matches[ matches['HomeTeam'].str.match(i) | matches['AwayTeam'].str.match(i) ].copy()
    k = team(i, j)
    fb_collection.insert_one({"name": k.name, "goals": int(k.goals), "wins": int(k.wins), 
        "win% (rain)": round(k.rain, 2), "plot": list(k.getPlot(k.df, k.name)[0])})


In [14]:
# verify entries
[i for i in fb_collection.find()]

[{'_id': ObjectId('5df121f22a9079d0f7b50e80'),
  'name': 'Nurnberg',
  'goals': 38,
  'wins': 15,
  'win% (rain)': 0.5,
  'plot': [2.0, 1.0, 5.0, 8.0, 6.0, 8.0, 4.0]},
 {'_id': ObjectId('5df121f22a9079d0f7b50e81'),
  'name': 'Stuttgart',
  'goals': 63,
  'wins': 17,
  'win% (rain)': 0.22,
  'plot': [1.0, 5.0, 5.0, 8.0, 5.0, 3.0, 5.0, 2.0]},
 {'_id': ObjectId('5df121f22a9079d0f7b50e82'),
  'name': 'Wolfsburg',
  'goals': 47,
  'wins': 21,
  'win% (rain)': 0.38,
  'plot': [2.0, 3.0, 6.0, 5.0, 5.0, 8.0, 5.0]},
 {'_id': ObjectId('5df121f22a9079d0f7b50e83'),
  'name': 'Mainz',
  'goals': 47,
  'wins': 13,
  'win% (rain)': 0.27,
  'plot': [1.0, 2.0, 4.0, 6.0, 12.0, 3.0, 3.0, 3.0]},
 {'_id': ObjectId('5df121f22a9079d0f7b50e84'),
  'name': 'Freiburg',
  'goals': 45,
  'wins': 15,
  'win% (rain)': 0.22,
  'plot': [1.0, 0.0, 0.0, 2.0, 2.0, 3.0, 6.0, 10.0, 5.0, 5.0]},
 {'_id': ObjectId('5df121f22a9079d0f7b50e85'),
  'name': "M'gladbach",
  'goals': 49,
  'wins': 16,
  'win% (rain)': 0.46,
  'plot

In [15]:
# uncomment if we need to delete our collection for clean results.
# fb_collection.delete_many({})