# Data Analysis

This document describes the different methods and techniques we used in our analysis of our data.

## Main Hypothesis

As the number of players drafted to the NFL goes up the number of total fantasy point goes up for a college.

### Method

To test this we grouped the data by college and made a scatter plot that had the x axis as the total number of points and the 
y axis as number of players each college had from 2000-2016. We were able to prove that yes as the number of players draft to the NFL goes up, the more fantasy points that college had. Through this graphing method we were also able to plot a linear regression line and come up with a point per player.

In [None]:
import numpy as np
import pandas as pd
import os
import csv
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

In [6]:
fantasy_f = "actually_final_i_promise.csv"
df = pd.read_csv(fantasy_f)

#plotting graph without regression
result = {}
for each in df["college"].unique():
    result[each] = {}
    ball = df.loc[df["college"] == each]
    result[each]["playercount"] = len(ball["name"].unique())
    result[each]["average score"] = ball["total annual points"].sum()
result = pd.DataFrame(result).transpose()
result.rename(columns={'average score': 'total points'}, inplace=True)
result.loc[result["playercount"]>30]

x = result["average score"]
y = result["playercount"]
with plt.style.context("seaborn"):
    plt.figure(figsize=(8,8))
    plt.scatter(x, y, c='steelblue', edgecolor='white', s=70)
    plt.ylabel("Player College Count")
    plt.xlabel("Player Fantasy Point Total")
    plt.title("College Player Counts vs. Total Fantasy Points (2000-2017)")
    plt.savefig("FF Scatter.png")


NameError: name 'pd' is not defined

In [7]:
# plotting with regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
with plt.style.context("seaborn"):
    plt.figure(figsize=(8,8))
    plt.scatter(X, y, c='steelblue', edgecolor='white', s=70)
    plt.plot(X, model.predict(X), color='black', lw=2)
    plt.ylabel("Player College Count")
    plt.xlabel("Player Fantasy Point Total")
    plt.title("College Player Counts vs. Total Fantasy Points (2000-2017)")
    plt.savefig("FF Scatter Regression.png")

m = model.coef_[0]
print("slope=",m)

NameError: name 'X' is not defined

In [8]:
# slope= 0.0038366969217760714
# 1 player is equal to about 260 pts. over this timespan.

## Analysis of the Power 5 Conferences

In order to see which NCAA major conference contributed the most points, we took just the schools in the power 5 and sorted 
by conference. We then broke each conferences points down by position and created a stacked bar chart from the data. We were
able to tell that the Big 10 contributes the most fantasy points to the NFL, while the ACC was in close second. Unfortunately,
the Big 12 was the lowest out of the five.

In [None]:
switcher = json.load(open("power 5 schools.txt"))
# this cell reserved for testing
with open("actually_final_i_promise.csv") as inbox:
    DF = pandas.read_csv(inbox)

In [None]:
def setup(df, x, y, function):
    """Takes a dataframe, two column names from it and a lambda function to be applied to a loc of the dataframe to return an int. returns a dataframe set up per spec"""
    result = {}
    for each in df[x].unique():
        result[each] = {}
        for other in df[y].unique():
            ball = df.loc[df[x] == each]
            ball = ball.loc[ball[y] == other]
            result[each][other] = function(ball)
    return pandas.DataFrame(result)

# this cell reserved for testing
ball = DF
def throw(inbox):
    if inbox in switcher:
        return switcher[inbox]
    else:
        return "other"
ball["conference"] = ball["college"].map(throw)
ball = setup(ball, "conference", "Position", lambda x: x["total annual points"].sum()).transpose()
ball = ball.drop("other")


In [None]:
last = pandas.Series(index = ball.index, data = [0])
width = .35
ind = range(len(ball.index))
plt.figure(figsize = (22, 10))
for position in ball.columns:
    plt.bar(ind, list(ball[position]), bottom=last)
    last = ball[position].add(last)
plt.xticks(ind, ball.index)
plt.ylabel('Total Annual Points')
plt.title('Total Annual Points by Division and Position')
plt.style.use('seaborn')
plt.legend(ball.columns)
plt.savefig("stacked_bar.png")
plt.show()

ACC        63149.02
Big 10     71534.00
Big 12     31747.16
PAC12      53852.86
SEC        37352.08
other     213424.00

## Yearly Look at Total Fantasy Point Generation by Position

To check for anything that might be throwing off our average per year counts, we made a histogram of the total amount of points
generated by each position for the year.

In [10]:
with open("actually_final_i_promise.csv") as inbox:
    DF = pandas.read_csv(inbox)

NameError: name 'pandas' is not defined

In [None]:
def setup(df, x, y, function):
    """Takes a dataframe, two column names from it and a lambda function to be applied to a loc of the dataframe to return an int. returns a dataframe set up per spec"""
    result = {}
    for each in df[x].unique():
        result[each] = {}
        for other in df[y].unique():
            ball = df.loc[df[x] == each]
            ball = ball.loc[ball[y] == other]
            result[each][other] = function(ball)
    return pandas.DataFrame(result)

def pretty_picture(df, x_title, y_title, title):
    """Takes a dataframe of the format we're using for our results. sets it up as a matplotlib plot. returns True if successful"""
    colorlist = ["#e6194b", "#3cb44b", "#0082c8", "#f58231", "#911eb4", "#46f0f0", "#f032e6", "#ffe119", "#d2f53c", "#fabebe", "#000080", "#ffd8b1", "#808000", "#aaffc3", "#800000", "#fffac8", "#aa6e28", "#e6beff", "#008080"]
    markerlist = [".", "o", "v", "^", "<", ">", "1", "2", "3", "4", "8", "s" "p", "P", "*", ",", "h", "H", "+", "x", "X", "D", "d", "|", "_"]
    ball = []
    plt.figure(figsize = (22, 10))
    for each in range(len(df.columns)):
        ball.append(plt.errorbar(df.index, df[df.columns[each]], label = df.columns[each], color = colorlist[each], marker = markerlist[each]))
    plt.xticks(df.index)
    plt.legend(loc = "best")
    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.title(title)
    plt.grid(alpha = .25)
    
test_data = setup(DF, "Position", "Year", lambda x: x["total annual points"].sum())
pretty_picture(test_data, "Year", "Fantasy Point Totals", "Yearly Look at Total Fantasy Point Generation by Position")

fig1 = plt.gcf()
plt.show()
plt.draw()
fig1.savefig('Yearly Total Fantasy Point Generation by Position.png')

## Yearly Average Points by Position

Next we wanted to see by year which positions scored the highest amount of points on average each year. We used the following
code to do so:

In [None]:
test_data = setup(DF, "Position", "Year", lambda x: (x["total annual points"].mean()/x["Games_Played"].mean()))
pretty_picture(test_data, "Year", "Fantasy Points", "Yearly Average Points Per Position")

plt.style.use('seaborn')

fig1 = plt.gcf()
plt.show()
plt.draw()
fig1.savefig('Yearly Average Points Per Position.png')

## Top Points by College and Position

Finally we wanted to break down which college had the most post for each of our major positions. We used the code below to 
generate charts for each type: 

In [None]:
CSV = "actually_final_i_promise.csv"

with open(CSV, "r") as inbox:
    # DATA is the original data
    df = pandas.read_csv(inbox)
    
df = df.dropna(axis=0, subset=college)

result = {}
for each in df["college"].unique():
    result[each] = {}
    ball = df.loc[df["college"] == each]
    result[each]["playercount"] = len(ball["name"].unique())
    result[each]["average score"] = ball["total annual points"].sum() / result[each]["playercount"]
result = pandas.DataFrame(result).transpose()
df['college'] = (df['college'].astype(str).str.lower())

qb_df = pandas.DataFrame(df[df['Position'] == 'QB'])
rb_df = pandas.DataFrame(df[df['Position'] == 'RB'])
te_df = pandas.DataFrame(df[df['Position'] == 'TE'])
wr_df = pandas.DataFrame(df[df['Position'] == 'WR'])
fb_df = pandas.DataFrame(df[df['Position'] == 'FB'])

qb = pandas.DataFrame(qb_df.groupby(['college'])['total annual points'].sum().sort_values(0, ascending=False).nlargest(5))
rb = pandas.DataFrame(rb_df.groupby(['college'])['total annual points'].sum().sort_values(0, ascending=False).nlargest(5))
wr = pandas.DataFrame(wr_df.groupby(['college'])['total annual points'].sum().sort_values(0, ascending=False).nlargest(5))
te = pandas.DataFrame(te_df.groupby(['college'])['total annual points'].sum().sort_values(0, ascending=False).nlargest(5))
fb = pandas.DataFrame(fb_df.groupby(['college'])['total annual points'].sum().sort_values(0, ascending=False).nlargest(5))

In [None]:
# QB
plt.style.use('seaborn')
plt.bar(qb.index, qb['total annual points'], color='red')
plt.title('Top Fantasy Points by College:  QB')
plt.xlabel('College')
plt.ylabel('Total points from 2000-2016')
plt.ylim(0,6000)
plt.savefig('top points by college-position QB.png')
plt.show()

In [None]:
# RB
plt.bar(rb.index, rb['total annual points'], color='blue')
plt.title('Top Fantasy Points by College: Running Backs')
plt.xlabel('College')
plt.ylabel('Total points from 2000-2016')
plt.ylim(0,6000)
plt.savefig('top points by college-position RB.png')
plt.show()

In [None]:
# TE
plt.bar(te.index, te['total annual points'], color='green')
plt.title('Top Fantasy Points by College: Tight End')
plt.xlabel('College')
plt.ylabel('Total points from 2000-2016')
plt.ylim(0,6000)
plt.savefig('top points by college-position TE.png')
plt.show()

In [None]:
# WR
plt.bar(wr.index, wr['total annual points'], color='orange')
plt.title('Top Fantasy Points by College: Wide Receiver')
plt.xlabel('College')
plt.ylabel('Total points from 2000-2016')
plt.ylim(0,6000)
plt.savefig('top points by college-position WR.png')
plt.show()

In [None]:
# FB
plt.bar(fb.index, fb['total annual points'], color='purple')
plt.title('Top Fantasy Points by College: Full Back')
plt.xlabel('College')
plt.ylabel('Total points from 2000-2016')
plt.ylim(0,6000)
plt.savefig('top points by college-position FB.png')
plt.show()