# College Sports Expenses, Revenue, and Coaching Staff

In [1]:
# Packages used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymysql as mysql
import sqlalchemy 
from prometheus_client import start_http_server, Gauge
import time
import logging

import warnings
warnings.filterwarnings('ignore')

## Import datasets

In [25]:
expenses_df = pd.read_csv('Expenses.csv')
revenue_df = pd.read_csv('Revenue.csv')
coaching_staff_df = pd.read_csv('Coaching_Staff.csv')

## Data Exploration and Pre-processing

In [26]:
expenses_df.head(3)

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Baseball Total Expenses,Basketball Total Expenses,All Track Combined Total Expenses,Football Total Expenses,Golf Total Expenses,Gymnastics Total Expenses,Soccer Total Expenses,Softball Total Expenses,Tennis Total Expenses,Volleyball Total Expenses,Grand Total Expenses
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,4946,222504.0,1092504,296168.0,1913709.0,60846.0,,407384.0,182237.0,119255.0,262891.0,6738484
1,2003,100724,100500,Alabama State University,AL,1772,2482,4254,152444.0,1127382,139102.0,1793923.0,101943.0,,,134663.0,120539.0,130397.0,4163344
2,2003,175342,239600,Alcorn State University,MS,952,1406,2358,132373.0,528424,321974.0,857105.0,81516.0,,73828.0,141603.0,89680.0,98879.0,2886975


In [27]:
revenue_df.head(3)

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Baseball Total Revenue,Basketball Total Revenue,All Track Combined Total Revenue,Football Total Revenue,Golf Total Revenue,Gymnastics Total Revenue,Soccer Total Revenue,Softball Total Revenue,Tennis Total Revenue,Volleyball Total Revenue,Grand Total Revenue
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,4946,222503.0,1230648,296167.0,2475828.0,60847.0,,409882.0,182238.0,119256.0,260491.0,8521205
1,2003,100724,100500,Alabama State University,AL,1772,2482,4254,136998.0,1035692,105654.0,2025850.0,118442.0,,,146215.0,100389.0,131968.0,3860658
2,2003,175342,239600,Alcorn State University,MS,952,1406,2358,4799.0,74917,11656.0,244346.0,2728.0,,2976.0,3968.0,3720.0,3224.0,2538138


In [28]:
coaching_staff_df.head(3)

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Men's Team Average Annual Institutional Salary per Head Coach,Men's Team Number of Head Coaches Included in Average,Women's Team Average Annual Institutional Salary per Head Coach,Women's Team Number of Head Coaches Included in Average
0,2003,100654,100200,Alabama A & M University,AL,1936,2856,4792,52248,7,43763,6
1,2003,100724,100500,Alabama State University,AL,1095,1947,3042,354473,5,21064,7
2,2003,175342,239600,Alcorn State University,MS,690,1327,2017,27395,6,21510,7


In [29]:
# Remove unneccesary columns
expenses_df = expenses_df.drop(['Total Undergraduates', 'Baseball Total Expenses', 'All Track Combined Total Expenses', 'Golf Total Expenses', 'Gymnastics Total Expenses', 'Softball Total Expenses', 'Tennis Total Expenses', 'Volleyball Total Expenses'], axis=1)
revenue_df = revenue_df.drop(['Total Undergraduates', 'Baseball Total Revenue', 'All Track Combined Total Revenue', 'Golf Total Revenue', 'Gymnastics Total Revenue', 'Softball Total Revenue', 'Tennis Total Revenue', 'Volleyball Total Revenue'], axis=1)
coaching_staff_df = coaching_staff_df.drop(['Total Undergraduates'], axis=1)

In [30]:
# Column data types
print("Expenses:\n", expenses_df.dtypes)
print("\nRevenue:\n", revenue_df.dtypes)
print("\nCoaching Staff:\n", coaching_staff_df.dtypes)

Expenses:
 Survey Year                    int64
UNITID                         int64
OPE ID                         int64
Institution Name              object
State CD                      object
Male Undergraduates            int64
Female Undergraduates          int64
Basketball Total Expenses      int64
Football Total Expenses      float64
Soccer Total Expenses        float64
Grand Total Expenses           int64
dtype: object

Revenue:
 Survey Year                   int64
UNITID                        int64
OPE ID                        int64
Institution Name             object
State CD                     object
Male Undergraduates           int64
Female Undergraduates         int64
Basketball Total Revenue      int64
Football Total Revenue      float64
Soccer Total Revenue        float64
Grand Total Revenue           int64
dtype: object

Coaching Staff:
 Survey Year                                                          int64
UNITID                                                

In [31]:
# Missing values
print("Expenses:\n", expenses_df.isnull().sum())
print("\nRevenue:\n", revenue_df.isnull().sum())
print("\nCoaching Staff:\n", coaching_staff_df.isnull().sum())

Expenses:
 Survey Year                    0
UNITID                         0
OPE ID                         0
Institution Name               0
State CD                       0
Male Undergraduates            0
Female Undergraduates          0
Basketball Total Expenses      0
Football Total Expenses       63
Soccer Total Expenses        236
Grand Total Expenses           0
dtype: int64

Revenue:
 Survey Year                   0
UNITID                        0
OPE ID                        0
Institution Name              0
State CD                      0
Male Undergraduates           0
Female Undergraduates         0
Basketball Total Revenue      0
Football Total Revenue       63
Soccer Total Revenue        243
Grand Total Revenue           0
dtype: int64

Coaching Staff:
 Survey Year                                                         0
UNITID                                                              0
OPE ID                                                              0
Instituti

In [32]:
# Check for duplicate rows
print("Expenses:\n", expenses_df.duplicated().sum())
print("\nRevenue:\n", revenue_df.duplicated().sum())
print("\nCoaching Staff:\n", coaching_staff_df.duplicated().sum())

Expenses:
 0

Revenue:
 0

Coaching Staff:
 0


## Transformations

In [33]:
# Rename columns for index clarity and readibility
expenses_df.rename(columns={'UNITID': 'unit id', 'State CD': 'state'}, inplace=True)
revenue_df.rename(columns={'UNITID': 'unit id', 'State CD': 'state'}, inplace=True)
coaching_staff_df.rename(columns={"UNITID": "unit id",
                                  "State CD": "state",
                                  "Men's Team Average Annual Institutional Salary per Head Coach": "m team avg coach salary", 
                                  "Men's Team Number of Head Coaches Included in Average": "m team coach count",
                                  " Women's Team Average Annual Institutional Salary per Head Coach": "w team avg coach salary",
                                  "Women's Team Number of Head Coaches Included in Average": "w team coach count"},
                                  inplace=True)

In [34]:
# Convert data types
# expenses_df['Institution Name'] = expenses_df['Institution Name'].astype(str)
# expenses_df['State CD'] = expenses_df['State CD'].astype(str)
# revenue_df['Institution Name'] = revenue_df['Institution Name'].astype(str)
# revenue_df['State CD'] = revenue_df['State CD'].astype(str)
# coaching_staff_df['Institution Name'] = coaching_staff_df['Institution Name'].astype(str)

In [35]:
# Column name cleanup
expenses_df.columns = expenses_df.columns.str.lower().str.replace(" ", "_")
revenue_df.columns = revenue_df.columns.str.lower().str.replace(" ", "_")
coaching_staff_df.columns = coaching_staff_df.columns.str.lower().str.replace(" ", "_")

In [36]:
# Fill missing values rows
expenses_df.fillna(0, inplace=True)
revenue_df.fillna(0, inplace=True)

In [37]:
# Join Expenses and Revenue tables to show net profit
net_profit_df = pd.merge(revenue_df, expenses_df, on=['unit_id', 'institution_name'], how='inner')
net_profit_df = pd.merge(net_profit_df, coaching_staff_df, on=['unit_id', 'institution_name'], how='left')

# Compute profit (revenue - expenses)
net_profit_df['profit'] = net_profit_df[['basketball_total_revenue', 'football_total_revenue', 'soccer_total_revenue']].sum(axis=1) - net_profit_df[['basketball_total_expenses', 'football_total_expenses', 'soccer_total_expenses']].sum(axis=1)

# Compute total coaching salaries
net_profit_df['total_coach_salary'] = net_profit_df['m_team_avg_coach_salary'] + net_profit_df['w_team_avg_coach_salary']

# Compute net profit accounting for coaches' salaries
net_profit_df['profit_coaching'] = net_profit_df['profit'] - net_profit_df['total_coach_salary']

## Net profit table

In [38]:
net_profit_df.head(3)

Unnamed: 0,survey_year_x,unit_id,ope_id_x,institution_name,state_x,male_undergraduates_x,female_undergraduates_x,basketball_total_revenue,football_total_revenue,soccer_total_revenue,...,state,male_undergraduates,female_undergraduates,m_team_avg_coach_salary,m_team_coach_count,w_team_avg_coach_salary,w_team_coach_count,profit,total_coach_salary,profit_coaching
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,1230648,2475828.0,409882.0,...,AL,1936.0,2856.0,52248.0,7.0,43763.0,6.0,702761.0,96011.0,606750.0
1,2003,100654,100200,Alabama A & M University,AL,2362,2584,1230648,2475828.0,409882.0,...,AL,1936.0,2856.0,55382.0,9.0,46388.0,10.0,702761.0,101770.0,600991.0
2,2003,100654,100200,Alabama A & M University,AL,2362,2584,1230648,2475828.0,409882.0,...,AL,1936.0,2856.0,45914.0,7.0,31047.0,8.0,702761.0,76961.0,625800.0


## View cleaned datasets

In [39]:
expenses_df.head(3)

Unnamed: 0,survey_year,unit_id,ope_id,institution_name,state,male_undergraduates,female_undergraduates,basketball_total_expenses,football_total_expenses,soccer_total_expenses,grand_total_expenses
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,1092504,1913709.0,407384.0,6738484
1,2003,100724,100500,Alabama State University,AL,1772,2482,1127382,1793923.0,0.0,4163344
2,2003,175342,239600,Alcorn State University,MS,952,1406,528424,857105.0,73828.0,2886975


In [40]:
revenue_df.head(3)

Unnamed: 0,survey_year,unit_id,ope_id,institution_name,state,male_undergraduates,female_undergraduates,basketball_total_revenue,football_total_revenue,soccer_total_revenue,grand_total_revenue
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,1230648,2475828.0,409882.0,8521205
1,2003,100724,100500,Alabama State University,AL,1772,2482,1035692,2025850.0,0.0,3860658
2,2003,175342,239600,Alcorn State University,MS,952,1406,74917,244346.0,2976.0,2538138


In [41]:
coaching_staff_df.head(3)

Unnamed: 0,survey_year,unit_id,ope_id,institution_name,state,male_undergraduates,female_undergraduates,m_team_avg_coach_salary,m_team_coach_count,w_team_avg_coach_salary,w_team_coach_count
0,2003,100654,100200,Alabama A & M University,AL,1936,2856,52248,7,43763,6
1,2003,100724,100500,Alabama State University,AL,1095,1947,354473,5,21064,7
2,2003,175342,239600,Alcorn State University,MS,690,1327,27395,6,21510,7


## Connect to MySQL Server

In [42]:
# Login to MySQL
userName = 'YourUsername'
userPass = 'YourPassword'
conn = mysql.Connect(
    host = 'localhost', 
    port = int(3306), 
    user = userName, 
    passwd = userPass, 
)

OperationalError: (1045, "Access denied for user 'YourUsername'@'localhost' (using password: YES)")

### Create Sports Database and tables

In [43]:
conn.cursor().execute("CREATE DATABASE IF NOT EXISTS ADS_507_sports")

# Open to Sports database
conn.select_db("ADS_507_sports")

NameError: name 'conn' is not defined

In [225]:
# Create expenses table in database
create_expenses_table = """
    CREATE TABLE expenses (
        survey_year SMALLINT, 
        unit_id SMALLINT, 
        ope_id SMALLINT, 
        institution_name VARCHAR(100), 
        state CHAR(2), 
        male_undergraduates SMALLINT, 
        female_undergraduates SMALLINT, 
        basketball_total_expenses INT, 
        football_total_expenses INT, 
        soccer_total_expenses INT, 
        grand_total_expenses INT, 
        PRIMARY KEY (unit_id, ope_id), 
        INDEX (unit_id), 
        INDEX (ope_id)
    )
    """
conn.cursor().execute(create_expenses_table)   

0

In [228]:
# Create revenue table in database
create_revenue_table = """
    CREATE TABLE revenue (
        survey_year SMALLINT, 
        unit_id SMALLINT, 
        ope_id SMALLINT, 
        institution_name VARCHAR(100), 
        state CHAR(2), 
        male_undergraduates SMALLINT, 
        female_undergraduates SMALLINT, 
        basketball_total_revenue INT, 
        football_total_revenue INT, 
        soccer_total_revenue INT, 
        grand_total_revenue INT, 
        PRIMARY KEY (unit_id, ope_id),
        FOREIGN KEY (unit_id, ope_id) REFERENCES expenses(unit_id, ope_id), 
        INDEX (institution_name)
    )
    """
conn.cursor().execute(create_revenue_table)

0

In [229]:
# Create coaching staff table in database
create_coaching_staff_table = """
    CREATE TABLE coaching_staff (
        survey_year SMALLINT, 
        unit_id SMALLINT, 
        ope_id SMALLINT, 
        institution_name VARCHAR(100), 
        state CHAR(2),
        male_undergraduates SMALLINT, 
        female_undergraduates SMALLINT, 
        m_team_avg_coach_salary INT, 
        m_team_coach_count SMALLINT, 
        w_team_avg_coach_salary INT, 
        w_team_coach_count SMALLINT, 
        PRIMARY KEY (unit_id, ope_id), 
        FOREIGN KEY (unit_id, ope_id) REFERENCES expenses(unit_id, ope_id)
    )
    """
conn.cursor().execute(create_coaching_staff_table)

0

In [None]:
# Create table for net profit for each college and sport

### Load dataframes into MySQL tables