# College Sports Expenses, Revenue, and Coaching Staff

In [1]:
# Packages used
import getpass
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymysql as mysql

warnings.filterwarnings('ignore')

## Import CSV files

In [25]:
# Read the three raw CSV exports (expenses, revenue, coaching staff) from the
# working directory, in that order, into one DataFrame each.
expenses_df, revenue_df, coaching_staff_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Expenses.csv', 'Revenue.csv', 'Coaching_Staff.csv')
)

## Data Exploration and Pre-processing

In [14]:
# Preview the first three expense records
expenses_df.iloc[:3]

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Baseball Total Expenses,Basketball Total Expenses,All Track Combined Total Expenses,Football Total Expenses,Golf Total Expenses,Gymnastics Total Expenses,Soccer Total Expenses,Softball Total Expenses,Tennis Total Expenses,Volleyball Total Expenses,Grand Total Expenses
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,4946,222504.0,1092504,296168.0,1913709.0,60846.0,,407384.0,182237.0,119255.0,262891.0,6738484
1,2003,100724,100500,Alabama State University,AL,1772,2482,4254,152444.0,1127382,139102.0,1793923.0,101943.0,,,134663.0,120539.0,130397.0,4163344
2,2003,175342,239600,Alcorn State University,MS,952,1406,2358,132373.0,528424,321974.0,857105.0,81516.0,,73828.0,141603.0,89680.0,98879.0,2886975


In [15]:
# Preview the first three revenue records
revenue_df.iloc[:3]

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Baseball Total Revenue,Basketball Total Revenue,All Track Combined Total Revenue,Football Total Revenue,Golf Total Revenue,Gymnastics Total Revenue,Soccer Total Revenue,Softball Total Revenue,Tennis Total Revenue,Volleyball Total Revenue,Grand Total Revenue
0,2003,100654,100200,Alabama A & M University,AL,2362,2584,4946,222503.0,1230648,296167.0,2475828.0,60847.0,,409882.0,182238.0,119256.0,260491.0,8521205
1,2003,100724,100500,Alabama State University,AL,1772,2482,4254,136998.0,1035692,105654.0,2025850.0,118442.0,,,146215.0,100389.0,131968.0,3860658
2,2003,175342,239600,Alcorn State University,MS,952,1406,2358,4799.0,74917,11656.0,244346.0,2728.0,,2976.0,3968.0,3720.0,3224.0,2538138


In [16]:
# Preview the first three coaching-staff records
coaching_staff_df.iloc[:3]

Unnamed: 0,Survey Year,UNITID,OPE ID,Institution Name,State CD,Male Undergraduates,Female Undergraduates,Total Undergraduates,Men's Team Average Annual Institutional Salary per Head Coach,Men's Team Number of Head Coaches Included in Average,Women's Team Average Annual Institutional Salary per Head Coach,Women's Team Number of Head Coaches Included in Average
0,2003,100654,100200,Alabama A & M University,AL,1936,2856,4792,52248,7,43763,6
1,2003,100724,100500,Alabama State University,AL,1095,1947,3042,354473,5,21064,7
2,2003,175342,239600,Alcorn State University,MS,690,1327,2017,27395,6,21510,7


In [27]:
# Report the dtype of every column, one frame after another
for heading, frame in (
    ("Expenses:\n", expenses_df),
    ("\nRevenue:\n", revenue_df),
    ("\nCoaching Staff:\n", coaching_staff_df),
):
    print(heading, frame.dtypes)

Expenses:
 Survey Year                    int64
UNITID                         int64
Institution Name              object
State CD                      object
Male Undergraduates            int64
Female Undergraduates          int64
Basketball Total Expenses      int64
Football Total Expenses      float64
Soccer Total Expenses        float64
Grand Total Expenses           int64
dtype: object

Revenue:
 Survey Year                   int64
UNITID                        int64
Institution Name             object
State CD                     object
Male Undergraduates           int64
Female Undergraduates         int64
Basketball Total Revenue      int64
Football Total Revenue      float64
Soccer Total Revenue        float64
Grand Total Revenue           int64
dtype: object

Coaching Staff:
 Survey Year                                                          int64
UNITID                                                               int64
Institution Name                                    

In [30]:
# Count the NaN entries per column of each frame, then print them under a heading
expenses_nulls = expenses_df.isna().sum()
revenue_nulls = revenue_df.isna().sum()
coaching_staff_nulls = coaching_staff_df.isna().sum()

print("Expenses:\n", expenses_nulls)
print("\nRevenue:\n", revenue_nulls)
print("\nCoaching Staff:\n", coaching_staff_nulls)

Expenses:
 survey_year                    0
unit_id                        0
institution_name               0
state_cd                       0
male_undergraduates            0
female_undergraduates          0
basketball_total_expenses      0
football_total_expenses       63
soccer_total_expenses        236
grand_total_expenses           0
dtype: int64

Revenue:
 survey_year                   0
unit_id                       0
institution_name              0
state_cd                      0
male_undergraduates           0
female_undergraduates         0
basketball_total_revenue      0
football_total_revenue       63
soccer_total_revenue        243
grand_total_revenue           0
dtype: int64

Coaching Staff:
 survey_year                                                         0
unit_id                                                             0
institution_name                                                    0
state_cd                                                            0
male

In [32]:
# Remove rows with missing values.
# dropna(inplace=True) returns None, so assigning its result replaced every frame
# with None; assign the filtered copy that dropna() returns instead.
# NOTE(review): dropna() discards a row if ANY column is NaN. If this cell runs
# before the unused sport columns are removed (see "Transformations"), rows that
# merely lack e.g. gymnastics figures are lost too — confirm the intended cell order.
expenses_df = expenses_df.dropna()
revenue_df = revenue_df.dropna()
coaching_staff_df = coaching_staff_df.dropna()

In [31]:
# Number of fully duplicated rows in each frame
duplicate_headings = ("Expenses:\n", "\nRevenue:\n", "\nCoaching Staff:\n")
duplicate_frames = (expenses_df, revenue_df, coaching_staff_df)
for heading, frame in zip(duplicate_headings, duplicate_frames):
    print(heading, frame.duplicated().sum())

Expenses:
 0

Revenue:
 0

Coaching Staff:
 0


## Transformations

In [26]:
# Remove unnecessary columns: the OPE ID, the undergraduate total, and every
# per-sport column except basketball, football and soccer.
shared_drops = ['OPE ID', 'Total Undergraduates']
dropped_sports = ['Baseball', 'All Track Combined', 'Golf', 'Gymnastics', 'Softball', 'Tennis', 'Volleyball']

expenses_df = expenses_df.drop(
    columns=shared_drops + [f'{sport} Total Expenses' for sport in dropped_sports]
)
revenue_df = revenue_df.drop(
    columns=shared_drops + [f'{sport} Total Revenue' for sport in dropped_sports]
)
# The coaching-staff table has no per-sport columns, only the shared ones.
coaching_staff_df = coaching_staff_df.drop(columns=shared_drops)

In [28]:
# Rename UNITID for index clarity and shorten the long coaching-salary column names.
# The renamed copy is assigned back instead of using inplace=True: rename() returns
# None when inplace=True, which makes it easy to wipe out a frame by accident.
expenses_df = expenses_df.rename(columns={'UNITID': 'Unit_id'})
revenue_df = revenue_df.rename(columns={'UNITID': 'Unit_id'})
coaching_staff_df = coaching_staff_df.rename(
    columns={
        'UNITID': 'Unit_id',
        "Men's Team Average Annual Institutional Salary per Head Coach": 'Men_team_avg_coach_salary',
        "Men's Team Number of Head Coaches Included in Average": 'Men_team_coach_count',
        "Women's Team Average Annual Institutional Salary per Head Coach": 'Women_team_avg_coach_salary',
        "Women's Team Number of Head Coaches Included in Average": 'Women_team_coach_count',
    }
)

In [29]:
# Column name cleanup: lower-case every header and swap spaces for underscores
for frame in (expenses_df, revenue_df, coaching_staff_df):
    lowered = frame.columns.str.lower()
    frame.columns = lowered.str.replace(" ", "_")

In [None]:
# Convert data types

## Connect to MySQL Server

In [None]:
# Credentials are read from the environment (MYSQL_USER / MYSQL_PASSWORD) and fall
# back to an interactive prompt, so they are never stored in the notebook source
# or its saved outputs.
userName = os.environ.get('MYSQL_USER') or input('MySQL user: ')
userPass = os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: ')

# `password` / `database` replace the deprecated `passwd` / `db` aliases.
# NOTE(review): 'SAKILA' looks like a sample schema name rather than one created
# for this college-sports data — confirm the target database.
conn = mysql.connect(
    host='localhost',
    port=3306,
    user=userName,
    password=userPass,
    database='SAKILA',
)