# INSY 6800 - EDA Project 

Project by Richie Wilbanks and Carly Walker

Project Description: Display EDA skills learned in INSY 6800 course by using a real world data set. The data set explored displays statistics for D1 NCAA Men's College Basketball teams ranging from 2010 to 2025. 

## Initial Data Setup 

In this section the data is imported from github, cleaned, and formatted into a dataframe so that the team can use the data for exploration. 

In [None]:
!pip install openpyxl
# Install openpyxl so pandas can read the .xlsx workbooks

In [None]:
#import pandas to handle DataFrames and Path for importing dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# The season workbooks live in ./sports_data, relative to the working directory
Project_Root = Path(".")
Data_Dir = Project_Root.joinpath("sports_data")

# Collect every .xls / .xlsx style file in that folder, in sorted path order
excel_files = sorted(Data_Dir.glob("*.xls*"))

In [None]:
#Use function to load excel data for all seasons
def clean_season(path, season_name):
    """Load one season workbook and return a tidy per-team DataFrame.

    Parameters
    ----------
    path : pathlib.Path
        Season file. ``.xls`` files are parsed with ``pd.read_html`` (first
        table), ``.xlsx`` files with ``pd.read_excel`` and openpyxl. Both are
        read with a two-row header.
    season_name : str
        Label stored in the ``Season`` column of the result and used in the
        missing-column message.

    Returns
    -------
    pandas.DataFrame
        The wanted stat columns that exist in the file (numeric, with
        unparseable values coerced to NaN), the cleaned ``School`` column and
        a ``Season`` column.

    Raises
    ------
    ValueError
        If the file extension is neither ``.xls`` nor ``.xlsx``.
    """
    suffix = path.suffix.lower()

    # Reject unsupported extensions before attempting to read anything
    if suffix not in (".xls", ".xlsx"):
        raise ValueError(f"Unsupported file type: {suffix}")

    if suffix == ".xls":
        raw = pd.read_html(path, header=[0, 1])[0]
    else:
        raw = pd.read_excel(path, header=[0, 1], engine="openpyxl")

    # Readable names for the two "Points" sub-columns
    points_names = {"Tm.": "PTS", "Opp.": "Opp PTS"}

    def flatten(col):
        """Collapse one (top, sub) header pair into a single column name."""
        if isinstance(col, tuple):
            top, sub = str(col[0]), str(col[1])
        else:
            top, sub = "", str(col)

        # The school column sits under an auto-generated "Unnamed..." header
        if top.startswith("Unnamed") and sub == "School":
            return "School"
        # "Overall" and "Totals" groups: the sub header alone is the stat name
        if top in ("Overall", "Totals"):
            return sub
        if top == "Points" and sub in points_names:
            return points_names[sub]
        # Everything else (Conf., Home, Away, ...) gets a combined name that
        # cannot collide with the wanted columns, so it is ignored later
        return f"{top}_{sub}".strip()

    raw.columns = [flatten(col) for col in raw.columns]

    # Discard columns that contain no data at all
    raw = raw.dropna(axis=1, how="all")

    # Stats carried forward into the analysis
    keep_cols = [
        "School",
        "G", "W", "L", "W-L%", "SRS", "SOS",
        "PTS", "Opp PTS",
        "FG", "FGA", "FG%",
        "3P", "3PA", "3P%",
        "FT", "FTA", "FT%",
        "TRB", "AST", "STL", "BLK", "TOV"
    ]

    present = []
    absent = []
    for wanted in keep_cols:
        (present if wanted in raw.columns else absent).append(wanted)

    # Report (but tolerate) wanted columns this season's file does not have
    if absent:
        print(f"For {season_name}, missing columns (skipped): {absent}")

    df = raw[present].copy()

    # Keep only the first occurrence of any repeated column name
    df = df.loc[:, ~df.columns.duplicated()]

    if "School" in df.columns:
        # Strip a trailing "NCAA" marker and any parenthetical text.
        # NOTE(review): removing parentheticals makes names such as
        # "Miami (FL)" and "Miami (OH)" identical - confirm this is intended.
        schools = df["School"].astype(str)
        schools = schools.str.replace(r"\s*NCAA$", "", regex=True)
        schools = schools.str.replace(r"\s*\(.*?\)", "", regex=True)
        df["School"] = schools.str.strip()

    # Every column except School becomes numeric; bad values turn into NaN
    for col in df.columns:
        if col == "School":
            continue
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Tag each row with the season it came from
    df["Season"] = season_name

    return df

In [None]:
# Map each season label (ex: 2010-2011) to the cleaned DataFrame for that season
season_dfs = {}

for workbook in excel_files:
    # The season label is whatever follows the last underscore in the file name
    season_name = workbook.stem.rsplit("_", 1)[-1]  # ex: 2024-2025

    df_season = clean_season(workbook, season_name)
    season_dfs[season_name] = df_season

    # Confirm the season was loaded
    print("Loaded season:", season_name, "Shape:", df_season.shape)


In [None]:
# Stack every cleaned season into one frame, drop any row with a missing
# value in any column, and renumber the index from zero.
# NOTE(review): a season whose file lacked one of the kept columns is NaN in
# that column after the concat, so all of its rows are dropped here - confirm
# that is intended.
df_all = (
    pd.concat(season_dfs.values(), ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

## Question 1: Which team statistics have the strongest relationship with winning percentage? 

This question hopes to expose which factors have the strongest correlation with winning. For example, are teams with higher blocking stats winning more games?

In [None]:
# Columns that restate the win/loss record itself are left out of the comparison
exclude = ["W", "L", "W-L%"]

# Numeric stat columns only, minus the excluded ones
stat_frame = df_all.select_dtypes(include="number").drop(columns=exclude, errors="ignore")
numeric_cols = list(stat_frame.columns)

# Correlation of every remaining stat with win percentage, sorted ascending
correlation_data = stat_frame.corrwith(df_all["W-L%"]).sort_values()

In [None]:
# Horizontal bars: one per statistic, bar length = correlation with win percentage
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    x=correlation_data.values,
    y=correlation_data.index,
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Which Team Statistics Correlate Most With Win Percentage?")
ax.set_xlabel("Correlation with Win %")
ax.set_ylabel("Statistic")
plt.show()

In [None]:
# Hand-picked stats from the two ends of the correlation ranking (three from each end)
key_stats = ["SRS", "FG%", "PTS", "TOV", "Opp PTS", "FT%"]

# 2 x 3 grid: one panel per stat, each with the raw points and a red regression line
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for ax, stat in zip(axes.flat, key_stats):
    sns.scatterplot(data=df_all, x=stat, y="W-L%", alpha=0.4, ax=ax)
    sns.regplot(data=df_all, x=stat, y="W-L%", scatter=False, color="red", ax=ax)
    ax.set_title(f"{stat} vs Win %")

plt.tight_layout()
plt.show()

### Takeaways:

To answer the question, "Which statistics best explain why certain teams win more games?", I first computed the correlation between each team stat and the team's win percentage. The first iteration of my graph included W, L, and W-L%, but these columns directly encode wins and losses, so I removed them from the analysis. This left the remaining numerical statistics, which showed both positive and negative associations with winning. For example, turnovers correlated negatively with win percentage, while SRS (Simple Rating System), FG%, and total points scored all correlated positively. This shows that both offensive and defensive performance play a large role in helping a team succeed.


Next, I graphed six of the key metrics: the three metrics at the bottom and the three at the top of the correlation list, to show how the strongest positive and negative correlations appear across all teams and seasons. The six scatterplots above show that higher SRS, FG%, and points scored all align with stronger win percentages, while turnovers and opponent points trend downward. This can help coaches and players align on their team's vision and find specific areas to focus on when preparing and playing.

## Question 2: Are statistics shifting over time?

The goal of this question is to see if any of the statistics are shifting from season to season. For example, we can try to see if teams are shooting more 3 pointers, or if overall scoring is increasing or decreasing. 

In [None]:
# One row per season holding the mean of every numeric column
season_trends = df_all.groupby("Season", as_index=False).mean(numeric_only=True)

In [None]:
# Statistics drawn as one line each
stats_to_plot = ["PTS", "Opp PTS", "3PA", "3P", "TOV", "FGA", "FG"]

fig, ax = plt.subplots(figsize=(14, 8))

# Season-by-season average of each chosen statistic, all on the same axes
for stat in stats_to_plot:
    sns.lineplot(data=season_trends, x="Season", y=stat, label=stat, ax=ax)

plt.xticks(rotation=45)
ax.set_title("How Team Statistics Have Shifted Over Time (2010–2025)")
ax.set_ylabel("League Average")
ax.set_xlabel("Season")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

After seeing this plot, it becomes clear that during the 2020-2021 season the averages were greatly affected by Covid-19 and the modified college basketball season. To see the averages without this dip, I created the graph below, which ignores the 2020-2021 season.

In [None]:
# Same per-season averages with the 2020-2021 row filtered out
season_trends_no_covid = season_trends.loc[season_trends["Season"].ne("2020-2021")]

In [None]:
# Same statistics as the previous trend plot, drawn from the averages that
# leave out the 2020-2021 season
stats_to_plot = ["PTS", "Opp PTS", "3PA", "3P", "TOV", "FGA", "FG"]

plt.figure(figsize=(14, 8))

for stat in stats_to_plot:
    sns.lineplot(data=season_trends_no_covid, x="Season", y=stat, label=stat)

plt.xticks(rotation=45)
# The title states the exclusion so this figure cannot be mistaken for the
# full-range plot, which previously carried the identical title
plt.title("How Team Statistics Have Shifted Over Time (2010–2025, Excluding 2020–2021)")
plt.ylabel("League Average")
plt.xlabel("Season")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

### Takeaways

To analyze how league averages have shifted over time, I used several key statistics: points, opponent points, 3-point attempts, 3-pointers made, turnovers, field goal attempts, and field goals made. The first graph includes the 2020-2021 season and the second graph does not. I did this to show that the 2020-2021 season is an outlier due to Covid-19 and the modified schedules. The second graph disregards the 2020-2021 season to show the more accurate trends across seasons. Additionally, I used the number of field goals and 3-pointers made instead of the percentages so that the lines on the graph were visible. At first I used the percentages, but because some of the other values are so large, the percentages looked like they were zero. These adjustments to the data and plots helped me draw conclusions about the trends over time.


The plots above highlight that teams are attempting and making more 3-pointers over time, which is contributing to a gradual rise in overall scoring. Additionally, the end of the 2019-2020 season was the beginning of Covid-19, which explains the slight decrease in overall averages. The lasting effect of the Covid-19 seasons is also shown by the 2021-2022 season: the league's average points scored is lower than in 2015-2019, which could show the effect that a cancelled or modified season had on players and coaches. Additionally, turnovers have been showing a slight decline, which could indicate improvements in ball handling and a focus on offensive preparation. These factors suggest the game is moving toward a more offense-focused style that emphasizes higher scoring and more shot attempts over defensive stops.

## Question 3: Do certain SEC teams have statistical "signatures"? 

This question aims to answer if certain teams have a statistical signature. For example, does a team excel in making 3 point shots or in causing turnovers. I decided to compare teams within the SEC conference. 

In [None]:
# Schools treated as the SEC for this comparison
sec_teams = [
    "Alabama", "Arkansas", "Auburn", "Florida", "Georgia", "Kentucky",
    "Louisiana State", "Mississippi State", "Mississippi", "Missouri",
    "South Carolina", "Tennessee", "Texas A&M", "Vanderbilt",
]

# Independent copy of just the rows belonging to those schools
is_sec = df_all["School"].isin(sec_teams)
sec_df = df_all.loc[is_sec].copy()

# Show which of the listed schools were actually matched in the data
sec_df["School"].unique()


In [None]:
# Statistics compared across the SEC schools
sec_stat_cols = [
    "PTS", "Opp PTS",
    "FG%", "3P", "3PA", "FT%",
    "TRB", "AST", "STL", "BLK", "TOV",
    "SRS", "SOS",
]

# One row per school: the mean of each chosen statistic over all of its seasons
sec_team_stats = sec_df[["School", *sec_stat_cols]].groupby("School").mean()

In [None]:
# Column-wise z-scores: each statistic is centred on the mean of the SEC
# schools and scaled by their standard deviation, so positive values are
# above the conference average and negative values below it
stat_means = sec_team_stats.mean()
stat_spreads = sec_team_stats.std()

sec_team_z = sec_team_stats.sub(stat_means).div(stat_spreads)

In [None]:
# Heatmap of the z-scores: rows are schools, columns are statistics, and the
# colour scale is centred on zero (the conference average)
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(sec_team_z, cmap="coolwarm", center=0, annot=False, ax=ax)

ax.set_title("SEC Team Statistical Signatures (Z-Scores Across 2010–2025)")
ax.set_xlabel("Statistic")
ax.set_ylabel("Team")
plt.tight_layout()
plt.show()


### Takeaways 

To explore whether teams in the SEC have distinct “statistical signatures,” I made a new dataframe containing only the teams in the SEC conference. Then I computed the average of each of the given columns for every team and standardized the results using z-scores. I used z-scores and a heat map because the statistics being used vary widely in their values; for example, points per season are in the thousands, while blocks are around 50-150 per team. Using z-scores allows us to compare these values relatively. The plot above highlights each team's strengths and weaknesses relative to the SEC average. For example, Kentucky is well above the SEC averages for points scored, field goal percentage, total rebounds, assists, blocks, and SRS. Kentucky is a historically well-established basketball school, and this evidence supports their dominance in the SEC. Auburn is very middle of the pack based on this graph. In many of the categories Auburn is at average or very close to average. Some of the areas Auburn does best in include 3-point attempts and makes, and some areas for improvement include field goal percentage, free throws, and turnovers. This plot highlights the meaningful differences between the ways teams in the SEC play. This could be a helpful guide for coaches and players to see where their team excels and where improvements can be focused.

## Question 4: Outliers per season and outliers in 2024-2025? 

For this question the exploration will take two approaches. The first will see how many outliers there are per season based on NET points scored using points scored - points against. Then we will also see the specific outliers of the 2024-2025 season. 

In [None]:
# Net points for each team-season: points scored minus points allowed
df_all["NET"] = df_all["PTS"].sub(df_all["Opp PTS"])

# Standardise NET against the mean and spread of every team-season together
net = df_all["NET"]
df_all["NET_z"] = (net - net.mean()) / net.std()

# A team-season counts as an outlier when it lies more than 2 standard
# deviations from the overall mean, in either direction
is_outlier = df_all["NET_z"].abs() > 2

team_outliers = df_all[is_outlier]
# NOTE: this is not the last expression in the cell, so the notebook does not render it
team_outliers[["School", "Season", "NET", "NET_z"]]

# Every team-season's NET by season, with the outliers drawn in red
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df_all,
    x="Season",
    y="NET",
    hue=is_outlier,
    palette={False: "gray", True: "red"},
    ax=ax,
)
plt.xticks(rotation=45)
ax.set_title("Outlier Teams by Net Points (PTS – Opp PTS)")
plt.show()


In [None]:
# Restrict the outlier check to a single season: 2024-2025
df_2025 = df_all.loc[df_all["Season"] == "2024-2025"].copy()

# Re-standardise NET using only that season's mean and spread
net_2025 = df_2025["NET"]
df_2025["NET_z"] = (net_2025 - net_2025.mean()) / net_2025.std()

outliers_2025 = df_2025[df_2025["NET_z"].abs() > 2]

# Table of the season's outlier teams in either direction, highest z-score first
shown_cols = ["School", "NET", "NET_z"]
outliers_2025[shown_cols].sort_values("NET_z", ascending=False)


### Takeaways

To explore outliers across all seasons and outlier teams in the 2024-2025 season, I used z-scores and net points scored. Net points is the total points scored by a team minus the points scored against them. Using a z-score then standardized the data set and allowed me to detect which teams were outliers. Overall, in each season there are many outliers on both ends of the spectrum. This speaks to basketball being a very dynamic and ever-changing sport. It also shows how most teams stay within the average variance while some teams are truly outliers in either direction.


Next, I decided to take a closer look at a specific season, and I chose the most recent season, 2024-2025. I used the same process to find the z-scores and compare which teams had a large point differential. As shown above, Duke, UC San Diego, Florida, and even Auburn had seasons above the average. This makes sense, as most of the teams showing this outlier behavior played in the March Madness NCAA tournament. On the other end of the spectrum, we can see that various teams underperformed this season, had more points scored against them, and were also deemed outliers.

## Question 5: Richie

In [None]:
# Points per game, rounded to 2 decimals and placed right after the PTS column.
# Assigning first and then pop/insert keeps the cell safe to re-run.
df_all["PPG"] = (df_all["PTS"] / df_all["G"]).round(2)
ppg = df_all.pop("PPG")
df_all.insert(df_all.columns.get_loc("PTS") + 1, "PPG", ppg)

# Opponent points per game, same treatment, placed right after Opp PTS
df_all["OPPG"] = (df_all["Opp PTS"] / df_all["G"]).round(2)
oppg = df_all.pop("OPPG")
df_all.insert(df_all.columns.get_loc("Opp PTS") + 1, "OPPG", oppg)

# School -> conference lookup. A single membership list is applied to every
# season in the data.
# NOTE(review): clean_season strips parenthetical qualifiers from school
# names, so "Miami" may match both Miami (FL) and Miami (OH) - confirm.
conference_map = {
    "Alabama": "SEC", "Auburn": "SEC", "Arkansas": "SEC",
    "Florida": "SEC", "Georgia": "SEC", "Kentucky": "SEC",
    "Louisiana State": "SEC", "Mississippi": "SEC", "Mississippi State": "SEC",
    "Missouri": "SEC", "South Carolina": "SEC", "Tennessee": "SEC",
    "Texas A&M": "SEC", "Vanderbilt": "SEC", "Texas": "SEC", "Oklahoma": "SEC",

    "Boston College": "ACC", "California": "ACC", "Clemson": "ACC",
    "Duke": "ACC", "Florida State": "ACC", "Georgia Tech": "ACC",
    "Louisville": "ACC", "North Carolina": "ACC", "Miami": "ACC",
    "NC State": "ACC", "Notre Dame": "ACC", "Pittsburgh": "ACC",
    "Southern Methodist": "ACC", "Stanford": "ACC", "Syracuse": "ACC",
    "Virginia": "ACC", "Virginia Tech": "ACC", "Wake Forest": "ACC",

    "Illinois": "Big 10", "Indiana": "Big 10", "Iowa": "Big 10",
    "Maryland": "Big 10", "Michigan": "Big 10", "Michigan State": "Big 10",
    "Minnesota": "Big 10", "Nebraska": "Big 10", "Northwestern": "Big 10",
    "Ohio State": "Big 10", "Oregon": "Big 10", "Penn State": "Big 10",
    "Purdue": "Big 10", "Rutgers": "Big 10", "UCLA": "Big 10",
    "Southern California": "Big 10", "Washington": "Big 10", "Wisconsin": "Big 10",

    "Arizona": "Big 12", "Arizona State": "Big 12", "Baylor": "Big 12",
    "Brigham Young": "Big 12", "Cincinnati": "Big 12", "Colorado": "Big 12",
    "Houston": "Big 12", "Iowa State": "Big 12", "Kansas": "Big 12",
    "Kansas State": "Big 12", "Oklahoma State": "Big 12", "TCU": "Big 12",
    "Texas Tech": "Big 12", "UCF": "Big 12", "Utah": "Big 12",
    "West Virginia": "Big 12",

    "Seton Hall": "Big East", "Connecticut": "Big East", "Butler": "Big East",
    "Villanova": "Big East", "Georgetown": "Big East", "Xavier": "Big East",
    "Creighton": "Big East", "DePaul": "Big East", "St. John's": "Big East",
    "Marquette": "Big East", "Providence": "Big East",
}

# Schools that are not in the lookup get NaN as their conference
df_all["Conference"] = df_all["School"].map(conference_map)

# Keep only the rows whose school belongs to one of the five mapped conferences
power5_df = df_all[df_all["Conference"].isin(["SEC", "ACC", "Big 10", "Big 12", "Big East"])]

In [None]:
stats = ["SRS", "SOS", "PPG", "OPPG", "FG%", "3P%", "FT%", "TRB", "AST", "STL", "BLK", "TOV"]

# Pin the hue levels once. Each facet is drawn by a separate scatterplot call,
# and without an explicit order each call assigns colours by the order in
# which conferences appear in its own subset, so a colour could mean a
# different conference from one season to the next.
conference_order = sorted(power5_df["Conference"].dropna().unique())

# One figure per statistic: a grid of per-season scatterplots of that
# statistic against win percentage, coloured by conference
for stat in stats:
    g = sns.FacetGrid(
        power5_df,
        col="Season",
        col_wrap=5,               # 5 panels per row
        height=4.5,
        sharex=False,
        sharey=False
    )

    g.map_dataframe(
        sns.scatterplot,
        x=stat,
        y="W-L%",
        hue="Conference",         # colour encodes the conference
        hue_order=conference_order,
        palette="tab10"
    )

    # The legend describes the hue variable, which is the conference
    g.add_legend(title="Conference")
    g.set_axis_labels(stat, "Win Percentage")
    g.fig.suptitle(f"{stat} vs W-L% by Conference", y=1.03)

    plt.show()

## Question 6: Richie

This is an exploration of how a team's average point differential (the gap between their PPG and their OPPG) relates to the share of games they won in a season.

In [None]:
# Average point differential per game: points scored per game minus points
# allowed per game, rounded to 2 decimals
df_all["Avg Point Dif"] = df_all["PPG"].sub(df_all["OPPG"]).round(2)

# One point per team-season: point differential against win percentage
fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(data=df_all, x="Avg Point Dif", y="W-L%", alpha=0.7, ax=ax)

ax.set_title("Point Differential vs Win Percentage")
ax.set_xlabel("Average Point Differential")
ax.set_ylabel("Win Percentage")
ax.grid(True, linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()

## Question 7: Richie

In [None]:
# National champion for each season, keyed by the season label used in
# df_all["Season"]. Keying by label (rather than pairing a list with the
# sorted seasons by position) means a missing or extra season in the data
# cannot shift every champion onto the wrong year.
champions_by_season = {
    "2024-2025": "Florida",
    "2023-2024": "Connecticut",
    "2022-2023": "Connecticut",
    "2021-2022": "Kansas",
    "2020-2021": "Baylor",
    "2019-2020": "No Champ",
    "2018-2019": "Virginia",
    "2017-2018": "Villanova",
    "2016-2017": "North Carolina",
    "2015-2016": "Villanova",
    "2014-2015": "Duke",
    "2013-2014": "Connecticut",
    "2012-2013": "Louisville",
    "2011-2012": "Kentucky",
    "2010-2011": "Connecticut",
}

# Same champions as a plain list, most recent season first
champions = list(champions_by_season.values())

# Seasons present in the data, most recent first
seasons_sorted = sorted(df_all["Season"].unique(), reverse=True)

# Point out any season in the data that has no champion entry
unmapped_seasons = [s for s in seasons_sorted if s not in champions_by_season]
if unmapped_seasons:
    print(f"No champion listed for seasons: {unmapped_seasons}")

# Season / Champion lookup table
df_champs = pd.DataFrame(
    list(champions_by_season.items()),
    columns=["Season", "Champion"],
)

# Merge champs into the main dataframe. Dropping any Champion column left by
# an earlier run keeps the cell re-runnable (a second merge would otherwise
# produce Champion_x / Champion_y).
df_all = (
    df_all.drop(columns="Champion", errors="ignore")
    .merge(df_champs, on="Season", how="left")
)

In [None]:
# Create a function to make a dataframe of only Auburn vs specific season champ
def build_compare_df(df_all, stats):
    """Build a long table pairing Auburn with each season's champion.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain ``Season``, ``School`` and ``Champion`` columns plus
        every column named in ``stats``.
    stats : iterable of str
        Names of the stat columns to copy into the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Season``, ``Team`` and one per stat. Each qualifying season
        contributes two rows: Auburn first, then the champion. A season is
        skipped when it has no champion value, when the champion is
        ``"No Champ"``, or when either Auburn or the champion has no row for
        that season. The columns are present even when no season qualifies,
        so callers can always index ``["Season"]``.
    """
    stats = list(stats)
    rows = []

    # sort=False keeps seasons in order of first appearance; one pass per
    # season group replaces repeated scans of the whole frame
    for season, season_df in df_all.groupby("Season", sort=False):
        champs = season_df["Champion"].dropna()
        if champs.empty or champs.iloc[0] == "No Champ":
            continue
        champ_team = champs.iloc[0]

        auburn_rows = season_df[season_df["School"] == "Auburn"]
        champ_rows = season_df[season_df["School"] == champ_team]
        if auburn_rows.empty or champ_rows.empty:
            continue

        # The first matching row supplies the stats for each team
        for team, team_rows in (("Auburn", auburn_rows), (champ_team, champ_rows)):
            rows.append({
                "Season": season,
                "Team": team,
                **{s: team_rows[s].iloc[0] for s in stats},
            })

    # Explicit columns so an empty result still has the expected schema
    return pd.DataFrame(rows, columns=["Season", "Team", *stats])

In [None]:
# Stats compared between Auburn and each season's champion
stats = ["W-L%", "PPG", "BLK", "STL", "TRB", "AST"]
# Long table with an Auburn row and a champion row for each qualifying season
df_compare = build_compare_df(df_all, stats)

In [None]:
# Create a radar plot maker

def radar_plot(df_compare, season, stats):
    """Draw a radar chart of Auburn against the other team for one season.

    Parameters
    ----------
    df_compare : pandas.DataFrame
        Table with ``Season`` and ``Team`` columns plus the columns in ``stats``.
    season : str
        Season whose rows are plotted.
    stats : list of str
        Stat columns, one radar axis each.

    Each stat is divided by the larger of the two teams' values, so the
    larger value on every axis is plotted at 1.0. The figure is shown
    immediately; nothing is returned.
    """
    season_rows = df_compare[df_compare["Season"] == season]
    is_auburn = season_rows["Team"] == "Auburn"

    auburn_vals = season_rows.loc[is_auburn, stats].values.flatten()
    champ_vals = season_rows.loc[~is_auburn, stats].values.flatten()

    # Per-stat scale: the larger of the two teams' values
    scale = np.maximum(auburn_vals, champ_vals)

    def close_loop(values):
        """Repeat the first element at the end so the outline closes."""
        return np.concatenate([values, [values[0]]])

    # One evenly spaced angle per stat, closed back to the starting angle
    theta = close_loop(np.linspace(0, 2 * np.pi, len(stats), endpoint=False))

    auburn_closed = close_loop(auburn_vals / scale)
    champ_closed = close_loop(champ_vals / scale)

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    # Outline plus translucent fill for each team, Auburn first
    series = (
        (auburn_closed, "orange", "Auburn"),
        (champ_closed, "blue", "Champion"),
    )
    for closed, colour, label in series:
        ax.plot(theta, closed, color=colour, linewidth=2, label=label)
        ax.fill(theta, closed, color=colour, alpha=0.25)

    # Label each spoke with its stat name (the closing angle is skipped)
    ax.set_xticks(theta[:-1])
    ax.set_xticklabels(stats, fontsize=12)
    plt.title(f"Auburn vs Champion Radar Chart — {season}", fontsize=15)
    plt.legend(loc="upper right", bbox_to_anchor=(1.2, 1.1))
    plt.show()


In [None]:
# Make radar plots: one chart per season in df_compare, most recent season first
for season in sorted(df_compare["Season"].unique(), reverse=True):
    radar_plot(df_compare, season, stats)

## Takeaways/Conclusion: Richie 

In [None]:
# Write the combined dataframe, including the columns added during the
# analysis, to a CSV file without the index column.
# NOTE(review): the output name has no ".csv" extension - confirm whether that is intended.
df_all.to_csv("final_data", index = False)