In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:

# Read the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv("cbb.csv")

# Preview the first five rows; head() defaults to n=5.
df.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


In [4]:
# Inspect df: dimensions first, then per-column dtypes and non-null counts.
# Only the last expression of a cell is auto-displayed, so a bare `df.shape`
# in the middle of the cell shows nothing; print it explicitly instead.
print(df.shape)

# info() writes its report to stdout itself (it returns None).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3885 entries, 0 to 3884
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM        3885 non-null   object 
 1   CONF        3885 non-null   object 
 2   G           3885 non-null   int64  
 3   W           3885 non-null   int64  
 4   ADJOE       3885 non-null   float64
 5   ADJDE       3885 non-null   float64
 6   BARTHAG     3885 non-null   float64
 7   EFG_O       3885 non-null   float64
 8   EFG_D       3885 non-null   float64
 9   TOR         3885 non-null   float64
 10  TORD        3885 non-null   float64
 11  ORB         3885 non-null   float64
 12  DRB         3885 non-null   float64
 13  FTR         3885 non-null   float64
 14  FTRD        3885 non-null   float64
 15  2P_O        3885 non-null   float64
 16  2P_D        3885 non-null   float64
 17  3P_O        3885 non-null   float64
 18  3P_D        3885 non-null   float64
 19  ADJ_T       3885 non-null  

In [None]:
# Count missing (NA) entries in every column.
# The columns that contain missing values are not used later on, so no rows
# need to be dropped.
missing_values = df.isna().sum(axis=0)
missing_values

TEAM             0
CONF             0
G                0
W                0
ADJOE            0
ADJDE            0
BARTHAG          0
EFG_O            0
EFG_D            0
TOR              0
TORD             0
ORB              0
DRB              0
FTR              0
FTRD             0
2P_O             0
2P_D             0
3P_O             0
3P_D             0
ADJ_T            0
WAB              0
POSTSEASON    3137
SEED          3137
YEAR             0
dtype: int64

In [6]:
# Select the feature columns used for the similarity comparison.
features = [
    "ADJOE",
    "ADJDE",
    "BARTHAG",
    "EFG_O",
    "EFG_D",
    "TOR",
]

In [None]:
# Standardise the selected feature columns with StandardScaler (default
# settings) so that no single column dominates the similarity because of
# its scale. The fitted scaler is kept in `scaler`.
scaler = StandardScaler()
scaled = scaler.fit_transform(df.loc[:, features])

In [None]:
# Pairwise cosine similarity between every pair of rows of `scaled`.
# The result is a square matrix with one row and one column per row of
# `scaled`; entry [i, j] compares row i with row j.

cos_sim = cosine_similarity(X=scaled)


In [None]:
# Teams to report on, and how many nearest neighbours to list for each.
teams = ["North Carolina", "Wisconsin"]
TOP_N = 10

# cos_sim is addressed by row *position*, so teams are resolved to positions
# (not index labels); this stays correct even if df's index is not 0..n-1.
team_names = df["TEAM"].to_numpy()

for team in teams:
    # A team appears once per year in df; the first matching row is used
    # as the reference row for the comparison.
    positions = np.flatnonzero(team_names == team)
    if positions.size == 0:
        # Unknown team name: report it instead of failing with an IndexError.
        print(f"\nNo rows found for team {team!r}; skipping.")
        continue
    first_index = int(positions[0])

    similarities = cos_sim[first_index]

    # Descending order of similarity; a stable sort keeps ties in row order.
    ranked = np.argsort(-similarities, kind="stable")

    # Remove the reference row explicitly rather than assuming it sorts
    # first: ties or floating-point rounding could place another row ahead.
    ranked = ranked[ranked != first_index][:TOP_N]

    # (row position, similarity) pairs for the TOP_N most similar rows.
    top_similar = [(int(i), similarities[i]) for i in ranked]

    print(f"\nTop {TOP_N} Most Similar Teams to {team}:")
    for rank, (i, score) in enumerate(top_similar, 1):
        row = df.iloc[i]  # fetch the row once instead of once per field
        print(f"{rank}. {row['TEAM']} ({row['CONF']}, {row['YEAR']}), Similarity: {score:.3f}")
    



Top 10 Most Similar Teams to North Carolina:
1. Oregon (P12, 2016), Similarity: 0.996
2. Ohio St. (B10, 2021), Similarity: 0.994
3. Purdue (B10, 2019), Similarity: 0.992
4. Florida (SEC, 2024), Similarity: 0.988
5. Texas Tech (B12, 2024), Similarity: 0.987
6. George Washington (A10, 2016), Similarity: 0.986
7. LSU (SEC, 2021), Similarity: 0.984
8. Michigan St. (B10, 2023), Similarity: 0.984
9. Michigan (B10, 2013), Similarity: 0.982
10. North Carolina (ACC, 2017), Similarity: 0.981

Top 10 Most Similar Teams to Wisconsin:
1. Iowa (B10, 2021), Similarity: 0.997
2. Wisconsin (B10, 2014), Similarity: 0.991
3. Notre Dame (ACC, 2017), Similarity: 0.991
4. Duke (ACC, 2016), Similarity: 0.987
5. Michigan (B10, 2013), Similarity: 0.986
6. Iowa St. (B12, 2017), Similarity: 0.985
7. Duke (ACC, 2024), Similarity: 0.984
8. Ohio St. (B10, 2021), Similarity: 0.984
9. Iowa St. (B12, 2014), Similarity: 0.984
10. Butler (BE, 2016), Similarity: 0.984
