In [2]:
import os
import glob
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import watchcbb.utils as utils
import watchcbb.efficiency as eff
import watchcbb.sql as sql

In [29]:
# Load every row and column of the `teams` table through the project's
# sql.df_from_query helper, then preview the first few rows.
teams_query = """
    SELECT * from teams
"""
df_teams = sql.df_from_query(teams_query)
df_teams.head()

Unnamed: 0,team_id,conference,location,year_start,year_end
0,abilene-christian,Southland,"Abilene, Texas",1971,2020
1,air-force,MWC,"USAF Academy, Colorado",1958,2020
2,akron,MAC,"Akron, Ohio",1902,2020
3,alabama-am,SWAC,"Normal, Alabama",2000,2020
4,alabama,SEC,"Tuscaloosa, Alabama",1913,2020


In [6]:
# Load all columns of `game_data`, restricted to rows whose "Season"
# value is 2018 or later, and report the resulting shape.
games_query = """
    SELECT * from game_data
    WHERE "Season">=2018
    """
df_games = sql.df_from_query(games_query)
print(df_games.shape)
df_games.head()

(16470, 37)


Unnamed: 0,Season,Date,gid,Type,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,poss
0,2018,2017-11-10,2017-11-10_maryland_stony-brook,RG,maryland,76,stony-brook,61,N,0,...,6,9,5,19,15,13,7,2,19,74.2
1,2018,2017-11-10,2017-11-10_massachusetts_massachusetts-lowell,RG,massachusetts,74,massachusetts-lowell,69,H,0,...,23,30,5,21,15,13,9,2,21,71.58
2,2018,2017-11-10,2017-11-10_marquette_mount-st-marys,RG,marquette,80,mount-st-marys,59,H,0,...,6,11,4,19,12,15,3,3,16,67.04
3,2018,2017-11-10,2017-11-10_marshall_tennessee-martin,RG,marshall,102,tennessee-martin,91,H,1,...,12,15,9,22,16,21,4,0,29,82.28
4,2018,2017-11-10,2017-11-10_michigan-state_north-florida,RG,michigan-state,98,north-florida,66,H,0,...,9,15,8,14,8,19,9,2,24,83.34


In [7]:
# Load the per-thread columns (date, game id, postgame flag, upvotes,
# comment count) from the `gamethreads` table and report the shape.
gamethreads_query = """
    SELECT date,gid,is_postgame,ups,num_comments from gamethreads
    """
df_reddit = sql.df_from_query(gamethreads_query)
print(df_reddit.shape)
df_reddit.head()

(14387, 5)


Unnamed: 0,date,gid,is_postgame,ups,num_comments
0,2017-11-10,2017-11-10_alabama-birmingham_jacksonville,True,9,0
1,2017-11-10,2017-11-10_alabama_memphis,False,6,81
2,2017-11-10,2017-11-10_albany-ny_iona,False,7,8
3,2017-11-10,2017-11-10_arizona_northern-arizona,False,13,82
4,2017-11-10,2017-11-10_austin-peay_vanderbilt,False,4,2


In [18]:
def _split_flairs(raw):
    """Convert a brace-wrapped, comma-separated flair string into a list.

    Leading/trailing ``{`` and ``}`` characters are stripped, the remainder
    is split on commas, and surrounding double quotes are removed from each
    piece. Note that a value with nothing between the braces yields ``['']``
    (a single empty string), not an empty list.
    """
    inner = raw.strip('{}')
    return [piece.strip('"') for piece in inner.split(',')]


# Load one row per comment, then replace the raw author_flair string with
# the parsed list of flair names.
comments_query = """
    SELECT gid,is_postgame,author,author_flair from gamethread_comments
"""
df_comments = sql.df_from_query(comments_query)
df_comments['author_flair'] = df_comments['author_flair'].apply(_split_flairs)
print(df_comments.shape)
df_comments.head()

(387257, 4)


Unnamed: 0,gid,is_postgame,author,author_flair
0,2019-11-06_binghamton_cornell,True,CaffeinationGoat,"[Connecticut Huskies, Binghamton Bearcats]"
1,2019-11-06_binghamton_cornell,True,cheesoidhateself,[Cornell Big Red]
2,2019-11-06_binghamton_cornell,True,PAPA_JOHNS_ZIMBABWE,[NJIT Highlanders]
3,2019-11-06_binghamton_cornell,True,IsYouWitItYaBish,[Wisconsin Badgers]
4,2019-11-06_boston-college_wake-forest,False,mrfixit420,[Wake Forest Demon Deacons]


In [30]:
# Flatten the per-comment flair lists into one long Series so that each
# flair occurrence is counted once per comment it appears on.
all_flairs = []
for flair_list in df_comments.author_flair.values.tolist():
    all_flairs.extend(flair_list)
flairs = pd.Series(all_flairs)

print(flairs.nunique())
flairs.value_counts().head(20)

591


                              36516
Michigan State Spartans       19424
Michigan Wolverines           19241
Purdue Boilermakers           15694
Kentucky Wildcats             14637
Virginia Cavaliers            14370
Maryland Terrapins            13355
Kansas Jayhawks               12559
North Carolina Tar Heels      12538
Iowa Hawkeyes                 11603
Indiana Hoosiers               9909
West Virginia Mountaineers     9507
Duke Blue Devils               8413
Syracuse Orange                8221
NC State Wolfpack              8028
Cincinnati Bearcats            7885
Wisconsin Badgers              7421
Ohio State Buckeyes            7233
Illinois Fighting Illini       7138
Iowa State Cyclones            7106
dtype: int64

In [49]:
# Heuristically map each flair string to a team_id from df_teams.
#
# A flair is lower-cased, '&' is removed, and it is split on whitespace. The
# words are then joined with '-' and compared against the known team_ids,
# starting with all words and dropping one trailing word at a time
# (e.g. "a b c" -> "a-b-c", "a-b", "a"). The first candidate that is a known
# team_id and has not already been claimed by another flair wins.
team_ids = df_teams.team_id.values.tolist()
known_team_ids = set(team_ids)   # set lookup instead of scanning the list
claimed_team_ids = set()         # team_ids already assigned to some flair

mapping = {}
# value_counts() lists flairs from most to least frequent, so when two flairs
# compete for the same team_id the more common flair gets it.
for flair in flairs.value_counts().index:
    parts = flair.lower().replace('&', '').strip().split()
    # Longest prefix first; an empty flair has no parts and is skipped.
    for n_words in range(len(parts), 0, -1):
        candidate = '-'.join(parts[:n_words])
        if candidate in known_team_ids and candidate not in claimed_team_ids:
            mapping[flair] = candidate
            claimed_team_ids.add(candidate)
            break
print(len(mapping))

# Most frequent flairs that were NOT matched to any team_id.
flairs[~flairs.isin(list(mapping))].value_counts().head(20)

260


                             36516
NC State Wolfpack             8028
Final Four                    2636
Big East                      2552
VCU Rams                      1941
St. John's (NY) Red Storm     1827
LSU Tigers                    1770
Big Ten                       1401
NIT                           1363
March Madness                 1348
Yeshiva Maccabees             1028
UMBC Retrievers                963
Poll Veteran                   925
Best Of Winner                 816
UCF Knights                    778
TCU Horned Frogs               767
UNC Greensboro Spartans        749
USC Trojans                    648
r                              605
CollegeBasketball              575
dtype: int64