### Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data

In [2]:
df_cfb = pd.read_csv('../datasets/subreddit_cfb', usecols = ['title', 'subreddit', 'selftext'])
print(df_cfb.shape)
df_cfb.head()

(1000, 3)


Unnamed: 0,selftext,subreddit,title
0,,CFB,AD Allen Greene: Auburn still planning on full...
1,Mine is 2015 Michigan State-Michigan.\n\nFor t...,CFB,Most fun road game you've attended?
2,[removed],CFB,Best/cheapest way for canadian trying to watch
3,[removed],CFB,Who are some of the best offensive coordinator...
4,Goooooood morning /r/CFB! I am your host and h...,CFB,Do you like rooting for the Underdog week in a...


In [3]:
df_bball = pd.read_csv('../datasets/subreddit_cbball', usecols = ['title', 'subreddit', 'selftext'])
print(df_bball.shape)
df_bball.head()

(1000, 3)


Unnamed: 0,selftext,subreddit,title
0,,CollegeBasketball,Sister Jean Turns 102!
1,,CollegeBasketball,#14 North Carolina 2021-22 Preview — Three-Man...
2,I would definitely watch. It'd be interesting ...,CollegeBasketball,Will CBB teams ever play OTE teams in an exhib...
3,I’ll do a general breakdown of the teams and h...,CollegeBasketball,My ACC preseason rankings and season overviews
4,https://www.indystar.com/story/news/crime/2021...,CollegeBasketball,Butler University Student And Basketball Team ...


In [4]:
df = pd.concat([df_cfb, df_bball])
print(df.shape)
df.head()

(2000, 3)


Unnamed: 0,selftext,subreddit,title
0,,CFB,AD Allen Greene: Auburn still planning on full...
1,Mine is 2015 Michigan State-Michigan.\n\nFor t...,CFB,Most fun road game you've attended?
2,[removed],CFB,Best/cheapest way for canadian trying to watch
3,[removed],CFB,Who are some of the best offensive coordinator...
4,Goooooood morning /r/CFB! I am your host and h...,CFB,Do you like rooting for the Underdog week in a...


### Data Cleaning

In [5]:
# College Football is 0 and College Basketball is 1
df['subreddit'] = df['subreddit'].map({'CFB': 0, 'CollegeBasketball': 1})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   selftext   991 non-null    object
 1   subreddit  2000 non-null   int64 
 2   title      2000 non-null   object
dtypes: int64(1), object(2)
memory usage: 62.5+ KB


In [7]:
df[df.duplicated(subset = 'selftext')]

Unnamed: 0,selftext,subreddit,title
3,[removed],0,Who are some of the best offensive coordinator...
5,[removed],0,ELI5: why is the ACC/Big Ten/Pac-12 alliance t...
6,[removed],0,16-Team Playoff Scenarios
8,,0,Ryan Day names CJ Stroud as Ohio State Startin...
9,,0,OSU Football announces CJ Stroud as the starti...
...,...,...,...
995,,1,This is the newest project of him?
996,[removed],1,Who was it that hit the game winner in the fin...
997,,1,Easy Money !!
998,,1,[Goodman] Marquette transfer Dawson Garcia tol...


In [8]:
df.fillna('', inplace = True)
df['post'] = df['title'] + ' ' + df['selftext']
df['post_length'] = df['post'].str.len()
df['post_word_count'] = df['post'].str.count(' ') + 1
print(df.shape)
df.head()

(2000, 6)


Unnamed: 0,selftext,subreddit,title,post,post_length,post_word_count
0,,0,AD Allen Greene: Auburn still planning on full...,AD Allen Greene: Auburn still planning on full...,115,17
1,Mine is 2015 Michigan State-Michigan.\n\nFor t...,0,Most fun road game you've attended?,Most fun road game you've attended? Mine is 20...,311,58
2,[removed],0,Best/cheapest way for canadian trying to watch,Best/cheapest way for canadian trying to watch...,56,8
3,[removed],0,Who are some of the best offensive coordinator...,Who are some of the best offensive coordinator...,97,14
4,Goooooood morning /r/CFB! I am your host and h...,0,Do you like rooting for the Underdog week in a...,Do you like rooting for the Underdog week in a...,1676,292


In [9]:
df['title'].sort_values()

670    "Alabama had only SIX 4th downs in quarters 1-...
418    "And the Ball is Free" is the greatest sports ...
567                         "Back to my playing weight."
1      #14 North Carolina 2021-22 Preview — Three-Man...
14          #15 Oregon 2021-22 Preview — Three-Man-Weave
                             ...                        
347    यथार्थ_कबीर_पंथ 600 years ago from today, God ...
636    “Gators forward Samson Ruzhentsev is no longer...
202                                            오즈코리아먹튀제보
989    ￼ Adam Schefter - Florida State announced the ...
268                                                    😏
Name: title, Length: 2000, dtype: object

In [10]:
df['title'][~df['title'].str.isnumeric()]

0      AD Allen Greene: Auburn still planning on full...
1                    Most fun road game you've attended?
2         Best/cheapest way for canadian trying to watch
3      Who are some of the best offensive coordinator...
4      Do you like rooting for the Underdog week in a...
                             ...                        
995                   This is the newest project of him?
996    Who was it that hit the game winner in the fin...
997                                        Easy Money !!
998    [Goodman] Marquette transfer Dawson Garcia tol...
999    [Jon Rothstein] Dawson Garcia tells me that he...
Name: title, Length: 1999, dtype: object