### Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from nltk.stem import PorterStemmer, WordNetLemmatizer
# from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

### Data

In [2]:
df_cfb = pd.read_csv('../datasets/subreddit_cfb', usecols = ['title', 'subreddit', 'selftext'])
print(df_cfb.shape)
df_cfb.head()

(2500, 3)


Unnamed: 0,selftext,subreddit,title
0,I have seen several times during all the re-al...,CFB,How do research budgets impact conferences?
1,[removed],CFB,Is there a chance Pac12 can just kick out WSU?...
2,,CFB,[Sam Block] - The last time Michigan beat Ohio...
3,[removed],CFB,Rename the Big Ten (which has 14 members) and ...
4,,CFB,"Nick Saban on Jahleel Billingsley, team: “This..."


In [3]:
df_bball = pd.read_csv('../datasets/subreddit_cbball', usecols = ['title', 'subreddit', 'selftext'])
print(df_bball.shape)
df_bball.head()

(2500, 3)


Unnamed: 0,selftext,subreddit,title
0,***The Massachusetts Showdown***\n\nIn this se...,CollegeBasketball,The Intra-State Showdown: Massachusetts
1,"***The Maryland Showdown***\n\nIn this series,...",CollegeBasketball,The Intra-State Showdown: Maryland
2,***The Louisiana Showdown***\n\nIn this series...,CollegeBasketball,The Intra-State Showdown: Louisiana
3,"B10 has 14 members, B12 has 8. The other power...",CollegeBasketball,Rename the Big Ten and Big 12
4,,CollegeBasketball,2020 NBA Mock Draft (First Round)


In [4]:
df = pd.concat([df_cfb, df_bball])
print(df.shape)
df.head()

(5000, 3)


Unnamed: 0,selftext,subreddit,title
0,I have seen several times during all the re-al...,CFB,How do research budgets impact conferences?
1,[removed],CFB,Is there a chance Pac12 can just kick out WSU?...
2,,CFB,[Sam Block] - The last time Michigan beat Ohio...
3,[removed],CFB,Rename the Big Ten (which has 14 members) and ...
4,,CFB,"Nick Saban on Jahleel Billingsley, team: “This..."


### Data Cleaning

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 2499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   selftext   2362 non-null   object
 1   subreddit  5000 non-null   object
 2   title      4999 non-null   object
dtypes: object(3)
memory usage: 156.2+ KB


In [6]:
# College Football is 0 and College Basketball is 1
df['subreddit'] = df['subreddit'].map({'CFB': 0, 'CollegeBasketball': 1})
df['subreddit'].value_counts()

0    2500
1    2500
Name: subreddit, dtype: int64

In [7]:
df['selftext'] = df['selftext'].map(lambda cell: np.nan if cell == '[removed]' else str())
df.fillna('', inplace = True)

In [8]:
df['post'] = df['title'] + ' ' + df['selftext']
df['post_length'] = df['post'].str.len()
df['post_word_count'] = df['post'].str.count(' ') + 1

In [9]:
# text cleaning
#df['post'] = df['post'].apply(lambda x: re.sub(r'http\S+', '', x))

In [10]:
df['post'] = [re.sub('[\(\[].*?[\)\]]', '', t.replace('\n', '')) for t in df.post]
df['post'] = [t.replace('/\\', ' ').replace(':-)', ' ').replace('tdb> ', ' ') 
              for t in df.post]
df['post'] = [t.replace('=', ' ').replace('--', ' ').replace('_', ' ')\
              .replace('}', ' ').replace('*', ' ').replace('^', ' ')\
              .replace('~', ' ')
              for t in df.post]
df['post'] = [t.replace('- < > -', ' ').replace('|', ' ').replace('*-', ' ') 
              for t in df.post]
df['post'] = [t.replace('...', ' ').replace('\t', ' ').lower().strip() 
              for t in df.post]

In [11]:
stemmer = PorterStemmer()
df['stemmed_post'] = [stemmer.stem(w) for w in df['post']]

lemmatizer = WordNetLemmatizer()
df['stemmed_post'] = [lemmatizer.lemmatize(w) for w in df['post']]

In [12]:
df.to_csv('../datasets/cfb_cbball', index = False)

## EDA