In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import time
import warnings
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')

### Read in Data

In [2]:
nhl_df = pd.read_csv('data/posts_nhl_2.csv')
nba_df = pd.read_csv('data/posts_nba_2.csv')
print(nhl_df.shape)
print(nba_df.shape)

(3000, 86)
(3000, 82)


### Clean

- Remove all columns except 'title' and 'selftext'
- Drop any null rows
- Drop duplicate rows
- Create new combination column for easier analysis 
- Concatenate into a single dataframe

In [3]:
nhl_df.head()

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,...,gallery_data,is_gallery,media_metadata,poll_data,removed_by_category,link_flair_css_class,author_cakeday,edited,suggested_sort,banned_by
0,0,[],False,akromyk,,[],,text,t2_htlnd,False,...,,,,,,,,,,
1,1,[],False,akromyk,,[],,text,t2_htlnd,False,...,,,,,,,,,,
2,2,[],False,billpat-joe-dinosuar,,[],,text,t2_27n7a0ke,False,...,,,,,,,,,,
3,3,[],False,bronxi11,,[],,text,t2_42ohhult,False,...,,,,,,,,,,
4,4,[],False,Hollaback_Boy,,[],,text,t2_3el1cidt,False,...,,,,,,,,,,


In [3]:
nhl_df = nhl_df[['title', 'selftext']]
nhl_df['label'] = 1
nhl_df.head(3)

Unnamed: 0,title,selftext,label
0,What's this I've been hearing about turning NH...,Can someone fill me in on what's going on with...,1
1,ELI5: What's this about turning NHL jerseys in...,"I can't watch local games, I can't play the si...",1
2,Hockey player hits Russian Referee,,1


In [4]:
# Remove extra columns and create target column
nba_df = nba_df[['title', 'selftext']]
nba_df['label'] = 0
nba_df.head(3)

Unnamed: 0,title,selftext,label
0,OC: Rookies Adjusting To The NBA (easier for s...,[**Ra asked me**](https://mediumlights.substac...,0
1,One of the highest upvoted posts and greatest ...,,0
2,2020 Lakers were up at least 30 points in each...,#[Game 4 vs Portland](https://www.basketball-r...,0


In [5]:
#Concat data
df = pd.concat([nhl_df, nba_df]).reset_index()
df.drop(columns = 'index', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     6000 non-null   object
 1   selftext  2724 non-null   object
 2   label     6000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 140.8+ KB


In [6]:
#drop nulls
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2724 entries, 0 to 5998
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2724 non-null   object
 1   selftext  2724 non-null   object
 2   label     2724 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 85.1+ KB


In [7]:
#drop duplicates
df.drop_duplicates(inplace=True)

In [8]:
# Combine text to new column
df['combo'] = df['title'] + df['selftext']
df['combo'].head()

0     What's this I've been hearing about turning NH...
1     ELI5: What's this about turning NHL jerseys in...
4     Does ESPN+ Currently Have Last Seasons Games?T...
14    Do Little League and kids Hickey teams use NHL...
17    About to be a New Yorker and want to get into ...
Name: combo, dtype: object

### Stemming and lemmatizing data

Creating a stemmed and lemmatized version of the new "combo" data will allow us to test how the two compare in the Random Forest model.

In [9]:
# Create Lemmatized column
lemmatizer = WordNetLemmatizer()
df['combo_lem'] = [lemmatizer.lemmatize(i) for i in df['combo']]

# Create Stemmed column
stemmer = PorterStemmer()
df['combo_stem'] = [stemmer.stem(i) for i in df['combo']]
df.head()

Unnamed: 0,title,selftext,label,combo,combo_lem,combo_stem
0,What's this I've been hearing about turning NH...,Can someone fill me in on what's going on with...,1,What's this I've been hearing about turning NH...,What's this I've been hearing about turning NH...,what's this i've been hearing about turning nh...
1,ELI5: What's this about turning NHL jerseys in...,"I can't watch local games, I can't play the si...",1,ELI5: What's this about turning NHL jerseys in...,ELI5: What's this about turning NHL jerseys in...,eli5: what's this about turning nhl jerseys in...
4,Does ESPN+ Currently Have Last Seasons Games?,"Trying to fill the hole in my life, but I don'...",1,Does ESPN+ Currently Have Last Seasons Games?T...,Does ESPN+ Currently Have Last Seasons Games?T...,does espn+ currently have last seasons games?t...
14,Do Little League and kids Hickey teams use NHL...,I used to play Hockey in the Little Leagues (P...,1,Do Little League and kids Hickey teams use NHL...,Do Little League and kids Hickey teams use NHL...,do little league and kids hickey teams use nhl...
17,About to be a New Yorker and want to get into ...,About to be a New Yorker and want to get into ...,1,About to be a New Yorker and want to get into ...,About to be a New Yorker and want to get into ...,about to be a new yorker and want to get into ...


### Export data

In [10]:
# Export to csv
df.to_csv('data/nhl_nba_df.csv')