In [1]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import pickle

# Preprocess data for classifier training
This notebook assigns class labels to the posts, splits them into training and testing sets, and saves the resulting data sets together with the stopword list.

Most of the data cleaning will be performed in the feature extraction step using sklearn. This includes making all words lowercase, removing punctuation, removing non-alphabetic characters (including numbers and emojis), and removing stopwords. That cleaning is part of the training/tuning phase rather than this notebook.

## Load data

In [2]:
# Endometriosis posts: read the four `*_sub_data` / `*_comment_data` CSV files,
# stack them, and discard any row whose `text` is missing.
endometriosis_sub_posts = pd.read_csv('../data/endometriosis_sub_data.csv')
endometriosis_com_posts = pd.read_csv('../data/endometriosis_comment_data.csv')
endo_sub_posts = pd.read_csv('../data/Endo_sub_data.csv')
endo_com_posts = pd.read_csv('../data/Endo_comment_data.csv')
endo_posts = (
    pd.concat([endometriosis_sub_posts, endometriosis_com_posts, endo_sub_posts, endo_com_posts])
    .dropna(subset=['text'])
    .reset_index(drop=True)  # rows are renumbered 0..n-1 after stacking/filtering
)

# PCOS posts: same treatment for the two PCOS CSV files.
pcos_sub_posts = pd.read_csv('../data/PCOS_sub_data.csv')
pcos_com_posts = pd.read_csv('../data/PCOS_comment_data.csv')
pcos_posts = (
    pd.concat([pcos_sub_posts, pcos_com_posts])
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

In [3]:
# Stopword set = NLTK's English stopwords + a hand-curated list of extra terms.
# NOTE(review): "i'm" and "i've" contain apostrophes; if the downstream
# vectorizer strips punctuation before stopword matching they may never
# match -- confirm against the feature extraction step.
more_stopwords = [
    'endo', 'endometriosis', 'pcos', 'polycystic',
    'also', 'one', 'time', 'even', 'symptom', 'symptoms', 'know',
    'like', 'think', 'though', 'really', 'would', 'still', 'going',
    'thing', 'doctor', 'get', "i'm", "i've", 'said',
    'want', 'told', 'could', 'thought', 'lot', 'that', 'since', 'say', 'thank',
]
# The union removes any duplicates between the two sources.
stop_words = set(stopwords.words('english')).union(more_stopwords)

In [4]:
# Report how many posts remain per condition after dropping rows without text.
n_endo_posts = len(endo_posts)
n_pcos_posts = len(pcos_posts)
print('There are %d endometriosis-related posts.' % n_endo_posts)
print('There are %d PCOS-related posts.' % n_pcos_posts)

There are 39464 endometriosis-related posts.
There are 37204 PCOS-related posts.


In [5]:
# Binary target: 1 = endometriosis post, 0 = PCOS post.
endo_posts = endo_posts.assign(label=1)
pcos_posts = pcos_posts.assign(label=0)

# Stack both classes into a single frame with a fresh 0..n-1 index.
all_posts = pd.concat([endo_posts, pcos_posts], ignore_index=True)

## Split data

In [6]:
# Hold out 30% of the posts for testing.
# random_state pins the shuffle so that Restart & Run All regenerates the same
# train/test files; without it every run produced a different split.
# stratify keeps the endometriosis/PCOS ratio (approximately) the same in both sets.
train_data, test_data = train_test_split(
    all_posts,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=all_posts['label'],
)

In [7]:
# Keep just the text and label columns and renumber the rows of each split from 0.
kept_columns = ['text', 'label']
train_data = train_data.loc[:, kept_columns].reset_index(drop=True)
test_data = test_data.loc[:, kept_columns].reset_index(drop=True)

## Save data sets and stopwords

In [8]:
# Persist both splits as pickled DataFrames.
train_data.to_pickle('../data/train_data.pkl')
test_data.to_pickle('../data/test_data.pkl')

# Persist the stopwords as a plain list. `stop_words` is a set of strings, whose
# iteration order can differ between interpreter runs (string hash randomization),
# so it is sorted first to make the saved file identical on every run.
with open('../data/stopwords.pkl', 'wb') as f:
    pickle.dump(sorted(stop_words), f)