In [1]:
import numpy as np
import pandas as pd
import itertools

from sklearn.datasets import fetch_20newsgroups

import spacy

In [2]:
# Load the English spaCy pipeline used for tokenising and lemmatising.
# The bare 'en' shortcut link was removed in spaCy v3, so load the packaged
# model by its full name first and only fall back to the legacy shortcut on
# older installs where just the link exists.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

In [3]:
# Prefix prepended to the name of every output file
output_prefix = '20newsgroups_pols_cleaned_nh'

# Newsgroup categories whose articles are fetched
cats = [
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [4]:
# Fetch the train and test subsets of the 20 newsgroups dataset from sklearn,
# restricted to the categories listed in `cats`
train, test = (
    fetch_20newsgroups(subset=subset_name, categories=cats)
    for subset_name in ('train', 'test')
)

In [5]:
# Put the article texts of each subset into a dataframe with a single 'text' column
df_train = pd.DataFrame({'text': train.data})
df_test = pd.DataFrame({'text': test.data})

In [None]:
# Split off the target data of each subset into its own dataframe
y_train, y_test = map(pd.DataFrame, (train.target, test.target))

In [7]:
# Print a summary of the training dataframe (entry count, columns, dtypes, memory usage)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1952 entries, 0 to 1951
Data columns (total 1 columns):
text    1952 non-null object
dtypes: object(1)
memory usage: 15.3+ KB


In [None]:
# Print a summary of the test dataframe
df_test.info()

In [None]:
# Count how many training articles carry each target value
y_train[0].value_counts()

In [None]:
# Count how many test articles carry each target value
y_test[0].value_counts()

In [9]:
# Define function to clean each article
def clean_text(block):
    """Return one article as a cleaned, space-separated string of lemmas.

    Everything before the first blank line (the article header) is dropped,
    the remainder is passed through the module-level ``nlp`` pipeline, and the
    lemma of every token that is neither a stop word nor punctuation is kept.
    The characters ``>``, ``<``, ``^``, ``|``, newline and tab are then
    deleted and runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    block : str
        Full text of one article, header included.

    Returns
    -------
    str
        The cleaned text.
    """
    # The header ends at the first blank line; keep the text from there on.
    # find() giving -1 (no blank line) or 0 (text already starts with one)
    # leaves the article as it is.
    header_end = block.find('\n\n')
    if header_end > 0:
        block = block[header_end:]

    # Lemmas of the tokens that pass the stop-word and punctuation filters
    lemmas = [
        token.lemma_
        for token in nlp(block)
        if not (token.is_stop or token.is_punct)
    ]

    # Delete leftover symbols that the punctuation filter did not catch
    unwanted = str.maketrans('', '', '><^\n\t|')
    joined = ' '.join(lemmas).translate(unwanted)

    # split() + join trims both ends and collapses repeated whitespace
    return ' '.join(joined.split())

In [12]:
# Clean every training article, reporting progress in 5 % steps.
# The cleaned strings are collected in a list and attached to the dataframe in
# one column assignment. The earlier per-row `df_train_cp.text[i] = ...` was
# chained assignment, which raises SettingWithCopyWarning and has no effect
# when pandas Copy-on-Write is enabled.
n_train = len(df_train)
cleaned_train = []
per = 5
for i, text in enumerate(df_train.text):
    cleaned_train.append(clean_text(text))

    # `>=` inside a loop (instead of a single `==` check) keeps the report
    # advancing even if a small dataset jumps past an exact 5 % step.
    while int(i*100/n_train) >= per:
        print(per, '% finished')
        per += 5
print('100 % finished')

# Copy of the training dataframe with the text column replaced by the cleaned text
df_train_cp = df_train.copy()
df_train_cp['text'] = cleaned_train

# Join target data back to cleaned dataframe and save
df_train_tot = df_train_cp.copy()
df_train_tot['target'] = y_train[0]

outfile_1 = output_prefix + '_train.csv'
df_train_tot.to_csv(outfile_1)

print('training data saved to', outfile_1)

5 % finished
10 % finished
15 % finished
20 % finished
25 % finished
30 % finished
35 % finished
40 % finished
45 % finished
50 % finished
55 % finished
60 % finished
65 % finished
70 % finished
75 % finished
80 % finished
85 % finished
90 % finished
95 % finished
100 % finished


In [13]:
# Clean every test article, reporting progress in 5 % steps.
# The cleaned strings are collected in a list and attached to the dataframe in
# one column assignment. The earlier per-row `df_test_cp.text[i] = ...` was
# chained assignment, which raises SettingWithCopyWarning and has no effect
# when pandas Copy-on-Write is enabled.
n_test = len(df_test)
cleaned_test = []
per = 5
for i, text in enumerate(df_test.text):
    cleaned_test.append(clean_text(text))

    # `>=` inside a loop (instead of a single `==` check) keeps the report
    # advancing even if a small dataset jumps past an exact 5 % step.
    while int(i*100/n_test) >= per:
        print(per, '% finished')
        per += 5
print('100 % finished')

# Copy of the test dataframe with the text column replaced by the cleaned text
df_test_cp = df_test.copy()
df_test_cp['text'] = cleaned_test

# Join target data back to cleaned dataframe and save
df_test_tot = df_test_cp.copy()
df_test_tot['target'] = y_test[0]

outfile_2 = output_prefix + '_test.csv'
df_test_tot.to_csv(outfile_2)

print('test data saved to', outfile_2)

5 % finished
10 % finished
15 % finished
20 % finished
25 % finished
30 % finished
35 % finished
40 % finished
45 % finished
50 % finished
55 % finished
60 % finished
65 % finished
70 % finished
75 % finished
80 % finished
85 % finished
90 % finished
95 % finished
100 % finished
