# Exercise 2. Text Preprocessing

In [23]:
# Mount Google Drive under /content/drive; the preprocessed corpora are pickled
# to a folder below this mount point at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In this exercise, we will be using the 20-Newsgroups dataset. This version of the dataset contains about 11k newsgroup posts from 20 different topics.

We will learn how to apply the following steps:

1. Import and examine data
2. Remove initial text metadata with regex
3. Remove numbers, punctuation, tabs and convert to lower case with gensim
4. Stopwords and short words removal
5. Stemming and lemmatization

In [24]:
# Import packages
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
import pickle
import en_core_web_sm
import nltk
# Fetch NLTK's stopword corpus; stopwords.words('english') reads from it in the
# stopword comparison section further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords

# 1. Import and examine data

In [26]:
# Load the 20-Newsgroups posts from the hosted JSON file and preview the first rows
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(NEWSGROUPS_URL)
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [27]:
# Examine dataset: list the distinct topic labels, then count the posts per label
topic_labels = df['target_names']
print('Possible target values:')
print(topic_labels.unique())
print(' ')
print('Class distribution:')
print(topic_labels.value_counts())

Possible tagret values:
['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']
 
Class distribution:
rec.sport.hockey            600
soc.religion.christian      599
rec.motorcycles             598
rec.sport.baseball          597
sci.crypt                   595
sci.med                     594
rec.autos                   594
comp.windows.x              593
sci.space                   593
comp.os.ms-windows.misc     591
sci.electronics             591
comp.sys.ibm.pc.hardware    590
misc.forsale                585
comp.graphics               584
comp.sys.mac.hardware       578
talk.politics.mideast       564
talk.politics.guns          546
alt.atheism                 48

*The classes are almost uniformly distributed.*

In [28]:
# Show the raw text of the first post (row label 0), e-mail style headers included
print(df.loc[0, 'content'])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [29]:
# Dimensions of the frame as (rows, columns)
df.shape

(11314, 3)

*The data contains 11314 rows.*

# 2. Remove initial text metadata

In [30]:
# Remove unnecessary headers.
# Raw strings keep `\s` a regex escape instead of an invalid Python string escape,
# and the dots in "Article-I.D." are escaped so they only match literal dots.
# HEADER_LINE_RE deletes whole lines that start with one of the listed header
# fields, or that begin with (optional whitespace and) dashes, e.g. signature rulers.
HEADER_LINE_RE = re.compile(
    r'^(?:From:|Article-I\.D\.:|Organization:|Lines:|Nntp-Posting-Host:|Distribution:'
    r'|Reply-To:|X-Newsreader:|Expires:|\s*-+).*\n',
    flags=re.I | re.M,
)
# FIELD_LABEL_RE strips only the label itself (wherever it occurs) and keeps the
# text that follows it, so the subject/summary/keywords stay part of the post.
FIELD_LABEL_RE = re.compile(r'(?:Subject:|Summary:|Keywords:)', flags=re.I)

data = [FIELD_LABEL_RE.sub('', HEADER_LINE_RE.sub('', post)) for post in df.content]

print(data[0])

 WHAT car is this!?

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,







# 3. Remove numbers, punctuation, tabs and convert to lower case with gensim

In [31]:
# Drop all digits from every post (e.g. "2-door" becomes "-door")
data = list(map(strip_numeric, data))
print(data[0])

 WHAT car is this!?

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a -door sports car, looked to be from the late s/
early s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,







In [32]:
# Strip punctuation characters from every post
data = list(map(strip_punctuation, data))
print(data[0])

 WHAT car is this 

 I was wondering if anyone out there could enlighten me on this car I saw
the other day  It was a  door sports car  looked to be from the late s 
early s  It was called a Bricklin  The doors were really small  In addition 
the front bumper was separate from the rest of the body  This is 
all I know  If anyone can tellme a model name  engine specs  years
of production  where this car is made  history  or whatever info you
have on this funky looking car  please e mail 

Thanks 







In [33]:
# Collapse runs of whitespace (spaces, newlines) in every post
data = list(map(strip_multiple_whitespaces, data))
print(data[0])

 WHAT car is this I was wondering if anyone out there could enlighten me on this car I saw the other day It was a door sports car looked to be from the late s early s It was called a Bricklin The doors were really small In addition the front bumper was separate from the rest of the body This is all I know If anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please e mail Thanks 


In [34]:
# Lower-case every post
data = list(map(str.lower, data))
print(data[0])

 what car is this i was wondering if anyone out there could enlighten me on this car i saw the other day it was a door sports car looked to be from the late s early s it was called a bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all i know if anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please e mail thanks 


# 4. Stopwords and short words removal 

Here we will compare the stopwords in gensim and nltk.

#### Gensim

In [35]:
# gensim's built-in stopword collection, printed in alphabetical order
all_stopwords = STOPWORDS
gensim_sorted = sorted(all_stopwords)
print(gensim_sorted)

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found

#### NLTK

In [36]:
# NLTK's English stopword list as a set, printed in alphabetical order
english_words = stopwords.words('english')
nltk_stopwords = set(english_words)
print(sorted(nltk_stopwords))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [37]:
# Remove stopwords from every post using gensim's remove_stopwords
data = list(map(remove_stopwords, data))
print(data[0])

car wondering enlighten car saw day door sports car looked late s early s called bricklin doors small addition bumper separate rest body know tellme model engine specs years production car history info funky looking car e mail thanks


In [38]:
# Drop very short tokens left over from earlier steps (such as the stray "s" and "e")
data = list(map(strip_short, data))
print(data[0])

car wondering enlighten car saw day door sports car looked late early called bricklin doors small addition bumper separate rest body know tellme model engine specs years production car history info funky looking car mail thanks


# 5. Stemming and Lemmatization

In [39]:
# Stem every token of every post with gensim's stem_text
data_stem = list(map(stem_text, data))
print(data_stem[0])

car wonder enlighten car saw dai door sport car look late earli call bricklin door small addit bumper separ rest bodi know tellm model engin spec year product car histori info funki look car mail thank


In [40]:
# Initialize spaCy's small English pipeline
nlp = en_core_web_sm.load()
# Parse all posts with nlp.pipe, which processes the texts in batches instead of
# issuing one separate nlp() call per post; the parsed Docs are kept in a list.
data_lem_base = list(nlp.pipe(data))
# Replace every token by its lemma and join the lemmas back into one string per post
data_lem = [" ".join(token.lemma_ for token in doc) for doc in data_lem_base]
print(data_lem[0])

car wonder enlighten car see day door sport car look late early call bricklin door small addition bumper separate rest body know tellme model engine specs year production car history info funky looking car mail thank


# 6. Corpus storage

IMPORTANT: Store your preprocessed corpus so that you do not have to repeat these preprocessing steps over and over again.

In [41]:
# Persist the stemmed and lemmatized corpora to Google Drive.
# The `with` blocks close each file as soon as it is written, so the pickles are
# flushed to disk and no file handle is left open.
with open("/content/drive/MyDrive/TWSM_Data/Stemmed.pkl", "wb") as stem_file:
    pickle.dump(data_stem, stem_file)
with open("/content/drive/MyDrive/TWSM_Data/Lemma.pkl", "wb") as lemma_file:
    pickle.dump(data_lem, lemma_file)