In [50]:
import pandas as pd

In [51]:
# Load the headlines CSV into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
data = pd.read_csv("abcnews-date-text.csv")
# Bare expression: the frame is rendered as this cell's output.
data

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1226253,20201231,what abc readers learned from 2020 looking bac...
1226254,20201231,what are the south african and uk variants of ...
1226255,20201231,what victorias coronavirus restrictions mean f...
1226256,20201231,whats life like as an american doctor during c...


In [52]:
# Keep only the headline column. The explicit .copy() makes `text_data` an
# independent frame, so adding a column below cannot raise pandas'
# SettingWithCopyWarning or be mistaken for a write into `data`.
text_data = data[["headline_text"]].copy()
# Store each row's index label as a regular column so rows can be looked up by it.
text_data["index"] = text_data.index

In [53]:
# Print the number of rows, then render the first five rows as the cell output.
print(data.shape[0])
data.head(5)

1226258


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Data preprocessing

1. Tokenization
2. Remove words with 3 or fewer characters
3. Remove stop words
4. Lemmatization
5. Stemming

In [61]:
import pandas as pd
import gensim

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS 
from nltk import PorterStemmer, word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [62]:
# Fetch the WordNet corpus, which WordNetLemmatizer looks words up in.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/angel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [68]:
# Fetch the Punkt tokenizer models used by nltk's word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/angel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [70]:
# Fetch the Open Multilingual WordNet data.
# NOTE(review): presumably required by the installed NLTK's WordNet reader -
# confirm against the NLTK version in use.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/angel/nltk_data...


True

In [76]:
# Shared lemmatizer/stemmer pair, filled on first use. Building new instances
# for every token is wasted work when preprocessing the whole corpus.
_TOKEN_NORMALIZERS = []


def lematize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos="v"``) with WordNet and the
    resulting lemma is then stemmed with the Porter stemmer.

    Args:
        text: One token (a single word), not a whole sentence.

    Returns:
        The stemmed lemma as a string.
    """
    if not _TOKEN_NORMALIZERS:
        # Created lazily so defining this function never touches NLTK data.
        _TOKEN_NORMALIZERS.extend((WordNetLemmatizer(), PorterStemmer()))
    lemmatizer, stemmer = _TOKEN_NORMALIZERS
    lem_verb = lemmatizer.lemmatize(text, pos="v")
    return stemmer.stem(lem_verb)
def preprocess(doc):
    """Turn a raw document string into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    stop words or have three or fewer characters are dropped, and every
    remaining token is passed through ``lematize_stemming``.

    Args:
        doc: The raw text of one document.

    Returns:
        A list of normalized tokens, in their original order.
    """
    return [
        lematize_stemming(word)
        for word in simple_preprocess(doc)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [64]:
# Show only the first row of the raw frame.
data.head(1)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...


In [65]:
# Pick the headline whose stored "index" value is 1. The column is selected by
# name, so the lookup does not depend on column order and does not convert the
# whole row to a mixed-type array first.
doc_sample = text_data.loc[text_data["index"] == 1, "headline_text"].iloc[0]
doc_sample

'act fire witnesses must be aware of defamation'

In [69]:
# Plain NLTK tokenization of the sample headline, shown for comparison with
# the output of preprocess() on the same text.
word_tokenize(doc_sample)

['act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation']

In [77]:
# Run the preprocessing pipeline on the sample headline: stop words and short
# tokens are dropped, and the remaining tokens are lemmatized and stemmed.
preprocess(doc_sample)

['wit', 'awar', 'defam']

In [78]:
# Run preprocess() on every headline. The result is a Series holding one token
# list per headline; the first ten are shown as the cell output.
processed_doc = data["headline_text"].apply(preprocess)
processed_doc.head(10)

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

# Bag of words on dataset

In [79]:
# Build the token <-> integer id mapping from the preprocessed headlines.
dictionary = gensim.corpora.Dictionary(processed_doc)

# Print the first few (id, token) pairs as a sanity check. items() is the
# standard mapping API; iteritems() is only a Python 2 style alias for it.
preview_size = 11
for position, (k, v) in enumerate(dictionary.items()):
    if position >= preview_size:
        break
    print("key: ", k, ", Value: ", v)

key:  0 , Value:  broadcast
key:  1 , Value:  commun
key:  2 , Value:  decid
key:  3 , Value:  licenc
key:  4 , Value:  awar
key:  5 , Value:  defam
key:  6 , Value:  wit
key:  7 , Value:  call
key:  8 , Value:  infrastructur
key:  9 , Value:  protect
key:  10 , Value:  summit


In [82]:
# Number of distinct tokens currently in the dictionary.
# NOTE(review): this cell's execution count (82) is later than the
# filter_extremes cell's (80), so the displayed value was probably computed
# after filtering - re-run the notebook in order to confirm.
len(dictionary)

15281


### Filter out tokens that appear in

1. fewer than 15 documents
2. more than 50% of the documents (a fraction of 0.5 of the corpus)
3. then keep only the 100000 most frequent remaining tokens

In [80]:
# Prune the vocabulary; nothing is assigned, so `dictionary` is modified in place:
#   no_below=15   - drop tokens that occur in fewer than 15 documents
#   no_above=0.5  - drop tokens that occur in more than half of all documents
#   keep_n=100000 - of what remains, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15,no_above=0.5,keep_n=100000)

In [81]:
# Vocabulary size after the filter_extremes call.
len(dictionary)

15281

In [85]:
# Convert each token list into its bag-of-words form, a list of
# (token_id, count) pairs. The loop variable is called `tokens` so it is not
# confused with the `data` DataFrame.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_doc]
# Bag-of-words of the second headline as a quick check.
bow_corpus[1]

[(4, 1), (5, 1), (6, 1)]