# The data
The data set we’ll use is a list of over one million news headlines published over a period of 15 years and can be downloaded from [Kaggle](https://www.kaggle.com/therohk/million-headlines/data).

In [5]:
import pandas as pd
df = pd.read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
df

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1186013,20191231,vision of flames approaching corryong in victoria
1186014,20191231,wa police and government backflip on drug amne...
1186015,20191231,we have fears for their safety: victorian premier
1186016,20191231,when do the 20s start


# Data pre-processing
We will perform the following steps:
* Tokenization: split the text into sentences and the sentences into words. Lowercase the words and remove punctuation;
* Words that have fewer than 3 characters are removed;
* All stopwords are removed;
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present;
* Words are stemmed — words are reduced to their root form.

In [6]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk import download
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet')

import numpy as np
np.random.seed(59)

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

## Preview the preprocessing

In [11]:
doc_sample = df.iloc[4310, : ]
print(doc_sample)
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

publish_date                                              20030311
headline_text    ratepayers group wants compulsory local govt v...
Name: 4310, dtype: object
original document: 


AttributeError: 'Series' object has no attribute 'split'