# Text Preprocessing - Updated

## Import Libraries

In [10]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize

## Define and add Stopwords

In [11]:
# Use an aliased import so this cell can be re-run: `stopwords` is rebound to
# a plain list below, which would otherwise hide the corpus reader and make a
# second call to `.words(...)` fail.
from nltk.corpus import stopwords as _nltk_stopwords

# English stop words plus a few extra tokens ('p' and URL fragments) that
# should be dropped from the articles as well.
stopwords = _nltk_stopwords.words('english') + ['p', 'https', 'http', 'www', 'href']

## Define all functions for pre-processing

In [12]:
def text_preprocessing(data, min_word_freq=5):
    """Clean the HTML articles in ``data['Articles']`` into normalized words.

    Steps, in order: strip HTML markup, replace punctuation and
    digit-bearing tokens with spaces, collapse whitespace, lowercase, drop
    stop words, split words on underscores, drop words of one or two
    characters, drop words seen fewer than ``min_word_freq`` times across
    all articles, then stem (Lancaster) and lemmatize (WordNet) the rest.

    Args:
        data: DataFrame with an 'Articles' column of HTML strings. The
            column is overwritten in place with the cleaned text.
        min_word_freq: Minimum number of occurrences, counted over every
            article in ``data``, that a word needs in order to be kept.

    Returns:
        The same ``data`` object, with 'Articles' replaced by the cleaned,
        space-separated words.
    """
    from collections import Counter

    # Set membership instead of scanning the stop-word list for every token.
    stop_words = set(stopwords)

    def keep_words(text, predicate):
        # Re-join only the whitespace-separated words accepted by predicate.
        return ' '.join(word for word in text.split() if predicate(word))

    # Get only valid data: keep the visible text, discard the HTML tags.
    articles = data['Articles'].apply(
        lambda html: BeautifulSoup(html, 'html.parser').get_text()
    )

    # regex=True is required: since pandas 2.0, Series.str.replace treats the
    # pattern as a literal string by default, which would leave the text
    # untouched. Raw strings avoid invalid-escape warnings for \w, \s and \d.
    # Remove all punctuation.
    articles = articles.str.replace(r'[^\w\s]', ' ', regex=True)
    # Remove numbers and any word that contains a digit.
    articles = articles.str.replace(r'\w*\d\w*', ' ', regex=True)
    # Collapse newlines and runs of whitespace into a single space.
    articles = articles.str.replace(r'\s+', ' ', regex=True)

    # Lower all values.
    articles = articles.str.lower()

    # Remove stop words.
    articles = articles.apply(
        lambda text: keep_words(text, lambda word: word not in stop_words)
    )

    # Replace underscores with spaces (added after EDA); '_' counts as a word
    # character, so the punctuation step above does not remove it.
    articles = articles.str.replace('_', ' ', regex=False)

    # Remove junk words shorter than 3 characters.
    articles = articles.apply(
        lambda text: keep_words(text, lambda word: len(word) > 2)
    )

    # Remove words that occur fewer than min_word_freq times in the full data.
    word_counts = Counter(word for text in articles for word in text.split())
    articles = articles.apply(
        lambda text: keep_words(
            text, lambda word: word_counts[word] >= min_word_freq
        )
    )

    # Stem, then lemmatize, the remaining words.
    stemmer = LancasterStemmer()
    articles = articles.apply(
        lambda text: ' '.join(stemmer.stem(word) for word in text.split())
    )

    lemmatizer = WordNetLemmatizer()
    articles = articles.apply(
        lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    )

    # Write back once at the end so a failure midway leaves `data` unchanged.
    data['Articles'] = articles
    return data

## Filter the processed text by part-of-speech (POS) tags

In [13]:
# Work with POS.
# Tags to keep, as produced by nltk's pos_tag: nouns (NN*), adverbs (RB*),
# verbs (VB*) and adjectives (JJ*).
# Alternative (unused) exclusion list: ['CC', 'DT', 'EX', 'FW', 'UH', 'IN', 'TO']
include_tags = [
    'NN', 'NNS', 'NNP', 'NNPS',
    'RB', 'RBR', 'RBS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'JJ', 'JJR', 'JJS',
]


def remove_postags(list_words):
    """Return the words of ``list_words`` whose POS tag is in ``include_tags``.

    Args:
        list_words: Sequence of word tokens; it is tagged as a whole with
            ``pos_tag``.

    Returns:
        A new list with the retained words, in their original order.
    """
    return [word for word, tag in pos_tag(list_words) if tag in include_tags]

In [14]:
def pos_processing(data):
    """Build a new DataFrame whose article text keeps only selected POS tags.

    Each article is tokenized with ``nltk.word_tokenize`` and filtered
    through ``remove_postags``; tokens of 15 or more characters are then
    discarded and the rest re-joined with single spaces.

    Args:
        data: DataFrame whose first column holds the URL and whose second
            column holds the article text (columns are read by position).

    Returns:
        A new DataFrame with columns 'URL' and 'Articles' and a fresh
        0..n-1 index, one row per input row. ``data`` is not modified.
    """
    rows = []
    for url, paragraphs in zip(data.iloc[:, 0], data.iloc[:, 1]):
        list_words = remove_postags(nltk.word_tokenize(paragraphs))
        rows.append({
            'URL': url,
            'Articles': ' '.join(word for word in list_words if len(word) < 15),
        })

    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0, and it copied the whole frame on every call.
    return pd.DataFrame(rows, columns=['URL', 'Articles'])

## Define input and output file names

In [36]:
# Input CSV, and the two output CSVs: one for the cleaned text and one for
# the cleaned text after POS filtering.
raw_file_name = 'test_spam_raw.csv'
processed_file_name = 'test_spam_processed.csv'
processed_file_name_pos = 'test_spam_processed_pos.csv'

# Load the raw articles and report how many rows will be processed.
data = pd.read_csv(raw_file_name)
print(f'No of rows to be processed: {len(data)}')
data.head()

No of rows to be processed: 16


Unnamed: 0,URL,Articles
0,http://www.giveitlove.com/hilarious-things-you...,"[<p>The sun, sand, and water — the beach is a ..."
1,https://www.joinhoney.com/g-blog/the-secret-on...,[<p>Whether you shop online a lot or just occa...
2,http://www.pastfactory.com/history/incredible-...,[<p>Did you know that Elvis invited himself to...
3,https://www.tiebreaker.com/toughest-hockey-pla...,"[<p>by</p>, <p>Hockey is arguably the toughest..."
4,https://ca.cdn.hearing-aid-advice.com/signia_f...,"[<p class=""steps-cta__heading"">Here's how you ..."


## Process raw data

In [37]:
# Clean the raw articles. text_preprocessing overwrites data['Articles'] and
# returns the same DataFrame, so data_processed and data are one object.
data_processed = text_preprocessing(data)
# Build a second, separate DataFrame that keeps only the words whose POS tag
# is listed in include_tags.
data_processed_pos = pos_processing(data_processed)

In [38]:
# Character count (not word count) of the first cleaned article, followed by
# the article itself as the cell output.
print(f"Length b4 pos processing: {len(data_processed['Articles'][0])}")
data_processed['Articles'][0]

Length b4 pos processing: 199


'beach plac day good company beach lik peopl on ord beach without least on list thing peopl ahead see good beach sur bet find beach sur could get much ev least good sint look ev though lot saw nee com'

In [39]:
# Character count (not word count) of the first article after POS filtering,
# followed by the article itself as the cell output.
print(f"Length after pos processing: {len(data_processed_pos['Articles'][0])}")
data_processed_pos['Articles'][0]

Length after pos processing: 172


'beach plac day good company beach lik peopl ord beach least list thing peopl ahead see good beach sur bet find beach sur get much ev least good sint look ev lot saw nee com'

## Save processed data

In [40]:
# Write both result sets to CSV; index=False leaves the row index out of the
# output files.
data_processed.to_csv(processed_file_name, index=False)
data_processed_pos.to_csv(processed_file_name_pos, index=False)