### M2 Normalisation Exercise

In [None]:
import re
import string

import nltk
nltk.data.path.append("../pre_course/nltk_data")
import pandas as pd
import stanza
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#### Exercises
<br>

<ol>
  <li>Import the Hep Dataset and perform the following preprocessing steps to the "Text" column.</li>
    
        Lowercasing
        Remove Punctuation
        Tokenize
        Lemmatization
        Stemming
        Stopword Removal
        Remove Numbers
        Remove the words having length less than 2
        Tokenise, sentence splitting and lemmatisation using Stanza
 
</ol>

Guidelines: 

* Change the "Text" column from a list to a string before undertaking pre-processing.  <br>
* Perform the preprocessing steps in the same way as done to the patent dataset abstract column. <br>
* Once punctuation removal, tokenisation, lemmatisation and stemming have been undertaken, put the results in new columns in the df. <br> 
* Apply lemmatisation and stemming on text that has been tokenised <br> 
* Make a copy of the df once loaded in using copy() <br>
* Use Stanza on limited data 


### Lowercasing

In [None]:
# Load the pickled Hep dataset.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
hep = pd.read_pickle("../data/Hep_Dataset.pkl")

In [None]:
# Keep a copy of the freshly loaded df (as asked in the guidelines) so that
# reassigning columns on `hep` below does not change `hep2`.
hep2 = hep.copy()

In [None]:
# Function to change text column to string
def change_to_string(ptext):
    """Join a sequence of text fragments into one space-separated string.

    Args:
        ptext: An iterable of strings (for example one row of the "Text"
            column), or a string that has already been joined.

    Returns:
        The fragments joined by single spaces. A ``str`` input is returned
        unchanged.
    """
    # Joining a str iterates it character by character ("abc" -> "a b c"),
    # which is what would happen if the conversion cell were run twice.
    if isinstance(ptext, str):
        return ptext
    return " ".join(ptext)

In [None]:
# Convert every row of 'Text' with change_to_string, overwriting the column.
hep['Text'] = hep['Text'].apply(change_to_string)

In [None]:
# Function to lowercase input data
def lowercase(ptext):
    """Return a lowercased copy of ``ptext``.

    Args:
        ptext: The string to convert.

    Returns:
        The result of ``ptext.lower()``.
    """
    lowered = ptext.lower()
    return lowered

In [None]:
hep.loc[3,'Text']

In [None]:
# Lowercase every row of the 'Text' column, overwriting it in place
hep['Text'] = hep['Text'].apply(lowercase)

In [None]:
hep.loc[3,'Text']

In [None]:
# Alternative to the apply(lowercase) cell above: the pandas .str accessor
# lowercases the whole column in one call.
hep['Text'] = hep['Text'].str.lower()

### Remove Punctuation

In [None]:
def remove_punct1(ptext):
    """Return ``ptext`` with every character of ``string.punctuation`` removed.

    Args:
        ptext: The string to clean.

    Returns:
        A copy of ``ptext`` without any punctuation characters; all other
        characters (letters, digits, whitespace) are kept as they are.
    """
    # A translation table mapping each punctuation character to None deletes
    # all of them in a single pass over the string.
    strip_table = str.maketrans("", "", string.punctuation)
    return ptext.translate(strip_table)

In [None]:
hep.loc[0, 'Text']

In [None]:
# Store the punctuation-free text in a new column; 'Text' itself is unchanged.
# NOTE(review): the tokenisation cells further down read 'Text', not this
# column -- confirm that is intended.
hep['Text_Remove_Punct'] = hep['Text'].apply(remove_punct1)

In [None]:
hep.loc[3, 'Text_Remove_Punct']

### Tokenisation

In [None]:
def tokenisation(ptext):
    """Split ``ptext`` into word-level tokens using ``nltk.word_tokenize``.

    Args:
        ptext: The text to tokenise.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for ``ptext``.
    """
    word_tokens = nltk.word_tokenize(ptext)
    return word_tokens

In [None]:
# Word-tokenise the 'Text' column and store the result in a new 'text_tokens' column
hep['text_tokens'] = hep['Text'].apply(tokenisation)

In [None]:
hep.loc[3,'text_tokens']

### Sentence Tokenisation

In [None]:
# Function to tokenize each string
def tokens_sentences(ptext):
    """Split ``ptext`` into sentences using ``nltk.sent_tokenize``.

    Args:
        ptext: The text to split.

    Returns:
        The sentences produced by ``nltk.sent_tokenize`` for ``ptext``.
    """
    sentences = nltk.sent_tokenize(ptext)
    return sentences

In [None]:
# Split the 'Text' column into sentences and store them in a new 'text_sentences' column
hep['text_sentences'] = hep['Text'].apply(tokens_sentences)

In [None]:
hep.loc[3,'Text']

In [None]:
hep.loc[3,'text_sentences']

### Lemmatisation

In [None]:
# Define the lemmatize() function

def lemmatise(ptokens):
    """Lemmatise every token in ``ptokens`` with NLTK's ``WordNetLemmatizer``.

    Args:
        ptokens: Iterable of string tokens.

    Returns:
        A new list holding the lemma of each token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    # lemmatize() is called with the token only, i.e. with its default
    # part-of-speech setting.
    return list(map(wordnet.lemmatize, ptokens))

In [None]:
hep.loc[7,'text_tokens']

In [None]:
# Lemmatise the tokens of every row into a new 'tokens_lemmatised' column.
hep['tokens_lemmatised'] = hep['text_tokens'].apply(lemmatise)

In [None]:
hep.loc[7,'tokens_lemmatised']

### Stemming

In [None]:
# Define stemming function

def stemming(ptoken):
    """Stem every token in ``ptoken`` with NLTK's ``PorterStemmer``.

    Args:
        ptoken: Iterable of string tokens.

    Returns:
        A new list holding the stem of each token, in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, ptoken))

In [None]:
hep.loc[7,'text_tokens']

In [None]:
# Stem the tokens of every row into a new 'tokens_stemmed' column.
hep['tokens_stemmed'] = hep['text_tokens'].apply(stemming)

In [None]:
hep.loc[7,'tokens_stemmed']

### Stopword Removal

In [None]:
# Define a function to remove stopwords from list of tokens

def clean_stopwords(tokens):
    """Remove English stopwords from a list of tokens.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with every token that is not in NLTK's English stopword
        list, in the original order. The membership test is an exact
        (case-sensitive) match.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for candidate in tokens:
        if candidate in english_stopwords:
            continue
        kept.append(candidate)
    return kept

In [None]:
hep.loc[7,'text_tokens']

In [None]:
# Drop stopwords. This overwrites 'text_tokens' instead of adding a column,
# so when cells run top to bottom the earlier 'tokens_lemmatised' and
# 'tokens_stemmed' columns still contain stopwords.
hep['text_tokens']=hep['text_tokens'].apply(clean_stopwords)

In [None]:
hep.loc[7,'text_tokens']

### Remove Numbers

In [None]:
def remove_num(ptokens):
    """Keep only the tokens that consist entirely of letters.

    Args:
        ptokens: Iterable of string tokens.

    Returns:
        A new list with the tokens for which ``token.isalpha()`` is true, in
        the original order. Tokens containing digits, punctuation or
        whitespace, and empty tokens, are dropped.
    """
    alphabetic = []
    for candidate in ptokens:
        if candidate.isalpha():
            alphabetic.append(candidate)
    return alphabetic

In [None]:
#or

# function to remove numbers
def remove_num2(text):
    """Strip digits and other unwanted symbols from a string.

    Everything except ASCII letters, whitespace and the punctuation marks
    ``. , ! ? / : ; " '`` is deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every character outside the keep-set removed.
    """
    # Negated character class: lists the characters to KEEP. The upper-case
    # range must be A-Z; "A-z" would also keep the ASCII characters that sit
    # between 'Z' and 'a' ([, \, ], ^, _ and the backtick).
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)


In [None]:
hep.loc[747,'text_tokens']

In [None]:
# Keep only purely alphabetic tokens; overwrites 'text_tokens'.
hep['text_tokens'] = hep['text_tokens'].apply(remove_num)

In [None]:
hep.loc[747,'text_tokens']

### Remove the words having length less than 2

In [None]:
def remove_short_tokens(ptokens, min_length=3):
    """Drop tokens that are shorter than ``min_length`` characters.

    Args:
        ptokens: Iterable of string tokens.
        min_length: Smallest token length to keep. The default of 3 removes
            tokens of one or two characters; pass 2 to remove only tokens
            shorter than two characters.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [token for token in ptokens if len(token) >= min_length]

In [None]:
# Filter short tokens out of 'text_tokens' with remove_short_tokens; overwrites the column.
hep['text_tokens'] = hep['text_tokens'].apply(remove_short_tokens)

In [None]:
hep.loc[707,'text_tokens']

### Tokenise, sentence splitting and lemmatisation using Stanza

In [None]:

# Fetch the English Stanza models, then build a pipeline from them.
stanza.download('en') # download English model
nlp = stanza.Pipeline('en') # initialize English neural pipeline (no processors argument is passed)


In [None]:
doc = nlp(hep['Text'][4]) # run annotation over entry 4 of the 'Text' column

In [None]:
print(doc)

In [None]:
# Tokenisation and sentence splitting

In [None]:
# Rebuild the pipeline with only the 'tokenize' processor and run it on
# entry 5 of the 'Text' column. This rebinds the `nlp` and `doc` names.
nlp = stanza.Pipeline(lang='en', processors='tokenize')
doc = nlp(hep['Text'][5])

# For each sentence in the document, print a numbered header followed by the
# id and text of each of its tokens, one token per line.
for index, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {index+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')

In [None]:
# Lemmatisation

In [None]:
# Rebuild the pipeline with the tokenize, mwt, pos and lemma processors and
# run it on entry 53 of the 'Text' column. This rebinds `nlp` and `doc`.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp(hep['Text'][53])
# Print every word of every sentence next to its lemma, one word per line.
print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in doc.sentences for word in sent.words], sep='\n')