<a href="https://colab.research.google.com/github/aniqurahman/FakeNewsMS/blob/main/FakeNews_TopicExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk import ne_chunk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag

import re
import numpy as np
import pandas as pd

# Download the NLTK data packages this notebook relies on.
# Tokenizer models; RegexpTokenizer (the tokenizer used in the next cell) is
# regex-based, so this is presumably only needed by other cells — TODO confirm.
nltk.download('punkt')
# Stop-word lists, read via stopwords.words('english') in the preprocessing cell.
nltk.download('stopwords')
# POS tagger model; presumably what nltk.pos_tag loads — verify against the NLTK version in use.
nltk.download('averaged_perceptron_tagger')
# Named-entity chunker model; presumably for ne_chunk, which is imported but only
# referenced in commented-out code in the preprocessing cell.
nltk.download('maxent_ne_chunker')
# Word-list corpus; presumably a dependency of the NE chunker — TODO confirm.
nltk.download('words')
# WordNet corpus; presumably backs WordNetLemmatizer and nltk.corpus.wordnet.
# As the last expression in the cell, this call's return value is what the notebook displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
## Preprocessing stage
# Step-1: Tokenization
orignal_text = "ISLAMABAD: Pakistan is likely to remain on the so-called grey list of the Financial Action Task Force (FATF) for another four months — i.e. until June — for a couple of unmet targets under the additional criteria. The concluding session of the plenary meeting of the FATF, a Paris-based global money laundering and terrorist financing watchdog, is due on Friday (today) and includes Pakistan’s review on the agenda. Pakistan is now targeting the full completion of the 2021 action plan on anti-money laundering and combating terror financing (AML/CFT) by the end of January 2023. Pakistan has been on the grey list for deficiencies in its counter-terror financing and anti-money laundering regimes since June 2018."

print("Oringal Text: \t\t",orignal_text)
# r'\w+' keeps only runs of word characters, so punctuation is dropped and
# hyphenated or dotted forms are split (e.g. "so-called" -> "so", "called").
tokenizer = RegexpTokenizer(r'\w+')
tokenized_text = tokenizer.tokenize(orignal_text)
print("Tokenized & RE Removal: ", tokenized_text)

# Step-2: Stop word removal
# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

# Compare the lower-cased token against the stop-word set: the NLTK English
# list is lower-case, so a case-sensitive check would let capitalised stop
# words such as "The" slip through. The original token casing is preserved
# in the output. (Previously this case-insensitive result was computed and
# then immediately overwritten by a case-sensitive loop.)
stop_filtered_sentence = [w for w in tokenized_text if w.lower() not in stop_words]

print("Stop Words Removed: \t",stop_filtered_sentence)

# Step-3: Stemming
# Reduce every token that survived stop-word removal to its Porter stem.
stemmer = PorterStemmer()
stemmed_sentence = [stemmer.stem(token) for token in stop_filtered_sentence]
print("Stemmed: \t\t",stemmed_sentence)

# Step-4: Lemmatization
# This lemmatizer instance is shared by both lemmatization options.
lemmatizer = WordNetLemmatizer()

# Step-4, Option 1: simplified lemmatization, with less accuracy.
# Every stem is lemmatized as an adjective (pos="a") regardless of its real
# part of speech; passing pos="v" instead would treat every stem as a verb.
lemmatized_sentence = [
    lemmatizer.lemmatize(stem, pos="a")
    for stem in stemmed_sentence
]
print("Lematized, Option1: \t\t",lemmatized_sentence,"\n")


#Step-4: Option 2: POS based Lemmatization, with better accuracy

# Define a function that maps an NLTK POS tag to the matching WordNet POS constant
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.

    Args:
        nltk_tag: POS tag string as produced by nltk.pos_tag.

    Returns:
        The corresponding wordnet POS constant, or None when the tag does
        not start with one of the four handled letters.
    """
    # (tag prefix, attribute name on `wordnet`), checked in this order.
    prefix_to_wordnet_attr = (
        ('J', 'ADJ'),     # adjectives
        ('V', 'VERB'),    # verbs
        ('N', 'NOUN'),    # nouns
        ('R', 'ADV'),     # adverbs
    )
    for prefix, attr_name in prefix_to_wordnet_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    # Every other part of speech is excluded.
    return None


# Find the POS tag for each token. The un-stemmed tokens are tagged here
# because, per the original note, results are better when stemming is skipped.
pos_tagged_sentence = nltk.pos_tag(stop_filtered_sentence)

# Swap each NLTK tag for its WordNet POS constant via pos_tagger
# (None marks a tag that pos_tagger does not handle).
wordnet_tagged = [
    (token, pos_tagger(nltk_tag))
    for token, nltk_tag in pos_tagged_sentence
]

# Lemmatize only the tokens that received a WordNet POS; the rest are dropped.
pos_lemmatized_sentence = [
    lemmatizer.lemmatize(token, wn_pos)
    for token, wn_pos in wordnet_tagged
    if wn_pos is not None
]

print("Lematized, Option 2: \t\t",pos_lemmatized_sentence)

# Step-4 (extra): Stemming after lemmatization
# Alternate approach: run the Porter stemmer over the POS-lemmatized tokens.
stemmer = PorterStemmer()
stemmed_after_lematization = list(map(stemmer.stem, pos_lemmatized_sentence))
print("Stemmed on Lemzatized (Alternate Approach): \t",stemmed_after_lematization)

#Step-5: Named entity recognition
# pos_sentence = nltk.pos_tag(lemmatized_sentence)
# Named entity recognition is left as an exercise for the reader
# named_entities = ne_chunk(pos_sentence)

Oringal Text: 		 ISLAMABAD: Pakistan is likely to remain on the so-called grey list of the Financial Action Task Force (FATF) for another four months — i.e. until June — for a couple of unmet targets under the additional criteria. The concluding session of the plenary meeting of the FATF, a Paris-based global money laundering and terrorist financing watchdog, is due on Friday (today) and includes Pakistan’s review on the agenda. Pakistan is now targeting the full completion of the 2021 action plan on anti-money laundering and combating terror financing (AML/CFT) by the end of January 2023. Pakistan has been on the grey list for deficiencies in its counter-terror financing and anti-money laundering regimes since June 2018.
Tokenized & RE Removal:  ['ISLAMABAD', 'Pakistan', 'is', 'likely', 'to', 'remain', 'on', 'the', 'so', 'called', 'grey', 'list', 'of', 'the', 'Financial', 'Action', 'Task', 'Force', 'FATF', 'for', 'another', 'four', 'months', 'i', 'e', 'until', 'June', 'for', 'a', 'cou