In [None]:
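# Topics covered: sentence tokenization, stemming, lemmatization, text clean-up,
# stop words, bag of words (binary / n-grams), and TF-IDF.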

In [4]:
# Imports
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# regular expression library can be used to clear text of special characters
import re
# need this for bag of words
from sklearn.feature_extraction.text import CountVectorizer
# need this for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Sample text used by every cell below: a two-line biographical excerpt that still
# contains bracketed footnote markers (e.g. [a], [4]), digits and punctuation.
paragraph="""Narendra Damodardas Modi[a] (born 17 September 1950)[b] is an Indian politician who has served as the prime minister of India since 2014. Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister outside the Indian National Congress.[4]. Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at the age of eight. 
At the age of 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so. Modi became a full-time worker for the RSS in Gujarat in 1971. The RSS assigned him to the BJP in 1985 and he rose through the party hierarchy, becoming general secretary in 1998.[c] In 2001, Modi was appointed chief minister of Gujarat and elected to the legislative assembly soon after. His administration is considered complicit in the 2002 Gujarat riots,[d] and has been criticised for its management of the crisis. According to official records, a little over 1,000 people were killed, three-quarters of whom were Muslim; independent sources estimated 2,000 deaths, mostly Muslim.[13] A Special Investigation Team appointed by the Supreme Court of India in 2012 found no evidence to initiate prosecution proceedings against him.[e] While his policies as chief minister were credited for encouraging economic growth, his administration was criticised for failing to significantly improve health, poverty and education indices in the state.[f]"""

In [6]:
# Echo the raw text so the tokenized / cleaned versions below can be compared against it.
print(paragraph)

Narendra Damodardas Modi[a] (born 17 September 1950)[b] is an Indian politician who has served as the prime minister of India since 2014. Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi. He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation. He is the longest-serving prime minister outside the Indian National Congress.[4]. Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education. He was introduced to the RSS at the age of eight. 
At the age of 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so. Modi became a full-time worker for the RSS in Gujarat in 1971. The RSS assigned him to the BJP in 1985 and he rose through the party hierarchy, becoming general secretary in 1998.[c] 

In [10]:
# Tokenization
# Tokenization breaks text into smaller units such as sentences or words.
# This cell performs sentence tokenization only.

# sent_tokenize needs the 'punkt_tab' resource. nltk.download() checks whether the
# resource is already present and reports it as up-to-date instead of fetching it again.
nltk.download("punkt_tab")

# Sentence-tokenized copy of the paragraph (a list with one string per sentence).
sentences = nltk.sent_tokenize(paragraph, language="english")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Hasaan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Show the container type first, then the tokenized sentences on the following line.
print(type(sentences), sentences, sep="\n")

<class 'list'>
['Narendra Damodardas Modi[a] (born 17 September 1950)[b] is an Indian politician who has served as the prime minister of India since 2014.', 'Modi was the chief minister of Gujarat from 2001 to 2014 and is the member of parliament (MP) for Varanasi.', 'He is a member of the Bharatiya Janata Party (BJP) and of the Rashtriya Swayamsevak Sangh (RSS), a right-wing Hindu nationalist paramilitary volunteer organisation.', 'He is the longest-serving prime minister outside the Indian National Congress.[4].', 'Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education.', 'He was introduced to the RSS at the age of eight.', 'At the age of 18, he was married to Jashodaben Modi, whom he abandoned soon after, only publicly acknowledging her four decades later when legally required to do so.', 'Modi became a full-time worker for the RSS in Gujarat in 1971.', 'The RSS assigned him to the BJP in 1985 and he rose through the party hierarchy,

In [14]:
# Stemming
# Cuts a word down to a base stem, whether or not that stem is still a
# meaningful word.

# Porter stemmer instance shared by the cells that follow
stemmer = PorterStemmer()

In [16]:
# Stem a few sample words. Some of the stems printed ("histor", "goe") are not
# real words, which is the main limitation of stemming.
for sample in ("going", "facial", "thinking", "drinking", "historical", "goes"):
    print(stemmer.stem(sample))

go
facial
think
drink
histor
goe


In [18]:
# Lemmatization
# Reduces a word to a MEANINGFUL base form (a real dictionary word).

# WordNetLemmatizer needs the 'wordnet' corpus.
nltk.download("wordnet")

# Lemmatizer instance shared by the cells that follow
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hasaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Lemmatize the same sample words used in the stemming demo.
# NOTE(review): lemmatize() is called with the word only, no part-of-speech
# argument. That is presumably why verb forms such as "going" and "thinking"
# come back unchanged; passing pos="v" should reduce them. Confirm against the
# NLTK documentation.
for sample in ("going", "facial", "thinking", "drinking", "historical", "goes"):
    print(lemmatizer.lemmatize(sample))

going
facial
thinking
drinking
historical
go


In [26]:
# Clean up
# Remove special characters from every sentence and normalise the case.

print(len(sentences))

# '[^a-zA-Z]' matches any single character that is NOT a-z or A-Z (the leading ^
# negates the character class). re.sub replaces every such match with one space,
# so digits and punctuation turn into blanks and runs of spaces are left as-is.
# The result is then lower-cased. `corpus` holds one cleaned string per sentence.
corpus = [re.sub('[^a-zA-Z]', ' ', sentence).lower() for sentence in sentences]

print(corpus)

15
['narendra damodardas modi a   born    september       b  is an indian politician who has served as the prime minister of india since      ', 'modi was the chief minister of gujarat from      to      and is the member of parliament  mp  for varanasi ', 'he is a member of the bharatiya janata party  bjp  and of the rashtriya swayamsevak sangh  rss   a right wing hindu nationalist paramilitary volunteer organisation ', 'he is the longest serving prime minister outside the indian national congress     ', 'modi was born and raised in vadnagar in northeastern gujarat  where he completed his secondary education ', 'he was introduced to the rss at the age of eight ', 'at the age of     he was married to jashodaben modi  whom he abandoned soon after  only publicly acknowledging her four decades later when legally required to do so ', 'modi became a full time worker for the rss in gujarat in      ', 'the rss assigned him to the bjp in      and he rose through the party hierarchy  becoming ge

In [28]:
# Stop words

# The stop-word lists are a separate NLTK corpus. Fetch it before first use so
# this cell also runs on a fresh environment; when the corpus is already
# installed, nltk.download() just reports it as up-to-date.
nltk.download('stopwords')

# View the English stop words
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [30]:
# Apply preprocessing to our data

nltk.download('stopwords')

# Build the stop-word set once, outside the loops. Rebuilding it for every word
# repeats the same work and gives the same result.
english_stopwords = set(stopwords.words('english'))

# Pipeline: word tokenization (sentence => words) -> stop-word removal ->
# lemmatization (or stemming). Each element of `corpus` is one cleaned sentence.
for sentence in corpus:
    words = nltk.word_tokenize(sentence)
    for word in words:
        # Keep only non-stop-words and print their lemma.
        if word not in english_stopwords:
            # Use stemmer.stem(word) here instead to see stemming rather than lemmatization.
            print(lemmatizer.lemmatize(word))


narendra
damodardas
modi
born
september
b
indian
politician
served
prime
minister
india
since
modi
chief
minister
gujarat
member
parliament
mp
varanasi
member
bharatiya
janata
party
bjp
rashtriya
swayamsevak
sangh
rss
right
wing
hindu
nationalist
paramilitary
volunteer
organisation
longest
serving
prime
minister
outside
indian
national
congress
modi
born
raised
vadnagar
northeastern
gujarat
completed
secondary
education
introduced
rss
age
eight
age
married
jashodaben
modi
abandoned
soon
publicly
acknowledging
four
decade
later
legally
required
modi
became
full
time
worker
rss
gujarat
rss
assigned
bjp
rose
party
hierarchy
becoming
general
secretary
c
modi
appointed
chief
minister
gujarat
elected
legislative
assembly
soon
administration
considered
complicit
gujarat
riot
criticised
management
crisis
according
official
record
little
people
killed
three
quarter
muslim
independent
source
estimated
death
mostly
muslim
special
investigation
team
appointed
supreme
court
india
found
evidence
ini

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hasaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Apply preprocessing to our data and update corpus
# NOTE: everything below is wrapped in a triple-quoted string, so none of it runs;
# the cell only evaluates a string literal and `corpus` is left untouched.
# Remove the surrounding quotes to enable it. Doing so replaces `corpus` with the
# stop-word-filtered, lemmatized sentences, which changes what the vectorizer
# cells further down are fitted on.
"""
processed_corpus = []  # Create a new list to store the updated data
for i in corpus:
    words = nltk.word_tokenize(i)
    # list comprehension - concise way to create a list using a single line of code.
    # the general syntax for list comprehension is: [expression for item in iterable if condition]
    filtered_words = [ lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    # ' '.join() is a string method that joins the elements of a list into a single string, with a space (' ') acting as the separator.
    processed_corpus.append(' '.join(filtered_words))  # Combine filtered words back into a sentence

# Optionally, assign it back to the original corpus
corpus = processed_corpus
"""

In [32]:
# Bag of words
# CountVectorizer (sklearn.feature_extraction.text) converts a collection of text
# documents into a matrix of token counts, commonly used for feature extraction
# in machine learning models.

# Plain alternative: CountVectorizer() with no arguments.
# Configuration used here:
#   binary=True        -> binary bag of words (presence/absence instead of counts)
#   ngram_range=(3, 3) -> trigrams only; (2, 3) would give bigrams and trigrams
cv = CountVectorizer(binary=True, ngram_range=(3, 3))

X = cv.fit_transform(corpus)

# vocabulary_ maps each n-gram to its index (feature number), not its frequency
print(cv.vocabulary_)

# Bag-of-words row for sentence 2 (index 1). With binary=True the entries are only
# 1s and 0s; without it they can be 2, 3, 4, ...
print(X[1].toarray())


{'narendra damodardas modi': 130, 'damodardas modi born': 41, 'modi born september': 123, 'born september is': 30, 'september is an': 174, 'is an indian': 100, 'an indian politician': 8, 'indian politician who': 95, 'politician who has': 154, 'who has served': 225, 'has served as': 66, 'served as the': 175, 'as the prime': 19, 'the prime minister': 194, 'prime minister of': 156, 'minister of india': 119, 'of india since': 138, 'modi was the': 126, 'was the chief': 218, 'the chief minister': 187, 'chief minister of': 32, 'minister of gujarat': 118, 'of gujarat from': 135, 'gujarat from to': 62, 'from to and': 57, 'to and is': 203, 'and is the': 13, 'is the member': 104, 'the member of': 192, 'member of parliament': 116, 'of parliament mp': 139, 'parliament mp for': 149, 'mp for varanasi': 128, 'he is member': 69, 'is member of': 102, 'member of the': 117, 'of the bharatiya': 140, 'the bharatiya janata': 185, 'bharatiya janata party': 26, 'janata party bjp': 106, 'party bjp and': 150, 'b

In [34]:
# TF-IDF

# Simpler alternatives to the configuration below:
#   TfidfVectorizer()                     -> default settings
#   TfidfVectorizer(ngram_range=(2, 3))   -> bigrams and trigrams
# max_features=3 additionally limits the output to three features (columns).
cv = TfidfVectorizer(ngram_range=(2, 3), max_features=3)

X = cv.fit_transform(corpus)

print(corpus[0])

# TF-IDF row for sentence 1 (index 0)
print(X[0].toarray())

narendra damodardas modi a   born    september       b  is an indian politician who has served as the prime minister of india since      
[[0. 1. 0.]]
