In [1]:
#Sample data 1
paragraph =  """Thank you all so very much. Thank you to the Academy. 
               Thank you to all of you in this room. I have to congratulate 
               the other incredible nominees this year. The Revenant was 
               the product of the tireless efforts of an unbelievable cast
               and crew. First off, to my brother in this endeavor, Mr. Tom 
               Hardy. Tom, your talent on screen can only be surpassed by 
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience. Thank you to everybody at 
               Fox and New Regency … my entire team. I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you. And to my 
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

In [2]:
# Sample data 2
text = """Global warming or climate change has become a worldwide concern. It is gradually developing into an unprecedented environmental crisis evident in melting glaciers, changing weather patterns, rising sea levels, floods, cyclones and droughts. Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth’s atmosphere."""

In [3]:
#Imports
import nltk
import re
import numpy as np

# TF(Term frequency) & IDF (Inverse Document Frequency)

TF - Term Frequency

IDF - Inverse Document Frequency

TF-IDF = TF * IDF

##### Term Frequency (TF) = Number of occurrences of a word in a document / Number of words in that document

##### Inverse Document (IDF) = log_e (Total Number of documents / Number of documents containing word) 

In [4]:
dataset = nltk.sent_tokenize(paragraph)

In [5]:
len(dataset)

21

In [6]:
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W'," ",dataset[i])
    dataset[i] = re.sub(r'\s+'," ",dataset[i])

In [7]:
#Word count 
word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys(): word2count[word] = 1
        else: word2count[word] += 1

In [8]:
#TF Matix
tf_matrix = {}
for word in word2count:
    doc_tf = list()
    for data in dataset: 
        frequency = 0
        for w in nltk.word_tokenize(data):
            if word == w: frequency += 1
                
        tf_word = round(frequency/len(nltk.word_tokenize(data)),3)
        doc_tf.append(tf_word)
    tf_matrix[word] = doc_tf

In [9]:
#IDF
word_idfs = {}
for word in word2count:
    doc_count = 0
    for data in dataset:
        if word in nltk.word_tokenize(data): doc_count +=1
    word_idfs[word] = round(np.log(len(dataset)/doc_count),3)

In [10]:
#TF-IDF

tfidf_matrix = []

for word in tf_matrix.keys():
    tfidf = []
    
    for value in tf_matrix[word]:
        score = value * word_idfs[word]
        tfidf.append(score)
        
    tfidf_matrix.append(tfidf)

In [11]:
np.array(tfidf_matrix)

array([[0.161155, 0.193   , 0.107115, ..., 0.      , 0.      , 0.193   ],
       [0.141449, 0.1694  , 0.188034, ..., 0.      , 0.      , 0.1694  ],
       [0.276886, 0.      , 0.184038, ..., 0.      , 0.      , 0.      ],
       ...,
       [0.      , 0.      , 0.      , ..., 0.380625, 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.293875, 0.336193, 0.      ],
       [0.      , 0.      , 0.      , ..., 0.293875, 0.336193, 0.      ]])

# N-Gram Modelling - Character Grams

In [12]:
text

'Global warming or climate change has become a worldwide concern. It is gradually developing into an unprecedented environmental crisis evident in melting glaciers, changing weather patterns, rising sea levels, floods, cyclones and droughts. Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth’s atmosphere.'

In [13]:
import random

In [30]:
n = 5

In [31]:
ngrams = dict()

In [32]:
# model creation
for i in range(len(text)-n):
    gram = text[i:i+n]
    if gram not in ngrams.keys():
        ngrams[gram] = list()
    ngrams[gram].append(text[i+n])

In [35]:
ngrams

{'Globa': ['l', 'l'],
 'lobal': [' ', ' '],
 'obal ': ['w', 'w'],
 'bal w': ['a', 'a'],
 'al wa': ['r', 'r'],
 'l war': ['m', 'm'],
 ' warm': ['i', 'i'],
 'warmi': ['n', 'n'],
 'armin': ['g', 'g'],
 'rming': [' ', ' '],
 'ming ': ['o', 'i'],
 'ing o': ['r'],
 'ng or': [' '],
 'g or ': ['c'],
 ' or c': ['l'],
 'or cl': ['i'],
 'r cli': ['m'],
 ' clim': ['a'],
 'clima': ['t'],
 'limat': ['e'],
 'imate': [' '],
 'mate ': ['c'],
 'ate c': ['h'],
 'te ch': ['a'],
 'e cha': ['n'],
 ' chan': ['g', 'g'],
 'chang': ['e', 'i'],
 'hange': [' '],
 'ange ': ['h'],
 'nge h': ['a'],
 'ge ha': ['s'],
 'e has': [' '],
 ' has ': ['b'],
 'has b': ['e'],
 'as be': ['c'],
 's bec': ['o'],
 ' beco': ['m'],
 'becom': ['e'],
 'ecome': [' '],
 'come ': ['a'],
 'ome a': [' '],
 'me a ': ['w'],
 'e a w': ['o'],
 ' a wo': ['r'],
 'a wor': ['l'],
 ' worl': ['d'],
 'world': ['w'],
 'orldw': ['i'],
 'rldwi': ['d'],
 'ldwid': ['e'],
 'dwide': [' '],
 'wide ': ['c'],
 'ide c': ['o'],
 'de co': ['n'],
 'e con': ['c'],


In [24]:
ngrams['Glo']

['b', 'b']

In [33]:
# Test model
currentgram = text[0:n]
result = currentgram
for i in range(100):
    if currentgram not in ngrams.keys():break
    possibilities = ngrams[currentgram]
    nextitem = possibilities[random.randrange(len(possibilities))]
    result += nextitem
    currentgram = result[len(result)-n : len(result)]

In [21]:
text

'Global warming or climate change has become a worldwide concern. It is gradually developing into an unprecedented environmental crisis evident in melting glaciers, changing weather patterns, rising sea levels, floods, cyclones and droughts. Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth’s atmosphere.'

In [23]:
# trigram result
result

'Global warming in melting sea levels, changing into environment of the a worldwidental crisis gradually'

In [29]:
# bigram result
result

'Gloods, crisising sea levelonmeltincreento an to aver cris beconcedevern. It of greent of grapmeltin t'

In [34]:
# 5gram result
result

'Global warming or climate changing weather patterns, rising sea levels, floods, cyclones an increase in t'

# N-Gram Modelling - Word Grams

In [84]:
n = 3

In [85]:
ngrams = dict()

In [86]:
# model creation
words = nltk.word_tokenize(text)
for i in range(len(words)-n):
    gram = " ".join(words[i:i+n])
    if gram not in ngrams.keys(): ngrams[gram] = list()
    ngrams[gram].append(words[i+n])

In [73]:
#ngrams

In [87]:
# Test model
currentgram = " ".join(words[0:n])
result = currentgram
for i in range(30):
    if currentgram not in ngrams.keys():break
    possibilities = ngrams[currentgram]
    nextitem = possibilities[random.randrange(len(possibilities))]
    result += " " + nextitem
    rwords = nltk.word_tokenize(result)
    currentgram = " ".join(rwords[len(rwords)-n : len(rwords)])

In [83]:
# 2-grams
result

'Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth ’ s atmosphere .'

In [88]:
# 3-grams
result

'Global warming or climate change has become a worldwide concern . It is gradually developing into an unprecedented environmental crisis evident in melting glaciers , changing weather patterns , rising sea levels ,'

In [78]:
# 5-grams
result

'Global warming or climate change has become a worldwide concern . It is gradually developing into an unprecedented environmental crisis evident in melting glaciers , changing weather patterns , rising sea levels , floods ,'

In [66]:
text

'Global warming or climate change has become a worldwide concern. It is gradually developing into an unprecedented environmental crisis evident in melting glaciers, changing weather patterns, rising sea levels, floods, cyclones and droughts. Global warming implies an increase in the average temperature of the Earth due to entrapment of greenhouse gases in the earth’s atmosphere.'

# Latent Semantic Analysis (LSA)

In [89]:
# Sample Data
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [91]:
dataset = [line.lower() for line in dataset]
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [92]:
# TF-IDF model
vectorizer = TfidfVectorizer()

In [93]:
x = vectorizer.fit_transform(dataset)

In [94]:
print(x[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [95]:
# create SVD
lsa = TruncatedSVD(n_components=4)
lsa.fit(x)

TruncatedSVD(n_components=4)

In [96]:
# v column
row1 = lsa.components_[1]

In [97]:
print(row1)

[-1.80743110e-15  7.39829241e-16 -9.71776018e-18  2.17306447e-01
 -1.86564100e-15  3.90541693e-17  2.17306447e-01  2.45183517e-15
  2.83591658e-01  9.68596258e-17 -1.85984416e-15  2.17306447e-01
 -1.99852854e-15  2.83591658e-01  2.19984735e-16  2.17306447e-01
  7.99745874e-16  4.84298129e-17 -1.99852854e-15 -1.63956252e-15
  2.13616736e-15  2.17306447e-01  2.83591658e-01 -1.99852854e-15
 -7.10537403e-16  2.17306447e-01  4.84298129e-17 -1.85984416e-15
  2.83591658e-01 -1.85984416e-15  2.83591658e-01  7.99745874e-16
  2.17306447e-01 -3.20007555e-15  2.11233236e-15  4.15788444e-01
 -1.85984416e-15  7.99745874e-16  2.17306447e-01  2.13616736e-15
  2.17306447e-01  7.99745874e-16]


In [101]:
# visulaize
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    componentsTerms = zip(terms,comp)
    sortedTerms = sorted(componentsTerms, key = lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
    print("\n Topic", i, ":")
    for term in sortedTerms: print(term)


 Topic 0 :
('the', 0.3760982952926374)
('concert', 0.34498873923306583)
('great', 0.30012402589487364)
('of', 0.29579806095266686)
('just', 0.2373658292979121)
('was', 0.2373658292979121)
('day', 0.22892159541504575)
('technology', 0.18383834567413393)
('all', 0.17824025175628952)
('in', 0.17824025175628952)

 Topic 1 :
('to', 0.41578844396700687)
('cook', 0.2835916579351071)
('gordon', 0.2835916579351071)
('love', 0.2835916579351071)
('ramsay', 0.2835916579351071)
('see', 0.2835916579351071)
('and', 0.21730644711292488)
('campaigns', 0.21730644711292477)
('global', 0.21730644711292477)
('have', 0.21730644711292477)

 Topic 2 :
('technology', 0.37791806767144004)
('is', 0.34196143806319856)
('google', 0.3413969441909747)
('introducing', 0.3413969441909747)
('new', 0.3413969441909747)
('day', 0.14112432680994705)
('examples', 0.1138789219537302)
('present', 0.1138789219537302)
('robots', 0.1138789219537302)
('today', 0.1138789219537302)

 Topic 3 :
('day', 0.4654267679041109)
('by', 0.

In [104]:
# visulaize
concept_words = {}
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    componentsTerms = zip(terms,comp)
    sortedTerms = sorted(componentsTerms, key = lambda x: x[1], reverse=True)
    sortedTerms = sortedTerms[:10]
#    print("\n Topic", i, ":")
#    for term in sortedTerms: print(term)
    concept_words["Concept "+ str(i)] = sortedTerms

In [106]:
#concept_words

In [107]:
# sentence topics or concepts
for key in concept_words.keys():
    sentence_scores = list()
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0] : score += word_with_score[1]
        sentence_scores.append(score)
        
    print(f"\n {key} :")
    for sentence_score in sentence_scores : print(sentence_score)


 Concept 0 :
1.1297395470753957
1.4959427190164012
0
0.18383834567413393
0.7797604325216745
1.373365598990949
0

 Concept 1 :
0
0
1.8337467336425424
0
0
0
1.285014232418706

 Concept 2 :
0.6242100916830926
0
0
1.7440703383075629
0.8334337554863608
0
0

 Concept 3 :
2.2015937554478873
0.12724213180694313
0
0.21264455202450064
0
0.2965820743887385
0
