<a href="https://colab.research.google.com/github/akshithagopagani/NLP/blob/main/Assignment-1_19k508.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# The two sample documents that every later cell compares.
documentA = (
    'Text Simplification is the task of reducing the complexity of the '
    'vocabulary and sentence structure of text while retaining its original '
    'meaning, with the goal of improving readability and understanding.'
)
documentB = (
    'Sentiment Analysis is the process of determining whether a piece of '
    'writing is positive, negative or neutral. A sentiment analysis system '
    'for text analysis combines natural language processing (NLP) and machine '
    'learning techniques to assign weighted sentiment scores to the entities, '
    'topics, themes and categories within a sentence or phrase.'
)

In [20]:
# Split on single spaces only: punctuation stays attached to its word
# (e.g. 'meaning,') and letter case is preserved.
bagOfWordsA, bagOfWordsB = (doc.split(' ') for doc in (documentA, documentB))

In [21]:
# Shared vocabulary: every distinct token that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [22]:
# Per-document occurrence counts over the shared vocabulary; words that a
# document does not contain keep a count of 0.
numOfWordsA = {word: 0 for word in uniqueWords}
numOfWordsB = {word: 0 for word in uniqueWords}
for tally, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for word in bag:
        tally[word] += 1

In [23]:
import nltk
# Fetch NLTK's 'stopwords' corpus (the log below reports it as already up-to-date).
# NOTE(review): no cell visible here reads the stopword list; the lookup below is
# commented out - confirm whether stop-word filtering was intended.
nltk.download('stopwords')
#stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Term Frequency (TF) 
Term frequency is the number of times a word appears in a document, divided by the total number of words in that document. Every document has its own term frequencies.

In [24]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> number of occurrences in the document.
        bagOfWords: the document's token list; only its length is used, as the
            denominator.

    Returns:
        dict mapping each word of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty ``bagOfWords`` every frequency is 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document has no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / totalWords for word, count in wordDict.items()}

In [25]:
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
tfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)
print(tfA, tfB, sep='\n')

{'entities,': 0.0, 'themes': 0.0, 'within': 0.0, 'to': 0.0, 'language': 0.0, 'weighted': 0.0, 'Analysis': 0.0, 'scores': 0.0, 'writing': 0.0, 'topics,': 0.0, 'Sentiment': 0.0, 'determining': 0.0, 'phrase.': 0.0, 'text': 0.03333333333333333, 'original': 0.03333333333333333, 'of': 0.13333333333333333, 'sentence': 0.03333333333333333, 'readability': 0.03333333333333333, 'a': 0.0, 'goal': 0.03333333333333333, 'neutral.': 0.0, 'assign': 0.0, 'complexity': 0.03333333333333333, 'positive,': 0.0, 'process': 0.0, 'or': 0.0, 'processing': 0.0, 'whether': 0.0, '(NLP)': 0.0, 'combines': 0.0, 'while': 0.03333333333333333, 'categories': 0.0, 'natural': 0.0, 'meaning,': 0.03333333333333333, 'machine': 0.0, 'system': 0.0, 'and': 0.06666666666666667, 'reducing': 0.03333333333333333, 'vocabulary': 0.03333333333333333, 'improving': 0.03333333333333333, 'techniques': 0.0, 'learning': 0.0, 'Simplification': 0.03333333333333333, 'negative': 0.0, 'its': 0.03333333333333333, 'analysis': 0.0, 'understanding.':

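# **Inverse Document Frequency (IDF)**
The log of the number of documents divided by the number of documents that contain the word $w$: $\mathrm{idf}(w) = \log\left(N / \mathrm{df}(w)\right)$. Inverse document frequency determines the weight of rare words across all documents in the corpus: the fewer documents a word appears in, the higher its weight.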

In [26]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in the corpus.

    Args:
        documents: list of dicts, one per document, mapping word -> count.
            A word counts as present in a document when its count is > 0.

    Returns:
        dict mapping word -> ``log(N / df)`` where ``N`` is ``len(documents)``
        and ``df`` is the number of documents containing the word. Words found
        in no document get 0.0, and an empty ``documents`` list yields ``{}``.
    """
    import math

    N = len(documents)

    # Collect keys from every document, not only the first, so that documents
    # with differing vocabularies do not raise KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    # log(N / 0) is undefined; a word absent everywhere has TF 0 in every
    # document, so an IDF of 0.0 leaves its TF-IDF at 0 instead of crashing.
    return {
        word: math.log(N / df) if df > 0 else 0.0
        for word, df in docFreq.items()
    }

In [27]:
# IDF over the two-document corpus, computed from the per-document word counts.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
print(idfs)

{'entities,': 0.6931471805599453, 'themes': 0.6931471805599453, 'within': 0.6931471805599453, 'to': 0.6931471805599453, 'language': 0.6931471805599453, 'weighted': 0.6931471805599453, 'Analysis': 0.6931471805599453, 'scores': 0.6931471805599453, 'writing': 0.6931471805599453, 'topics,': 0.6931471805599453, 'Sentiment': 0.6931471805599453, 'determining': 0.6931471805599453, 'phrase.': 0.6931471805599453, 'text': 0.0, 'original': 0.6931471805599453, 'of': 0.0, 'sentence': 0.0, 'readability': 0.6931471805599453, 'a': 0.6931471805599453, 'goal': 0.6931471805599453, 'neutral.': 0.6931471805599453, 'assign': 0.6931471805599453, 'complexity': 0.6931471805599453, 'positive,': 0.6931471805599453, 'process': 0.6931471805599453, 'or': 0.6931471805599453, 'processing': 0.6931471805599453, 'whether': 0.6931471805599453, '(NLP)': 0.6931471805599453, 'combines': 0.6931471805599453, 'while': 0.6931471805599453, 'categories': 0.6931471805599453, 'natural': 0.6931471805599453, 'meaning,': 0.693147180559

# **Lastly, the TF-IDF is simply the TF multiplied by IDF.**

In [28]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the word's IDF.

    Args:
        tfBagOfWords: dict mapping word -> term frequency.
        idfs: dict mapping word -> inverse document frequency; must hold an
            entry for every word in ``tfBagOfWords``.

    Returns:
        dict mapping word -> ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [29]:
# Hand-rolled TF-IDF for both documents, shown as one row per document.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

   entities,    themes    within        to  language  weighted  Analysis  \
0   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1   0.013863  0.013863  0.013863  0.027726  0.013863  0.013863  0.013863   

     scores   writing   topics,  ...   is      task       for  the  sentiment  \
0  0.000000  0.000000  0.000000  ...  0.0  0.023105  0.000000  0.0   0.000000   
1  0.013863  0.013863  0.013863  ...  0.0  0.000000  0.013863  0.0   0.027726   

          A      with      Text  structure  retaining  
0  0.000000  0.023105  0.023105   0.023105   0.023105  
1  0.013863  0.000000  0.000000   0.000000   0.000000  

[2 rows x 58 columns]


In [30]:
# Same two documents through scikit-learn's TfidfVectorizer, for comparison
# with the hand-rolled result above.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(df)

   analysis       and    assign  categories  combines  complexity  \
0  0.000000  0.233118  0.000000    0.000000  0.000000    0.163819   
1  0.380656  0.180560  0.126885    0.126885  0.126885    0.000000   

   determining  entities       for      goal  ...       to    topics  \
0     0.000000  0.000000  0.000000  0.163819  ...  0.00000  0.000000   
1     0.126885  0.126885  0.126885  0.000000  ...  0.25377  0.126885   

   understanding  vocabulary  weighted   whether     while      with  \
0       0.163819    0.163819  0.000000  0.000000  0.163819  0.163819   
1       0.000000    0.000000  0.126885  0.126885  0.000000  0.000000   

     within   writing  
0  0.000000  0.000000  
1  0.126885  0.126885  

[2 rows x 53 columns]




In [31]:
# Identifier of the sentence-embedding model handed to SentenceTransformer.
_model = "sentence-transformers/bert-base-nli-mean-tokens"
# Module-level model object; isSimilar below calls model.encode on it.
model = SentenceTransformer(_model)

In [32]:
def isSimilar(a, b):
    """Return the cosine similarity between the embeddings of two texts.

    Despite the name, no threshold is applied: the raw score is returned and
    the caller decides what counts as "similar".

    Args:
        a: first text.
        b: second text.

    Returns:
        One-element array holding the cosine similarity of ``a`` and ``b``.
    """
    embeddings = model.encode([a, b])
    # cosine_similarity yields one row per left-hand vector; row 0 holds the
    # similarity of a to b.
    return cosine_similarity([embeddings[0]], embeddings[1:])[0]

print(isSimilar(documentA, documentB))

[0.6463746]
