In [None]:
#10A
''' 
Aim: Parse a sentence and draw a tree using malt parsing.
Maltparser is a tool for dependency parsing.
It generate dependency trees for languages, leveraging ML.

For this project 
a. Java should be installed.
b. maltparser-1.7.2 zip file should be copied in
C:\Users\AppData\Local\Programs\Python\Python39 folder and should be
extracted in the same folder.
c. engmalt.linear-1.7.mco & engmalt.poly-1.7.mco file should be copied to
C:\Users\ AppData\Local\Programs\Python\Python39 folder
'''
from nltk.parse import malt
mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco')
#file t = mp.parse_one('I saw a bird from my window.'.split()).tree()
print(t)
t.draw()

In [1]:
#10B
''' 
Aim: Multiword Expressions in NLP.
Multiword Expressions (MWEs) are combinations of words that together convey a single
meaning or represent a specific concept
Eg. Strong coffee
'''
import nltk
from nltk.tokenize import MWETokenizer
from nltk import sent_tokenize, word_tokenize
# Download 'punkt_tab' for sentence tokenization
nltk.download('punkt_tab')
s = '''Good cake cost Rs.1500\kg in Hong Kong. Please buy me one of them.\n\nThanks.'''
mwe = MWETokenizer([('New', 'York'), ('Hong', 'Kong')],
separator='-')
for sent in sent_tokenize(s):
    print(mwe.tokenize(word_tokenize(sent)))

  s = '''Good cake cost Rs.1500\kg in Hong Kong. Please buy me one of them.\n\nThanks.'''


['Good', 'cake', 'cost', 'Rs.1500\\kg', 'in', 'Hong-Kong', '.']
['Please', 'buy', 'me', 'one', 'of', 'them', '.']
['Thanks', '.']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
#10C
''' 
Aim: Normalized Web Distance and Word Similarity. 
Normalized web distance is used to calclate the similarity or disimilarity between
two words or concepts by utilizing the vast amount of information.
It is based on the principle that the closer the two words are in meaning, the more likely
it is that they will co-occur in similar context.

Normalized Web Distance or text similarity, especially using 
Jaro-Winkler distance and agglomerative clustering.
'''

import numpy as np
import re
import textdistance
import sklearn
from sklearn.cluster import AgglomerativeClustering
texts = ['Reliance supermarket', 'Reliance hypermarket','Reliance', 'Reliance','Mumbai Hyper', 'Mumbai dxb','mumbai airport','k.m trading', 'KM Trading', 'KM trade', 'K.M.Trading', 'KM.Trading']

def normalize(text):
    return re.sub('[^a-z0-9]+', '',text.lower())

def group_texts(texts, threshold=0.4):
    normalized_texts = np.array([normalize(text) for text in texts])
    distances = 1 - np.array([[textdistance.jaro_winkler(one, another) for one in normalized_texts] for another in normalized_texts])

    clustering = AgglomerativeClustering(distance_threshold=threshold,metric='precomputed',linkage="complete",n_clusters=None).fit(distances)

    centers = dict()

    for cluster_id in set(clustering.labels_):
        index = clustering.labels_ == cluster_id
        centrality = distances[:, index][index].sum(axis=1)
        centers[cluster_id] = normalized_texts[index][centrality.argmin()]
    return [centers[i] for i in clustering.labels_]

l1 = []
for i in group_texts(texts):
    l1.append(str(i))

print(l1)

['reliance', 'reliance', 'reliance', 'reliance', 'mumbaihyper', 'mumbaihyper', 'mumbaihyper', 'kmtrading', 'kmtrading', 'kmtrading', 'kmtrading', 'kmtrading']


In [5]:
#10D
''' 
Aim: Word Sense Disambiguation.
It involves determining which sense of a word is being used in a particular context.
'''

import nltk
from nltk.corpus import wordnet as wn
# Download the 'wordnet' dataset before using it.
nltk.download('wordnet')
def get_first_sense(word, pos=None):
    if pos:
        synsets = wn.synsets(word,pos)
    else:
        synsets = wn.synsets(word)
    return synsets[0]

best_synset = get_first_sense('bank')
print ('%s: %s' % (best_synset.name, best_synset.definition))
best_synset = get_first_sense('set','n')
print ('%s: %s' % (best_synset.name, best_synset.definition))
best_synset = get_first_sense('set','v')
print ('%s: %s' % (best_synset.name, best_synset.definition))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<bound method Synset.name of Synset('bank.n.01')>: <bound method Synset.definition of Synset('bank.n.01')>
<bound method Synset.name of Synset('set.n.01')>: <bound method Synset.definition of Synset('set.n.01')>
<bound method Synset.name of Synset('put.v.01')>: <bound method Synset.definition of Synset('put.v.01')>
