In [None]:
from nltk.corpus import brown, stopwords
import string
from collections import Counter, defaultdict
import nltk
import math
import numpy as np

In [None]:
# Fetch the NLTK resources this notebook reads: the English stop-word list
# and the Brown corpus.
for resource in ('stopwords', 'brown'):
    nltk.download(resource)

def process_words(word_list, stop_words=None):
    """Normalise tokens and count how often each surviving word occurs.

    Each token is lower-cased and has all ``string.punctuation`` characters
    removed. A token is dropped when it is a stop word (checked both before
    and after punctuation removal) or when nothing is left after stripping.

    The check *before* stripping matters: a contraction such as "don't" is
    in the stop list with its apostrophe, but would become "dont" once the
    punctuation is removed and then slip through a post-strip-only check.

    Args:
        word_list: Iterable of string tokens.
        stop_words: Optional iterable of lower-case stop words. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        collections.Counter mapping each normalised word to its frequency.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = set(stop_words)

    # Build the translation table once instead of once per token.
    punctuation_table = str.maketrans('', '', string.punctuation)

    word_count = Counter()
    for word in word_list:
        lowered = word.lower()
        if lowered in stop_set:
            continue

        cleaned = lowered.translate(punctuation_table)
        # Skip pure-punctuation tokens and stop words that only match once
        # surrounding punctuation has been removed.
        if cleaned == '' or cleaned in stop_set:
            continue

        word_count[cleaned] += 1

    return word_count


# Rank every normalised Brown token by frequency once, then slice the ranking:
# the top 5000 entries form the target vocabulary V and the top 1000 form the
# context vocabulary C (so C is a prefix of V).
word_count = process_words(brown.words())
ranked = word_count.most_common()
VandCount = ranked[:5000]
CandCount = ranked[:1000]
V = [word for word, _ in VandCount]
C = [word for word, _ in CandCount]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Show the (word, count) pairs for the 1000 most frequent words, i.e. the
# entries the context vocabulary C was built from.
print(CandCount)

[('one', 3297), ('would', 2714), ('said', 1961), ('new', 1635), ('could', 1601), ('time', 1598), ('two', 1412), ('may', 1402), ('first', 1361), ('like', 1292), ('man', 1207), ('even', 1170), ('made', 1125), ('also', 1069), ('many', 1030), ('must', 1013), ('years', 1001), ('af', 996), ('back', 966), ('well', 961), ('much', 937), ('way', 909), ('people', 847), ('mr', 844), ('us', 838), ('little', 831), ('state', 807), ('good', 806), ('make', 794), ('world', 787), ('still', 782), ('see', 772), ('men', 763), ('work', 762), ('long', 753), ('get', 749), ('life', 715), ('never', 697), ('day', 687), ('another', 684), ('know', 683), ('last', 676), ('might', 672), ('great', 665), ('old', 661), ('year', 658), ('states', 650), ('come', 630), ('since', 628), ('go', 626), ('came', 622), ('right', 613), ('used', 611), ('take', 610), ('three', 610), ('house', 591), ('use', 591), ('without', 583), ('place', 570), ('american', 569), ('around', 562), ('however', 552), ('home', 547), ('1', 546), ('small',

In [None]:
# Count (target word, context word) co-occurrences over the Brown sentences.
#
# For every token position i whose word is in V, each other position j with
# |i - j| <= WINDOW whose word is in C contributes one co-occurrence.
#   countWandC[w][c] : number of times c appears in the window around w
#   countW[w]        : total context hits for target w
#   countC[c]        : total hits of c as a context word
#   m                : total number of counted (w, c) pairs
#
# Fixes relative to the earlier version of this cell:
#   * The window is now symmetric. The old upper bound
#     `min(i+2, len(sent)-1)` was exclusive, so only one token to the right
#     was seen and the last token of a sentence was never a context.
#   * Tokens are lower-cased and stripped of punctuation before lookup, the
#     same normalisation used to build V and C; raw tokens such as
#     "American" could never match the lower-case vocabulary.
#   * V and C are looked up through sets instead of scanning lists.
WINDOW = 2
vocab_lookup = set(V)
context_lookup = set(C)
punctuation_table = str.maketrans('', '', string.punctuation)

m = 0
countWandC = defaultdict(lambda: defaultdict(int))
countW = defaultdict(int)
countC = defaultdict(int)
for sent in brown.sents():
    # Positions are kept as in the raw sentence; only the token text is normalised.
    tokens = [token.lower().translate(punctuation_table) for token in sent]
    for i, target in enumerate(tokens):
        if target not in vocab_lookup:
            continue
        first = max(i - WINDOW, 0)
        last = min(i + WINDOW, len(tokens) - 1)  # inclusive
        for j in range(first, last + 1):
            context = tokens[j]
            if j != i and context in context_lookup:
                countWandC[target][context] += 1
                countW[target] += 1
                countC[context] += 1
                m += 1

In [None]:
# Build one positive-PMI vector per target word in V, with one component per
# context word in C:
#     PPMI(w, c) = max(0, log( (count(w, c) / count(w)) / (count(c) / m) ))
# Components default to 0 whenever a count involved is zero.
embedding = {}

for target in V:
    ppmi_row = []
    for context in C:
        # No statistics for this target or this context: the component is 0.
        if countW[target] == 0 or countC[context] == 0:
            ppmi_row.append(0)
            continue

        ratio = (countWandC[target][context] / countW[target]) / (countC[context] / m)
        # log(0) is undefined, so a pair that never co-occurs also maps to 0.
        ppmi_row.append(max(0, math.log(ratio)) if ratio != 0 else 0)

    embedding[target] = ppmi_row

In [None]:
from sklearn.decomposition import PCA

# Stack the PPMI vectors (one row per word, in `embedding` insertion order)
# and project them onto 100 principal components.
embMat = np.array(list(embedding.values()))

# A fixed random_state keeps the projection identical across runs in case
# scikit-learn selects a randomized SVD solver for this matrix.
pcaMat = PCA(n_components=100, random_state=0).fit_transform(embMat)

# Row i of pcaMat belongs to the i-th key of `embedding`.
pcaEmbeddings = dict(zip(embedding, pcaMat))

In [None]:
def cosDist(v1, v2):
    """Return the cosine distance ``1 - cos(v1, v2)`` between two vectors.

    The result is 0 for vectors pointing the same way, 1 for orthogonal
    vectors and 2 for opposite vectors.

    Cosine similarity is undefined when either vector has zero norm. Instead
    of dividing by zero (which yields NaN and a runtime warning), such a pair
    is treated as having no similarity and the distance 1.0 is returned.

    Args:
        v1: 1-D array-like of numbers.
        v2: 1-D array-like of numbers with the same length as ``v1``.

    Returns:
        The cosine distance as a float.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 1.0
    return 1 - np.dot(v1, v2) / norm_product

# For each of the 100 most frequent words, find the other vocabulary word
# whose PCA embedding has the smallest cosine distance to it.
nearestNeighbor = {}

for w in V[:100]:
    wVec = pcaEmbeddings[w]  # looked up once per query word
    wprime = ""
    dist = np.inf

    for wp in V:
        if wp == w:
            continue
        # Evaluate the distance once per candidate and reuse it.
        candidateDist = cosDist(wVec, pcaEmbeddings[wp])
        if candidateDist < dist:
            dist = candidateDist
            wprime = wp

    nearestNeighbor[w] = wprime

print("Word \t Nearest Neighbour")
for word, neighbour in nearestNeighbor.items():
    print(f"{word} \t {neighbour}")

Word 	 Nearest Neighbour
one 	 every
would 	 could
said 	 told
new 	 modern
could 	 would
time 	 period
two 	 three
may 	 would
first 	 second
like 	 around
man 	 boy
even 	 much
made 	 become
also 	 always
many 	 two
must 	 might
years 	 days
af 	 american
back 	 around
well 	 also
much 	 even
way 	 us
people 	 men
mr 	 af
us 	 would
little 	 much
state 	 local
good 	 real
make 	 take
world 	 war
still 	 always
see 	 tell
men 	 people
work 	 still
long 	 next
get 	 go
life 	 experience
never 	 ever
day 	 week
another 	 one
know 	 remember
last 	 next
might 	 must
great 	 major
old 	 young
year 	 week
states 	 parts
come 	 came
since 	 well
go 	 get
came 	 went
right 	 left
used 	 found
take 	 see
three 	 two
house 	 room
use 	 find
without 	 even
place 	 step
american 	 af
around 	 back
however 	 became
home 	 back
1 	 3
small 	 little
found 	 come
mrs 	 af
thought 	 seemed
went 	 came
say 	 tell
part 	 role
general 	 social
high 	 higher
upon 	 along
school 	 college
every 	 one
dont

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# One row per word: the PCA embeddings, in `pcaEmbeddings` insertion order
data = np.array([pcaEmbeddings[w] for w in pcaEmbeddings])

# Create a KMeans object with the desired number of clusters.
# random_state fixes the centroid initialisation so the clustering is
# reproducible; n_init is set explicitly so the number of restarts does not
# depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=100, n_init=10, random_state=0)

# Fit the KMeans model to the data
kmeans.fit(data)

# Retrieve the cluster labels (one per row of `data`) and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Print the cluster labels
print("Cluster Labels:", labels)



Cluster Labels: [39 88 20 ... 54  9 98]


In [None]:
# Words in the same order as the rows that were built from pcaEmbeddings.
wordList = list(pcaEmbeddings)

In [None]:
# Group the words by the cluster id assigned to them, then list every cluster.
clusters = defaultdict(list)

for position, label in enumerate(labels):
    clusters[label].append(wordList[position])

for cluster_id in range(100):
    print("Cluster ", cluster_id, " :- ", clusters[cluster_id])

Cluster  0  :-  ['end']
Cluster  1  :-  ['considered', 'paid', 'carried', 'remained', 'raised', 'remembered', 'sight', 'index', 'picked', 'believed', 'failed', 'universe', 'rain', 'ordered', 'realized', 'directed', 'starting', 'flow', 'passing', 'dream', 'broken', 'keeping', 'liked', 'enjoyed', 'joined', 'regarded', 'wished', 'phone', 'faced', 'bitter', 'assigned', 'troops', 'breakfast', 'laughed', 'operator', 'grand', 'integration', 'denied', 'congregation', 'insurance', 'identified', 'offers', 'ice', 'liquor', 'brings', 'defined', 'handed', 'glad', 'innocent', 'wise', 'managed', 'nationalism', 'performed', 'jumped', 'disappeared', 'sending', 'tells', 'succeeded', 'rejected', 'attempted', 'namely', 'allowing', 'impressed', 'dawn', 'eager', 'scope', 'conceived', 'pointing', 'mount', 'stockholders', 'emerged', 'abandoned', 'accused', 'grateful', 'criminal', 'nearest', 'access', 'stupid', 'worn', 'exercises', 'expects', 'rhythm', '0', 'keeps', 'suspected', 'inclined', 'lowered', 'respect