In [3]:
import nltk
from collections import Counter
import numpy as np
nltk.download("brown")
nltk.download("stopwords")
from string import punctuation
from nltk.corpus import brown
from nltk import bigrams
from nltk.probability import FreqDist
from nltk.corpus import stopwords


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Getting High frequency words
freq_of_words = Counter({})
freq_bi = Counter({})
stop_words = set(stopwords.words('english')+list(punctuation)+["``","''","--"])
biagrams = []
for c in brown.categories():
  words_wo_numbers = [w for w in brown.words(categories=c) if not w.isnumeric()]
  words_wo_stop_words = [w for w in words_wo_numbers if not w.lower() in stop_words]
  freq_of_words += Counter(words_wo_stop_words)
  freq_bi += Counter(FreqDist(bigrams(words_wo_stop_words)))

freq_of_words = dict(freq_of_words)

highest_freq_words = sorted(freq_of_words,key=freq_of_words.get,reverse=True)[:5000]
print("Most Common Unigrams : \n1.",highest_freq_words[0], "\n2.", highest_freq_words[1],"\n3.", highest_freq_words[2],"\n4.", highest_freq_words[3],"\n5.", highest_freq_words[4])
print("Least Common Unigrams : \n1.",highest_freq_words[-1], "\n2.", highest_freq_words[-2],"\n3.", highest_freq_words[-3],"\n4.", highest_freq_words[-4],"\n5.", highest_freq_words[-5])

Most Common Unigrams : 
1. one 
2. would 
3. said 
4. could 
5. time
Least Common Unigrams : 
1. Salem 
2. improvements 
3. strategic 
4. dated 
5. Paula


In [5]:
# New n words to be added to list of highest_freq_words
n_words = ['asylum','autograph','automobile','boy','bird','brother','cord','cushion','coast','cementary','crane','car','cock','fruit','furnace','forest','food','grin','graveyard','glass','gem','hill','implement','jewel','journey','lad','monk','midday','madhouse','mound','magician','noon','oracle','pillow','rooster','smile','string','sage','shore','serf','slave','signature','stove','tumbler','tool','voyage','wizard','woodland']
words_in_brown_corpus = [ w for w in n_words if w in freq_of_words ]

In [6]:
words=highest_freq_words+[w for w in words_in_brown_corpus if w not in highest_freq_words]

In [7]:
len(words)

5028

In [8]:
M1= np.zeros((len(words),len(words)))

In [9]:
# Constructing biagrams and calculating frequency
for i in range(len(words)):
  for j in range(len(words)):
    bigram=(words[i],words[j])
    M1[i][j] = freq_bi[(bigram)]

In [10]:
#PPMI
M1_plus = np.zeros_like(M1)
total_bigrams = np.sum(M1)
total_unigram_freq =0
for i,w1 in enumerate(words):
  total_unigram_freq+= freq_of_words[w1]

for i,w1 in enumerate(words):
  for j,w2 in enumerate(words):
    prob_wc = M1[i][j]/total_bigrams
    prob_w = freq_of_words[w1]/total_unigram_freq
    prob_c = freq_of_words[w2]/total_unigram_freq
    M1_plus[i][j] = max(np.log(prob_wc/(prob_c * prob_w)),0)

  M1_plus[i][j] = max(np.log(prob_wc/(prob_c * prob_w)),0)


In [11]:
#PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

#Scaled data
M1_plus_scaled = preprocessing.scale(M1_plus.T)

pca10 = PCA(n_components=10, svd_solver='full')
pca10.fit(M1_plus_scaled)
M2_10 = pca10.transform(M1_plus_scaled)

In [12]:
pca100 = PCA(n_components=100, svd_solver='full')
pca100.fit(M1_plus_scaled)
M2_100 = pca100.transform(M1_plus_scaled)
#pca100_data[:5,:]

In [13]:
pca300 = PCA(n_components=300, svd_solver='full')
pca300.fit(M1_plus_scaled)
M2_300 = pca300.transform(M1_plus_scaled)

In [14]:
word_pair1=['cord','rooster','noon','fruit','autograph','automobile','mound','grin','graveyard','glass','boy','cushion','monk','coast','grin','shore','monk','boy','automobile','mound','lad','forest','food','shore','bird','coast','furnace','crane','hill','car','glass','magician','crane','brother','sage','oracle','bird','bird','food','brother','furnace','magician','hill','cord','glass','grin','journey','autograph','coast','forest','implement','cock','boy','cushion','automobile','midday','gem']
word_pair2=['smile','voyage','string','furnace','shore','wizard','stove','implement','madhouse','magician','rooster','jewel','slave','forest','lad','woodland','oracle','sage','cushion','shore','wizard','graveyard','rooster','voyage','woodland','hill','implement','rooster','woodland','journey','jewel','oracle','implement','lad','wizard','sage','crane','cock','fruit','monk','stove','wizard','mound','string','tumbler','smile','voyage','signature','shore','woodland','tool','rooster','lad','pillow','car','noon','jewel']
S_Manual=[0.02,0.04,0.04,0.05,0.06,0.11,0.14,0.18,0.42,0.44,0.44,0.45,0.57,0.85,0.88,0.9,0.91,0.96,0.97,0.97,0.99,1,1.09,1.22,1.24,1.26,1.37,1.41,1.48,1.55,1.78,1.82,2.37,2.41,2.46,2.61,2.63,2.63,2.69,2.74,3.11,3.21,3.29,3.41,3.45,3.46,3.58,3.59,3.6,3.65,3.66,3.68,3.82,3.84,3.92,3.94,3.94]

In [15]:
#Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

S_M1=[]
S_M1_plus=[]
S_M2_10=[]
S_M2_100=[]
S_M2_300=[]
for i in range(len(word_pair1)):
  w1 = word_pair1[i]
  w2 = word_pair2[i]
  index_w1 = words.index(w1)
  index_w2 = words.index(w2)

  S_M1.append(cosine_similarity(M1[index_w1:index_w1+1,:],M1[index_w2:index_w2+1,:])[0][0])
  S_M1_plus.append(cosine_similarity(M1_plus[index_w1:index_w1+1,:],M1_plus[index_w2:index_w2+1,:])[0][0])
  S_M2_10.append(cosine_similarity(M2_10[index_w1:index_w1+1,:],M2_10[index_w2:index_w2+1,:])[0][0])
  S_M2_100.append(cosine_similarity(M2_100[index_w1:index_w1+1,:],M2_100[index_w2:index_w2+1,:])[0][0])
  S_M2_300.append(cosine_similarity(M2_300[index_w1:index_w1+1,:],M2_300[index_w2:index_w2+1,:])[0][0])


In [16]:
#Pearson Correlation
from scipy import stats

P_M1 = stats.pearsonr(S_Manual, S_M1)
P_M1_plus = stats.pearsonr(S_Manual, S_M1_plus)
P_M2_10 = stats.pearsonr(S_Manual, S_M2_10)
P_M2_100 = stats.pearsonr(S_Manual, S_M2_100)
P_M2_300 = stats.pearsonr(S_Manual, S_M2_300)

In [17]:
print('The Pearson Correlations are as follows:')
print('M1: ',P_M1)
print('M1+: ',P_M1_plus)
print('M2_10: ',P_M2_10)
print('M2_100: ',P_M2_100)
print('M2_300: ',P_M2_300)

The Pearson Correlations are as follows:
M1:  PearsonRResult(statistic=0.21468858527181334, pvalue=0.1087767892623234)
M1+:  PearsonRResult(statistic=0.2491029397810784, pvalue=0.06167924069984078)
M2_10:  PearsonRResult(statistic=0.07014560147174698, pvalue=0.6041134458446543)
M2_100:  PearsonRResult(statistic=0.2197109300074495, pvalue=0.10055564792606486)
M2_300:  PearsonRResult(statistic=0.2549629828225948, pvalue=0.055614145541017346)
