In [0]:
import pandas as pd
import numpy as np
import os
from pandas import DataFrame, read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB

In [0]:
# Extracting folder

import requests

filename = 'aclImdb_v1.tar.gz' 
url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename 
r = requests.get(url)
with open(filename, 'wb') as f: f.write(r.content)

#...extract zip file
import tarfile

tar = tarfile.open(filename, mode='r')
tar.extractall()
tar.close()

### Problem 1: Creating function for Precision, Recall and F1score

In [0]:
def get_precision(y_pred, y_test, debug = False):

    # deal with npdarray

    y_pred = list(y_pred)

    y_test = list(y_test)


    y_pred = list(map(int,[1 == l for l in y_pred]))# deal with None type

    y_test = list(map(int,[1 == l for l in y_test]))# deal with None type
    
    n = len(y_pred);

    true_positive = sum(y_pred[i]* y_test[i] for i in range(n))

    if (0 == sum(y_pred)): return 0

    return true_positive*1.0/sum(y_pred)

In [0]:
def get_recall(y_pred, y_test):

    # deal with npdarray

    y_pred = list(y_pred)

    y_test = list(y_test)

    n = len(y_pred);

    y_pred = list(map(int,[1 == l for l in y_pred]))# deal with None type

    y_test = list(map(int,[1 == l for l in y_test]))# deal with None type

    true_positive = sum(y_pred[i]*y_test[i] for i in range(n))

    if 0 == sum(y_test): return 0

    return true_positive*1.0/sum(y_test)

In [0]:
def get_fscore(y_pred, y_test):


    precision=get_precision(y_pred,y_test)

    recall=get_recall(y_pred,y_test)

    if precision==0 and recall==0:

        return 0

    fscore=2.0*precision*recall/(precision+recall)

    return fscore

In [0]:
# Extracting Data in train test files

imdb_dir = 'aclImdb'
train_dir = os.path.join(imdb_dir,'train')
test_dir = os.path.join(imdb_dir,'test')
labels = []
texts = []

test_labels = []
test_texts = []

In [0]:
# Tagging data as positive and negative in train data

for label_type in ['pos','neg']:
    dir_name = os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name,fname),encoding="utf8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [0]:
# Tagging data as positive and negative in test data

for label_type in ['pos','neg']:
    dir_name = os.path.join(test_dir,label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name,fname),encoding="utf8")
            test_texts.append(f.read())
            f.close()
            if label_type == 'neg':
                test_labels.append(0)
            else:
                test_labels.append(1)

In [0]:
print(f'Length of texts is {len(texts)}')
print(f'Length of labels id {len(labels)}')
print(f'Length of test_texts is {len(test_texts)}')
print(f'Length of test_labels is {len(test_labels )}')

Length of texts is 25000
Length of labels id 25000
Length of test_texts is 25000
Length of test_labels is 25000


In [0]:
texts_df = pd.DataFrame({'texts': texts,
                        'labels':labels})

In [0]:
texts_df.head()

Unnamed: 0,texts,labels
0,(spoilers)<br /><br />I was blown away by this...,1
1,Joe Don Baker is one of a handful of actors wh...,1
2,I'd never seen an independent movie and I was ...,1
3,"Ok, at the beginning it looked like ""Shrek"" - ...",1
4,I fell in love with this silent action drama. ...,1


In [0]:
positive = texts_df[texts_df['labels']==1]['texts']
negative = texts_df[texts_df['labels']==0]['texts']

### Problem 2: Majority Class Baseline Model

In [0]:
X_train, X_test, y_train, y_test = train_test_split(texts_df.texts,texts_df.labels, test_size=0.3, random_state=0)
print(X_test.shape,y_test.shape,X_train.shape,y_train.shape)

(7500,) (7500,) (17500,) (17500,)


In [0]:
from sklearn.dummy import DummyClassifier       #this classifier selects the most frequent class in train data and fit on test data

dummy_majority = DummyClassifier(strategy='most_frequent',random_state=0)
dummy_majority.fit(X_train,y_train)

y_pred = dummy_majority.predict(X_test)

In [0]:
#using function
print("Precison:",get_precision(y_pred,y_test))
print("Recall:",get_recall(y_pred,y_test))
print("FScore:",get_fscore(y_pred,y_test))

Precison: 0
Recall: 0.0
FScore: 0


In [0]:
#Using sklearn
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred , average="macro"))

Precision: 0.25
Recall:  0.50
F1-score:  0.33


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [0]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.49773333333333336

### Problem 3: Review Length Baseline

In [0]:
texts_df['text_length'] = texts_df['texts'].str.split().str.len()  # creating new variable for text length

In [0]:
texts_df.head()

Unnamed: 0,texts,labels,text_length
0,(spoilers)<br /><br />I was blown away by this...,1,140
1,Joe Don Baker is one of a handful of actors wh...,1,254
2,I'd never seen an independent movie and I was ...,1,87
3,"Ok, at the beginning it looked like ""Shrek"" - ...",1,193
4,I fell in love with this silent action drama. ...,1,113


In [0]:
texts_df['text_length'].max()  # maximum length of review in data is 2470

2470

In [0]:
texts_df['text_length'].min()   #min length of review in data is 10

10

1. Setting threshold of length 100 words, 1 if length greater than 100 else 0 (Selected 100 just to check how model is performing)

In [0]:
texts_df['New_Labels'] = texts_df['text_length'].apply(lambda x: '1' if x>100 else '0')

In [0]:
texts_df.head()

Unnamed: 0,texts,labels,text_length,New_Labels
0,(spoilers)<br /><br />I was blown away by this...,1,140,1
1,Joe Don Baker is one of a handful of actors wh...,1,254,1
2,I'd never seen an independent movie and I was ...,1,87,0
3,"Ok, at the beginning it looked like ""Shrek"" - ...",1,193,1
4,I fell in love with this silent action drama. ...,1,113,1


In [0]:
## Splitting the data
X_train, X_test, y_train, y_test = train_test_split(texts_df.texts,texts_df.New_Labels, test_size=0.3, random_state=0)
print(X_test.shape,y_test.shape)

(7500,) (7500,)


In [0]:
Length_baseline1 = DummyClassifier(strategy='stratified',random_state=0)
Length_baseline1.fit(X_train,y_train)

y_pred = Length_baseline1.predict(X_test)

In [0]:
# Evaluation with pre-defined function
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred , average="macro"))

Precision: 0.50
Recall:  0.50
F1-score:  0.50


In [0]:
print("Precison:",get_precision(y_pred,y_test))
print("Recall:",get_recall(y_pred,y_test))
print("FScore:",get_fscore(y_pred,y_test))

Precison: 0
Recall: 0
FScore: 0


2. Setting threshold of length 500 words, 1 if length greater than 500 else 0 (Choosing 500 because disappointed people usually write a big review)

In [0]:
texts_df['New_Labels'] = texts_df['text_length'].apply(lambda x: '1' if x>500 else '0')

In [0]:
texts_df.head()

Unnamed: 0,texts,labels,text_length,New_Labels
0,(spoilers)<br /><br />I was blown away by this...,1,140,0
1,Joe Don Baker is one of a handful of actors wh...,1,254,0
2,I'd never seen an independent movie and I was ...,1,87,0
3,"Ok, at the beginning it looked like ""Shrek"" - ...",1,193,0
4,I fell in love with this silent action drama. ...,1,113,0


In [0]:
## Splitting the data
X_train, X_test, y_train, y_test = train_test_split(texts_df.texts,texts_df.New_Labels, test_size=0.3, random_state=0)
print(X_test.shape,y_test.shape)

(7500,) (7500,)


In [0]:
Length_baseline2 = DummyClassifier(strategy='uniform',random_state=0)
Length_baseline2.fit(X_train,y_train)

y_pred = Length_baseline2.predict(X_test)

In [0]:
Evaluation with pre-defined function
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred , average="macro"))

Precision: 0.50
Recall:  0.49
F1-score:  0.39


In [0]:
print("Precison:",get_precision(y_pred,y_test))
print("Recall:",get_recall(y_pred,y_test))
print("FScore:",get_fscore(y_pred,y_test))

Precison: 0
Recall: 0
FScore: 0


3. Setting threshold of length 15 words, 1 if length greater than 15 else 0 (Choosing 15 words because people sometimes write a review short)

In [0]:
texts_df['New_Labels'] = texts_df['text_length'].apply(lambda x: '1' if x>15 else '0')

In [0]:
## Splitting the data
X_train, X_test, y_train, y_test = train_test_split(texts_df.texts,texts_df.New_Labels, test_size=0.3, random_state=0)
print(X_test.shape,y_test.shape)

(7500,) (7500,)


In [0]:
Length_baseline3 = DummyClassifier(strategy='uniform',random_state=0)
Length_baseline3.fit(X_train,y_train)

y_pred = Length_baseline3.predict(X_test)

In [0]:
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred , average="macro"))

Precision: 0.50
Recall:  0.49
F1-score:  0.39


In [0]:
print("Precison:",get_precision(y_pred,y_test))
print("Recall:",get_recall(y_pred,y_test))
print("FScore:",get_fscore(y_pred,y_test))

Precison: 0
Recall: 0
FScore: 0


### Problem 4: Naive Bayes

In [0]:
texts_df.head()

Unnamed: 0,texts,labels,text_length,New_Labels
0,(spoilers)<br /><br />I was blown away by this...,1,140,1
1,Joe Don Baker is one of a handful of actors wh...,1,254,1
2,I'd never seen an independent movie and I was ...,1,87,1
3,"Ok, at the beginning it looked like ""Shrek"" - ...",1,193,1
4,I fell in love with this silent action drama. ...,1,113,1


In [0]:
texts_df.shape

(25000, 4)

In [0]:
# using sample to run Naive Bayes because it fails on my laptop with memory error. Tried on colab, hadoop cloud but no luck.
import random
text_sample= texts_df.sample(n=10000,replace=False, random_state=None)
#data.iloc[0:5] # first five rows of dataframe

In [0]:
text_sample.shape

(10000, 4)

In [0]:
text_sample.head

<bound method NDFrame.head of                                                    texts  ...  New_Labels
12580  I got this movie in the $5 bin at walmart. I w...  ...           1
2454   Here's the kind of love story that I do enjoy ...  ...           1
21519  This movie was horrible. I swear they didn't e...  ...           1
24787  This film should have never been made. Honestl...  ...           1
9083   Having seen and loved Greg Lombardo's most rec...  ...           1
24810  C'mon guys some previous reviewers have nearly...  ...           1
15317  I won't add to the plot reviews, it's not very...  ...           1
9361   Made in 1946 and released in 1948, The Lady an...  ...           1
10956  I watched this movie a couple of days ago in a...  ...           1
6305   I'm surprised how many people give this move l...  ...           1
18232  The cover on the DVD and disc is freaking awes...  ...           1
16218  My wife and I just finished watching Bûsu AKA ...  ...           1
18719  G

In [0]:
# Define the documents
documents = text_sample['texts']
# Import the count vectorizer and initialize it
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()


In [0]:
documents.head(5)

4214     The man who directed 'The Third Man' also dire...
4031     I had pleasure to watch the short film "The Cu...
20933    Liongate has yet to prove itself. Every single...
4937     Well, I guess I'm emotionally attached to this...
2429     I have watched this movie well over 100-200 ti...
Name: texts, dtype: object

In [0]:
count_vector.fit(documents)
names = count_vector.get_feature_names()
names

['00',
 '000',
 '000s',
 '001',
 '003830',
 '007',
 '0080',
 '0083',
 '00am',
 '00pm',
 '00s',
 '01',
 '02',
 '020410',
 '03',
 '04',
 '05',
 '050',
 '06',
 '07',
 '08',
 '089',
 '08th',
 '09',
 '0s',
 '10',
 '100',
 '1000',
 '1000s',
 '1001',
 '100s',
 '100x',
 '100yards',
 '101',
 '101st',
 '102',
 '103',
 '104',
 '1040s',
 '105',
 '106',
 '107',
 '108',
 '109',
 '10p',
 '10s',
 '10th',
 '10x',
 '11',
 '110',
 '11001001',
 '112',
 '1138',
 '1146',
 '115',
 '116',
 '117',
 '11f',
 '11m',
 '11th',
 '12',
 '120',
 '1200',
 '1201',
 '1202',
 '123',
 '12383499143743701',
 '127',
 '128',
 '12m',
 '12mm',
 '12th',
 '13',
 '130',
 '1300',
 '1300s',
 '131',
 '1318',
 '134',
 '135',
 '135m',
 '138',
 '13k',
 '13s',
 '13th',
 '14',
 '140',
 '1408',
 '140hp',
 '142',
 '1454',
 '146',
 '1473',
 '14a',
 '14s',
 '14th',
 '14ème',
 '15',
 '150',
 '1500',
 '1500s',
 '150m',
 '1547',
 '156',
 '1561',
 '157',
 '158',
 '1594',
 '15mins',
 '15th',
 '16',
 '160',
 '1600',
 '1600s',
 '161',
 '1610',
 '163'

In [0]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
frequency_matrix = pd.DataFrame(data=doc_array, columns=names)
frequency_matrix.head(10)

Unnamed: 0,00,000,000s,001,003830,007,0080,0083,00am,00pm,00s,01,02,020410,03,04,05,050,06,07,08,089,08th,09,0s,10,100,1000,1000s,1001,100s,100x,100yards,101,101st,102,103,104,1040s,105,...,zsigmond,zu,zubeidaa,zucco,zucker,zuckerman,zucovic,zukhov,zukovic,zulu,zuni,zuniga,zuzz,zvezda,zvyagvatsev,zwick,zwrite,zx81,zy,zyuranger,zzzz,zzzzz,zzzzzzzz,zzzzzzzzzzzz,zé,álvaro,ánd,ángel,âme,äänekoski,écran,émigré,émigrés,était,état,étc,évery,ís,ísnt,über
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# split into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(texts_df['texts'], texts_df['labels'], random_state=1)
print('Number of rows in the total set: {}'.format(texts_df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 25000
Number of rows in the training set: 18750
Number of rows in the test set: 6250


In [0]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [0]:
naive_bayes = GaussianNB()
naive_bayes.fit(training_data.toarray(), y_train)
predictions = naive_bayes.predict(testing_data.toarray())

In [0]:
print("Precision: %0.2f" %precision_score(y_test, predictions , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, predictions , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, predictions , average="macro"))

Precision: 0.68
Recall:  0.68
F1-score:  0.67


### Problem 5 : Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(training_data.toarray(),y_train)
pred = LR.predict(testing_data.toarray())




In [0]:
print("Precision: %0.2f" %precision_score(y_test, pred , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, pred , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, pred , average="macro"))

Precision: 0.88
Recall:  0.88
F1-score:  0.88


### Problem 6: 
Performance is low when we model based on guess work or majority base  and based on baseline length model. That implies that machine learning model works more effective. This can be seen by Performance increases when we use Naive bayes or other classifier

### PRoblem 7: 

In [3]:
### using Pretained word vectors
!pip install mxnet
import mxnet
from mxnet import nd
from mxnet.contrib import text

text.embedding.get_pretrained_file_names().keys()

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/92/6c/c6e5562f8face683cec73f5d4d74a58f8572c0595d54f1fed9d923020bbd/mxnet-1.5.1.post0-py2.py3-none-manylinux1_x86_64.whl (25.4MB)
[K     |████████████████████████████████| 25.4MB 1.3MB/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Installing collected packages: graphviz, mxnet
  Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.5.1.post0


dict_keys(['glove', 'fasttext'])

In [4]:
print(text.embedding.get_pretrained_file_names('glove'))

['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.840B.300d.txt', 'glove.twitter.27B.25d.txt', 'glove.twitter.27B.50d.txt', 'glove.twitter.27B.100d.txt', 'glove.twitter.27B.200d.txt']


#### 1- Using "glove.6B.50d.txt" as pre-trained Glove embedding

In [5]:
glove_6b50d = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.50d.txt')

Downloading /root/.mxnet/embeddings/glove/glove.6B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.6B.zip...


In [0]:
def knn(W, x, k):
    # The added 1e-9 is for numerical stability
    cos = nd.dot(W, x.reshape((-1,))) / (
        (nd.sum(W * W, axis=1) + 1e-9).sqrt() * nd.sum(x * x).sqrt())
    topk = nd.topk(cos, k=k, ret_typ='indices').asnumpy().astype('int32')
    return topk, [cos[i].asscalar() for i in topk]

In [0]:

def get_similar_tokens(query_token, k, embed):
    topk, cos = knn(embed.idx_to_vec,
                    embed.get_vecs_by_tokens([query_token]), k+1)
    for i, c in zip(topk[1:], cos[1:]):  # Remove input words
        print('cosine sim=%.3f: %s' % (c, (embed.idx_to_token[i])))


In [0]:
def get_analogy(token_a, token_b, token_c, embed):
    vecs = embed.get_vecs_by_tokens([token_a, token_b, token_c])
    x = vecs[1] - vecs[0] + vecs[2]
    topk, cos = knn(embed.idx_to_vec, x, 1)
    return embed.idx_to_token[topk[0]] 
  

Evaluating embedding glove.6B.50d.txt

In [10]:
#capital-world
get_analogy('beijing', 'china', 'tokyo', glove_6b50d)  #capital- world is predicted correctly

'japan'

In [55]:
sim = glove_model300.n_similarity(['beijing', 'china'], ['tokyo', 'japan'])
print("{:.4f}".format(sim))

0.4909


  if np.issubdtype(vec.dtype, np.int):


In [27]:
#currency
get_analogy('algeria', 'dinar', 'japan', glove_6b50d)  #correct is "yen"

'japan'

In [56]:
sim = glove_model300.n_similarity(['algeria', 'dinar'], ['japan', 'japan'])
print("{:.4f}".format(sim))

0.1431


  if np.issubdtype(vec.dtype, np.int):


In [29]:
#city-in-state
get_analogy('chicago', 'illinois', 'nashville', glove_6b50d) #correct answer is "Tennessee"

'illinois'

In [57]:
sim = glove_model300.n_similarity(['chicago', 'illinois'], ['nashville', 'illinois'])
print("{:.4f}".format(sim))

0.8067


  if np.issubdtype(vec.dtype, np.int):


In [31]:
#family
get_analogy('boy ', 'girl', 'son', glove_6b50d) #correct answer is "daughter"

'mother'

In [58]:
sim = glove_model300.n_similarity(['boy', 'girl'], ['son', 'mother'])
print("{:.4f}".format(sim))

0.6299


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#gram1-adjective-to-adverb
get_analogy('apparent', 'apparently', 'cheerful', glove_6b50d) #correct answer is "cheerfully"


'cheerful'

In [59]:
sim = glove_model300.n_similarity(['apparent', 'apparently'], ['cheerful', 'cheerful'])
print("{:.4f}".format(sim))

0.0359


  if np.issubdtype(vec.dtype, np.int):


In [0]:
# gram2-opposite
get_analogy('aware', 'unaware', 'comfortable', glove_6b50d) #correct answer "uncomfortable"

'comfortable'

In [60]:
sim = glove_model300.n_similarity(['aware', 'unaware'], ['comfortable', 'comfortable'])
print("{:.4f}".format(sim))

0.2847


  if np.issubdtype(vec.dtype, np.int):


In [0]:
# gram3-comparative
get_analogy('bad', 'worse', 'easy', glove_6b50d) #correct answer "easier"

'easy'

In [61]:
sim = glove_model300.n_similarity(['bad', 'worse'], ['easy', 'easy'])
print("{:.4f}".format(sim))

0.3897


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#gram6nationality-adjective
get_analogy('belarus', 'belorussian', 'norway', glove_6b50d) #correct answer "Norwegian"

'dragoon'

In [62]:
sim = glove_model300.n_similarity(['belarus', 'belorussian'], ['norway', 'dragoon'])
print("{:.4f}".format(sim))

0.2253


  if np.issubdtype(vec.dtype, np.int):


#### 2- using glove.6B.100d.txt as pretrained embedding

In [0]:
glove_6b100d = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.100d.txt')

Evaluating embedding glove.6B.100d.txt

In [0]:
#capital-world
get_analogy('oslo', 'norway', 'cairo', glove_6b100d)  #capital- world is predicted correctly

'egypt'

In [63]:
sim = glove_model300.n_similarity(['oslo', 'norway'], ['cairo', 'egypt'])
print("{:.4f}".format(sim))

0.2967


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#currency
get_analogy('europe', 'euro', 'brazil', glove_6b100d)  #correct is "real"

'euro'

In [64]:
sim = glove_model300.n_similarity(['europe', 'euro'], ['brazil', 'euro'])
print("{:.4f}".format(sim))

0.7935


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#city-in-state
get_analogy('dallas', 'texas', 'atlanta', glove_6b100d) #correct answer is "Georgia"

'texas'

In [65]:
sim = glove_model300.n_similarity(['dallas', 'texas'], ['atlanta', 'texas'])
print("{:.4f}".format(sim))

0.8200


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#family
get_analogy('boy ', 'girl', 'dad', glove_6b100d) #correct answer is "mom"

'girl'

In [66]:
sim = glove_model300.n_similarity(['boy', 'girl'], ['dad', 'girl'])
print("{:.4f}".format(sim))

0.8737


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#gram1-adjective-to-adverb
get_analogy('free', 'freely', 'calm', glove_6b100d) #correct answer is "calmly"

'calm'

In [67]:
sim = glove_model300.n_similarity(['free', 'freely'], ['calm', 'calm'])
print("{:.4f}".format(sim))

0.2109


  if np.issubdtype(vec.dtype, np.int):


In [0]:
# gram2-opposite
get_analogy('likely', 'unlikely', 'sure', glove_6b100d) #correct answer "unsure"

'sure'

In [68]:
sim = glove_model300.n_similarity(['likely', 'unlikely'], ['sure', 'sure'])
print("{:.4f}".format(sim))

0.5304


  if np.issubdtype(vec.dtype, np.int):


In [0]:
# gram3-comparative
get_analogy('cheap', 'cheaper', 'old', glove_6b100d) #correct answer "older"

'old'

In [69]:
sim = glove_model300.n_similarity(['cheap', 'cheaper'], ['old', 'old'])
print("{:.4f}".format(sim))

0.1283


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#gram6nationality-adjective
get_analogy('chile', 'chilean', 'france', glove_6b100d) #correctly predicted

'french'

In [70]:
sim = glove_model300.n_similarity(['chile', 'chilean'], ['france', 'french'])
print("{:.4f}".format(sim))

0.2954


  if np.issubdtype(vec.dtype, np.int):


In [51]:
import gensim.downloader as api
glove_model300 = api.load('glove-wiki-gigaword-300')



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [52]:
 from gensim.test.utils import datapath
 analogy_scores = glove_model300.wv.evaluate_word_analogies(datapath('questions-words.txt'))

  
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


### Probelm 8

In [0]:
get_similar_tokens('increase', 10, glove_6b50d)

cosine sim=0.948: increases
cosine sim=0.948: increased
cosine sim=0.904: increasing
cosine sim=0.892: decrease
cosine sim=0.880: reduction
cosine sim=0.872: reducing
cosine sim=0.868: reduced
cosine sim=0.867: growth
cosine sim=0.865: reduce
cosine sim=0.865: higher


In [0]:
get_similar_tokens('white', 10, glove_6b50d)

cosine sim=0.906: black
cosine sim=0.874: green
cosine sim=0.861: gray
cosine sim=0.861: brown
cosine sim=0.823: blue
cosine sim=0.815: red
cosine sim=0.749: colored
cosine sim=0.743: orange
cosine sim=0.735: bright
cosine sim=0.731: dark


In [0]:
get_similar_tokens('negligible', 10, glove_6b50d)

cosine sim=0.801: insignificant
cosine sim=0.786: appreciable
cosine sim=0.768: decreasing
cosine sim=0.766: decrease
cosine sim=0.764: attributable
cosine sim=0.760: decreases
cosine sim=0.755: proportion
cosine sim=0.750: discernible
cosine sim=0.745: conversely
cosine sim=0.728: exceeds


The reason why Antonyms show up in top 10 similar words is because embedding is based on words that have similar context. Antonyms words have similar context too and that is why the opposite words show up in embedding.

### Probelm 9

Evaluation for glove.6B.50d.txt

In [0]:
#country capital
get_analogy('india', 'delhi', 'croatia', glove_6b50d) #correctly predicted

'zagreb'

In [72]:
sim = glove_model300.n_similarity(['india', 'delhi'], ['croatia', 'zagreb'])
print("{:.4f}".format(sim))

0.1677


  if np.issubdtype(vec.dtype, np.int):


In [0]:
# places-people
get_analogy('university', 'students', 'hospital', glove_6b50d)  #correct answer should be "patients"

'elderly'

In [73]:
sim = glove_model300.n_similarity(['university', 'students'], ['hospital', 'elderly'])
print("{:.4f}".format(sim))

0.3711


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#opposites
get_analogy('wet', 'dry', 'complete', glove_6b50d)   #correct answer should be "incomplete"

'complete'

In [74]:
sim = glove_model300.n_similarity(['wet', 'dry'], ['complete', 'complete'])
print("{:.4f}".format(sim))

0.0922


  if np.issubdtype(vec.dtype, np.int):


Evaluation for glove.6B.100d.txt

In [0]:
#country capital
get_analogy('greece', 'athens', 'malaysia', glove_6b100d) #correctly predicted

'kuala'

In [75]:
sim = glove_model300.n_similarity(['greece', 'athens'], ['malaysia', 'kuala'])
print("{:.4f}".format(sim))

0.1734


  if np.issubdtype(vec.dtype, np.int):


In [0]:
#places-people
get_analogy('school', 'students', 'zoo', glove_6b100d)  #correct answer should be "animals"

'zoo'

In [76]:
sim = glove_model300.n_similarity(['school', 'students'], ['zoo', 'zoo'])
print("{:.4f}".format(sim))

0.1316


  if np.issubdtype(vec.dtype, np.int):


In [0]:
get_analogy('burger', 'eat', 'juice', glove_6b100d)  #answer should be drink

'juice'

In [77]:
sim = glove_model300.n_similarity(['burger', 'eat'], ['juice', 'juice'])
print("{:.4f}".format(sim))

0.3305


  if np.issubdtype(vec.dtype, np.int):
