In [1]:
import re
import pprint
import nltk
nltk.download("brown"); nltk.download('universal_tagset')
from nltk.corpus import brown
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

[nltk_data] Downloading package brown to /home/jupyter/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
def pos_features(sentence, i, history): 
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [3]:
class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents, features=pos_features):
        self.features = features
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = self.features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [4]:
tagger = ConsecutivePosTagger(train_sents)
print(round(tagger.evaluate(test_sents), 4))

0.7915


## 1.1.1

In [5]:
def divide_data(dataset, train_test_validate):
    if len(train_test_validate) == 3:
        if sum(train_test_validate) == 1.:
            news_test = []; news_dev_test = []; news_train = []
            
            dataset_len = len(dataset)
            
            slicer = list(int(ttv * dataset_len) for ttv in train_test_validate)
            
            news_test     = dataset[:slicer[0]]
            news_dev_test = dataset[slicer[0]:slicer[0] + slicer[1]]
            news_train    = dataset[slicer[0] + slicer[1]:]

            return news_test, news_dev_test, news_train        
        
        else:
            print("train_test_validate must be of length 3 with the sum of it's elements eqyal to 1")
            return 
    else:
        print("train_test_validate must be of length 3 with the sum of it's elements eqyal to 1")
        return

tagged_sents_uni = brown.tagged_sents(categories='news', tagset='universal')
news_test, news_dev_test, news_train  = divide_data(tagged_sents_uni, [0.1,0.1, 0.8])

In [6]:
print(len(tagged_sents_uni))
print(len(news_test), len(news_dev_test), len(news_train))

4623
462 462 3699


In [7]:
tagger_task_1a = ConsecutivePosTagger(news_train)
print(round(tagger_task_1a.evaluate(news_dev_test), 4))

0.8689


## 1.1.2

I didn't quite manage this task but I think Unigram from chapter 5 of the nltk book needs to be used

# 1.2

In [8]:
import numpy as np
import sklearn

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


class ScikitConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents, 
                 features=pos_features, clf = BernoulliNB()):
        # Using pos_features as default.
        self.features = features
        train_features = []
        train_labels = []
        for tagged_sent in train_sents:
            history = []
            untagged_sent = nltk.tag.untag(tagged_sent)
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_features.append(featureset)
                train_labels.append(tag)
                history.append(tag)
        v = DictVectorizer()
        X_train = v.fit_transform(train_features)
        y_train = np.array(train_labels)
        clf.fit(X_train, y_train)
        self.classifier = clf
        self.dict = v

    def tag(self, sentence):
        test_features = []
        history = []
        for i, word in enumerate(sentence):
            featureset = self.features(sentence, i, history)
            test_features.append(featureset)
        X_test = self.dict.transform(test_features)
        tags = self.classifier.predict(X_test)
        return zip(sentence, tags)

## 1.2.1a

In [9]:
tagger_task_2a = ScikitConsecutivePosTagger(news_train)
print(round(tagger_task_2a.evaluate(news_dev_test), 4))

0.857


Close to what we got in last exercise

## 1.2.2b

In [10]:
alpha =  [1, 0.5, 0.1, 0.01, 0.001, 0.0001]
        
for element in alpha:
    clf = BernoulliNB(alpha=element)
    tagger_task_2b = ScikitConsecutivePosTagger(news_train, features = pos_features, clf = clf)
    print("Alpha:", element, " ", round(tagger_task_2b.evaluate(news_dev_test), 4))

Alpha: 1   0.857
Alpha: 0.5   0.8749
Alpha: 0.1   0.8695
Alpha: 0.01   0.8683
Alpha: 0.001   0.8651
Alpha: 0.0001   0.8631


## 1.2.3c

In [11]:
def improved_pos_features(sentence, i, history):
    """
    Improve pos_features to not only consider the previous word, but the word itself.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:],
                "suffix(4)": sentence[i]}       # Word
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [12]:
for element in alpha:
    clf = BernoulliNB(alpha=element)
    tagger_task_2b = ScikitConsecutivePosTagger(news_train, features = improved_pos_features, clf = clf)
    print("Alpha:", element, " ", round(tagger_task_2b.evaluate(news_dev_test), 4))

Alpha: 1   0.8874
Alpha: 0.5   0.9166
Alpha: 0.1   0.9244
Alpha: 0.01   0.9303
Alpha: 0.001   0.933
Alpha: 0.0001   0.934


## 1.3.1a

In [13]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter = 500)
tagger_task_3a = ScikitConsecutivePosTagger(news_train, features = improved_pos_features, clf = clf)
print(round(tagger_task_3a.evaluate(news_dev_test), 4))

0.9518


## 1.3.2b

In [14]:
new_alpha = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

for element in new_alpha:
    clf = LogisticRegression(C = element, max_iter = 1000)
    tagger_task_3b = ScikitConsecutivePosTagger(news_train, features = improved_pos_features, clf = clf)
    print("Alpha:", element, " ", round(tagger_task_3b.evaluate(news_dev_test), 4))

Alpha: 0.01   0.8499
Alpha: 0.1   0.9265
Alpha: 1.0   0.9518
Alpha: 10.0   0.9555
Alpha: 100.0   0.9533


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Alpha: 1000.0   0.95


## 1.4.1a

In [15]:
def further_improved_pos_features(sentence, i, history):
    """
    Improve pos_features to not only consider the previous word, but the word itself.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:],
                "suffix(4)": sentence[i]}       # Word
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        
    if i < len(sentence) - 1:
        features["next-word"] = sentence[i+1]
    else:
        "<END>"
    return features


clf = LogisticRegression(C = 10.0, max_iter = 1000)
tagger_task_4a = ScikitConsecutivePosTagger(news_train, features = further_improved_pos_features, clf = clf)
print(round(tagger_task_4a.evaluate(news_dev_test), 4))

0.9663


## 1.4.2b

In [16]:
def further_further_improved_pos_features(sentence, i, history):
    """
    Improve pos_features to not only consider the previous word, but the word itself.
    """
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:],
                "suffix(4)": sentence[i]}       # Word
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        
    """ # No improvments
    if sentence[i][0].isupper():
        features["word-first-upper"] = 1
    else:
        0
    """
    
    """ # No improvments
    if "-" in sentence[i]:
        features["word-hypen"] = 1
    else:
        0
    """
        
    if i < len(sentence) - 1:
        features["next-word"] = sentence[i+1]
    else:
        "<END>"
        
    return features

clf = LogisticRegression(C = 10.0, max_iter = 1000)
tagger_task_4b = ScikitConsecutivePosTagger(news_train, features = further_further_improved_pos_features, clf = clf)
print(round(tagger_task_4b.evaluate(news_dev_test), 4))

0.9663


## 1.5.1a

In [17]:
# No improvments gained. Therefore will continue with previous tagger
best_pos_features = further_improved_pos_features

clf = LogisticRegression(C = 10.0, max_iter = 1000)
tagger_task_5a = ScikitConsecutivePosTagger(news_train, features=best_pos_features, clf = clf)
print(round(tagger_task_5a.evaluate(news_test), 4))

0.9659


Our result is a little bit worse than what we got on `news_dev_test`

## 1.5.2b

In [20]:
all_sents = ["belles_lettres", "editorial", "fiction", "government", "humor", "learned",
       "lore", "mystery", "religion", "reviews", "romance", "science_fiction"]

tagged_sents_5b = brown.tagged_sents(categories = all_sents, tagset = "universal")
rest_test, rest_dev_rest, rest_train = divide_data(tagged_sents_5b, [0.1,0.1, 0.8])
train = rest_train + news_train
test = rest_test + news_test

In [18]:
# Rest same as 1b which I didn't quite manage

## 1.5.3c

In [19]:
clf = LogisticRegression(C = 10.0, max_iter = 1000)
tagger_task_5dc = ScikitConsecutivePosTagger(train, features = best_pos_features, clf = clf)
print(round(tagger_task_5cd.evaluate(test), 4))

NameError: name 'train' is not defined

This didn't run on my computer as it drained my entire memory. But it should be correct. 

Comment on task: When I try to run 1.5.3b I get a "memory limit reached" problem and my kernel restarts. Unfortunately my laptop is quite old and cannot handle this task. However, I have implemented the subsequent tasks following. But I cannot test them

## 1.5.4d

In [None]:
adventure = brown.tagged_sents(categories = 'adventure', tagset = 'universal')
print(round(tagger_task_5cd.evaluate(adventure), 4))

hobbies   = brown.tagged_sents(categories = 'hobbies',   tagset = 'universal')
print(round(tagger_task_5cd.evaluate(hobbies), 4))

## 1.6.1a

In [None]:
news_hmm_tagger_task_6a  = nltk.HiddenMarkovModelTagger.train(news_train)
print(round(tagger_task_6a.evaluate(news_dev_test), 4))

news_hmm_tagger2_task_6a = nltk.HiddenMarkovModelTagger.train(train)
print(round(tagger2_task_6a.evaluate(test), 4))

## 1.6.2b

In [None]:
per_tagger = nltk.PerceptronTagger(load = False)
per_tagger.train(news_train)
print(round(per_tagger.evaluate(news_dev_test), 4))

per_tagger2 = nltk.PerceptronTagger(load = False)
per_tagger2.train(train)
print(round(per_tagger2.evaluate(test), 4))

I can't compare the speed due to the reason I mentioned above

# Part b

## Exercise 1

### a)

In [21]:
import logging
import gensim.downloader

word_vec = gensim.downloader.load("glove-wiki-gigaword-100")

words = list()
for word in word_vec.vocab:
    words.append(word)
    
print("Nr of words:", len(words))

Nr of words: 400000


### b

In [22]:
def norm(word_vec):
    return np.sqrt(np.sum(word_vec * word_vec))

# Formula https://en.wikipedia.org/wiki/Cosine_similarity under Definition
def cos_between_vecs(word_vec1, word_vec2):
    return np.dot(word_vec1, word_vec2) / (norm(word_vec1) * norm(word_vec2))

### c

In [23]:
test1 = cos_between_vecs(word_vec["king"], word_vec["queen"])
test2 = word_vec.similarity("king", "queen")
print("Similarity with my func:", test1, "\nSimilarity with gensim func:", test2)

Similarity with my func: 0.750769 
Similarity with gensim func: 0.7507691


Very similar results

## Exercise 2

### a

In [24]:
wv = word_vec
print(wv.most_similar('car', topn=5))
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

# Oslo not in my model
#print(wv.most_similar(positive=['Oslo', 'Sweden'], negative=['Norway'], topn=5))

print(wv.most_similar(positive=['man', 'queen'], negative = ['king'], topn=5))
print(wv.most_similar(positive=['queen', 'man'], negative=['king'], topn=5))

print(wv.most_similar(positive=['kitten', 'dog'], negative=['cat'], topn=5))

[('vehicle', 0.8630838394165039), ('truck', 0.8597878217697144), ('cars', 0.837166965007782), ('driver', 0.8185911178588867), ('driving', 0.7812635898590088)]
[('suv', 0.8626778721809387), ('truck', 0.8475820422172546), ('vehicle', 0.8358194231987), ('jeep', 0.791013777256012), ('cars', 0.7910056114196777)]
[('woman', 0.8183382749557495), ('girl', 0.7466667890548706), ('she', 0.695443332195282), ('her', 0.6720750331878662), ('mother', 0.6705917119979858)]
[('woman', 0.8183382749557495), ('girl', 0.7466667890548706), ('she', 0.695443332195282), ('her', 0.6720750331878662), ('mother', 0.6705917119979858)]
[('puppy', 0.6856015920639038), ('rottweiler', 0.5782471299171448), ('puppies', 0.5646027326583862), ('pug', 0.5142439007759094), ('toddler', 0.5109227895736694)]


### b

In [25]:
a = wv['king'] + wv['woman'] - wv['man']

for noun in ["queen", "woman", "man", "king"]:
    #cos_score = wv.similarity(a, noun) # Gives error?
    cos_score = cos_between_vecs(a, wv[noun])
    print("For", noun, "the cosine was:", cos_score)


wv.similar_by_vector(a)

For queen the cosine was: 0.7834413
For woman the cosine was: 0.55754864
For man the cosine was: 0.39337876
For king the cosine was: 0.85518366


[('king', 0.8551837205886841),
 ('queen', 0.7834413647651672),
 ('monarch', 0.6933802366256714),
 ('throne', 0.6833109855651855),
 ('daughter', 0.680908203125),
 ('prince', 0.6713142395019531),
 ('princess', 0.664408266544342),
 ('mother', 0.6579325199127197),
 ('elizabeth', 0.6563301086425781),
 ('father', 0.6392419338226318)]

### c

In [26]:
# Doesn't work on my model
#print(wv.doesnt_match(['Norway', 'Denmark', 'Finland', 'Sweden', 'Spain', 'Stockholm']))

print(wv.doesnt_match(["car", "bus", "train", "ship", "plane", "bike"]))
print(wv.doesnt_match(["apple", "orange", "pear", "banana", "lemon", "tomatoes"]))
print(wv.doesnt_match(["dog", "cat", "sheep", "cow", "pig", "horse"]))

ship
tomatoes
horse


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


## Exercise 3

### a

In [27]:
from gensim.models import Word2Vec
model_brown = Word2Vec(sentences = brown.sents())

words_in_brown = list()
for word in model_brown.wv.vocab:
    words_in_brown.append(word)
    
print("Difference between words in glove-wiki and brown:", len(words) / len(words_in_brown))

Difference between words in glove-wiki and brown: 26.36261780794833


Glove-Wiki is about 26 times larger than Brown

### c

In [28]:
print(model_brown.wv.most_similar(positive=['man', 'queen'], negative = ['king'], topn=5))
print(model_brown.wv.most_similar(positive=['queen', 'man'], negative=['king'], topn=5))

print(model_brown.wv.most_similar(positive=['kitten', 'dog'], negative=['cat'], topn=5))

[('boy', 0.8149065971374512), ('woman', 0.7971924543380737), ('girl', 0.7894347906112671), ('young', 0.7244738936424255), ('old', 0.715161919593811)]
[('boy', 0.8149065971374512), ('woman', 0.7971924543380737), ('girl', 0.7894347906112671), ('young', 0.7244738936424255), ('old', 0.715161919593811)]
[('follow', 0.8970865607261658), ('choose', 0.8933405876159668), ('enrich', 0.8910219669342041), ('greeting', 0.8892170786857605), ('begin', 0.8875006437301636)]


## Exercise 4

In [30]:
from gensim.test.utils import datapath

path=datapath('questions-words.txt')

model_evaluated = wv.evaluate_word_analogies(path)
print(model_evaluated[0])

0.6329672585445961
