# Assignment - Probabilistic Graphical Models
### Year 2020-2021- Semester I
### CCE5225
####  Developed by - Adrian Muscat, 2020
---
Zachary Cauchi, 197999M, BSc CS, Yr I

Submit a pdf version (with the attached plagiarism form) of the final jupyter notebook (as a turn-it-in job on VLE) and the jupyter notebook itself separately (as an assignment job on VLE)

This assignment is to be attempted individually. It is essential that the work you submit and present consists only of your own work; use of copied material will be treated as plagiarism. Discussion is only permitted on general issues, and it is absolutely forbidden to discuss specific details with anyone and/or share results.



In [1]:
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import pandas
from collections import Counter

def saveAnswer(obj, name):
    answer_file = open(f'saved_answers/{name}.pkl', 'wb')
    pickle.dump(obj, answer_file)
    answer_file.close()

In [2]:
infile = open('MLC_data_2020_21.pkl','rb')
data = pickle.load(infile, encoding='latin1')
infile.close()

In [3]:
# Explore dataset
print("First split is into :",data.keys(),'\n')
#
# Lets explore the development set
# This is organised into three lists
print("The three lists are",data['development'].keys(),'\n')
#
# The first element of each list corresponds to the object_labels, 
# geomteric features and output labels for the first example
# ...and so on
# Lets explore the first example
train_obj_labels = data['development']['object_labels']
train_out_labels = data['development']['output_labels']
train_geo_feat = data['development']['geometric_features']
test_obj_labels = data['test']['object_labels']
test_out_labels = data['test']['output_labels']
test_geo_feat = data['test']['geometric_features']
print("There are",len(train_obj_labels), "examples in dev set\n")
print("First example:")
print(train_obj_labels[0])
print(train_out_labels[0])
print(train_geo_feat[0])
print("\nSecond example:")
print(train_obj_labels[1])
print(train_out_labels[1])
print(train_geo_feat[1])
print("\n...")

First split is into : dict_keys(['development', 'test']) 

The three lists are dict_keys(['object_labels', 'output_labels', 'geometric_features']) 

There are 4253 examples in dev set

First example:
['2008_001130.jpg', 'tvmonitor', 'bottle']
['next_to', 'at_the_level_of', 'near']
[ 0.68888274  0.07051991  0.          0.88679245  0.39215686  0.63316053
  0.109375    1.36170213  1.14893617  1.06603774  0.58490566  9.76862745
  0.5546875  -0.30530973]

Second example:
['2008_002210.jpg', 'person_2', 'diningtable']
['behind', 'opposite', 'near']
[ 0.43984962  0.28696742  0.16        0.40206186  2.36082474  0.48306117
  0.          2.27350427  0.31623932  1.          0.66666667  1.53275109
  0.34962406 -0.33333333]

...


In [4]:
# Example 
# Learning the one-hot encoder

# read all prepositions in multilabel examples and flatten
all_preps=[]
for Y in data['development']['output_labels']:
    for y in Y:
        all_preps.append(y)

values = np.array(all_preps).reshape(len(all_preps),)
print("Shape of values", values.shape,'\n')

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Unique labels:\n",label_encoder.classes_,'\n')

# onehot encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("Example of One-Hot Encoded:\n", onehot_encoded[0],'\n')

# single label encoding for first example
print("Consider first example\n")
b = np.array(data['development']['output_labels'][0])
b = b.reshape(len(b),1)
print("Output Labels:\n",b)
print("\nOne-Hot encoded labels:")
for i in b:
    a = label_encoder.transform(i)
    print(onehot_encoder.transform(a.reshape(-1, 1))[0])



Shape of values (9180,) 

Unique labels:
 ['above' 'against' 'along' 'around' 'at_the_level_of' 'behind' 'beyond'
 'far from' 'in' 'in_front_of' 'near' 'next_to' 'none' 'on' 'opposite'
 'outside_of' 'under'] 

Example of One-Hot Encoded:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] 

Consider first example

Output Labels:
 [['next_to']
 ['at_the_level_of']
 ['near']]

One-Hot encoded labels:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


# Section 1: Preparing the data

In [5]:
# 1.a. Computing the mean output label count per example, per dataset (development and test)
average_out_count_train = 0
average_out_count_test = 0

for row in train_out_labels:
    average_out_count_train += len(row)
for row in test_out_labels:
    average_out_count_test += len(row)

average_out_count_train /= len(train_out_labels)
average_out_count_test /= len(test_out_labels)

print('Answer to 1.a:')
print('Mean output labels per row (train set): ', average_out_count_train)
print ('Mean output labels per row (test set): ', average_out_count_test)

saveAnswer({
    'train_average_out': average_out_count_train,
    'test_average_out': average_out_count_test
}, '1a')


Answer to 1.a:
Mean output labels per row (train set):  2.1584763696214435
Mean output labels per row (test set):  2.148496240601504


In [6]:
# 1.b. Flatten the output labels to a 1-d array, computing the distribution for both datasets

# Flatten the labels into a 1D array
flat_out_train = np.concatenate(train_out_labels)
flat_out_test = np.concatenate(test_out_labels)

# Count the numbers of each label
train_out_counts = Counter(flat_out_train)
test_out_counts = Counter(flat_out_test)

# Create dataframes from each counter object above.
train_out_counts_df = pandas.DataFrame.from_dict(train_out_counts, orient='index')
train_out_counts_df.index.name = 'Label distribution in development (train) set'
test_out_counts_df = pandas.DataFrame.from_dict(test_out_counts, orient='index')
test_out_counts_df.index.name = 'Label distribution in test set'

print("Results for 1.b:")
display(train_out_counts_df)
display(test_out_counts_df)

saveAnswer({
    'train_out_counts': train_out_counts_df,
    'test_out_counts': test_out_counts_df
}, '1b')

Results for 1.b:


Unnamed: 0_level_0,0
Label distribution in development (train) set,Unnamed: 1_level_1
next_to,1411
at_the_level_of,926
near,2276
behind,1055
opposite,267
on,359
in_front_of,1102
above,117
under,432
far from,376


Unnamed: 0_level_0,0
Label distribution in test set,Unnamed: 1_level_1
in_front_of,270
against,136
next_to,359
at_the_level_of,227
near,578
under,101
behind,270
far from,100
on,88
opposite,66


In [7]:
# 1.c. Computing the composite output labels (without flattening like in 1.b) for both datasets.

# Same as above, compute the occurances of each composite output label.
# Unlike above, we first need to transform each row from an unhashable list to a hashable tuple object.
train_cmp_out_counts = Counter(map(tuple, train_out_labels))
test_cmp_out_counts = Counter(map(tuple, test_out_labels))

train_cmp_out_counts_df = pandas.DataFrame.from_dict(train_cmp_out_counts, orient='index')
train_cmp_out_counts_df.index.name = 'Composite output label distribution in development (train) set'
test_cmp_out_counts_df = pandas.DataFrame.from_dict(test_cmp_out_counts, orient='index')
test_cmp_out_counts_df.index.name = 'Composite output label distribution in test set'

print('Results for 1.c:')
display(train_cmp_out_counts_df)
display(test_cmp_out_counts_df)

saveAnswer({
    'train_out_counts': train_cmp_out_counts_df,
    'test_out_counts': test_cmp_out_counts_df
}, '1c')


Results for 1.c:


Unnamed: 0_level_0,0
Composite output label distribution in development (train) set,Unnamed: 1_level_1
"(next_to, at_the_level_of, near)",509
"(behind, opposite, near)",3
"(on,)",135
"(in_front_of, near)",269
"(near, behind)",31
...,...
"(opposite, beyond)",1
"(in_front_of, opposite, under)",1
"(outside_of, next_to, at_the_level_of, near)",1
"(in_front_of, next_to, opposite, near)",1


Unnamed: 0_level_0,0
Composite output label distribution in test set,Unnamed: 1_level_1
"(in_front_of, against)",7
"(next_to, at_the_level_of, near)",132
"(under,)",53
"(at_the_level_of,)",9
"(in_front_of, next_to, at_the_level_of, near)",2
...,...
"(above, next_to, against, behind, near)",1
"(in, on)",1
"(in_front_of, next_to, against)",1
"(around, against, near)",1


In [20]:
# 1.d. Compute a word-word co occurrence probability distribution
train_1d_out_labels = [' '.join(label) for label in train_out_labels]

from sklearn.feature_extraction.text import CountVectorizer

count_model = CountVectorizer(ngram_range=(1,2), vocabulary=label_encoder.classes_) # default unigram model
X = count_model.fit_transform(train_1d_out_labels)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below)
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
df = pandas.DataFrame(Xc.todense(), index = count_model.vocabulary_, columns=count_model.vocabulary_)
display(df) # print out matrix in dense format


Unnamed: 0,above,against,along,around,at_the_level_of,behind,beyond,far from,in,in_front_of,near,next_to,none,on,opposite,outside_of,under
above,117,4,0,0,6,34,2,10,1,14,67,24,0,5,6,2,0
against,4,593,5,5,101,71,1,0,13,67,62,136,0,205,11,3,140
along,0,5,69,0,18,21,0,1,0,22,54,56,0,0,0,0,1
around,0,5,0,34,0,2,0,0,0,0,0,0,0,0,0,0,7
at_the_level_of,6,101,18,0,926,30,1,18,0,40,718,752,0,0,46,4,5
behind,34,71,21,2,30,1055,25,161,0,68,590,199,0,1,38,13,38
beyond,2,1,0,0,1,25,42,19,0,10,6,1,0,0,3,0,1
far from,10,0,1,0,18,161,19,376,0,176,1,3,0,0,10,10,5
in,1,13,0,0,0,0,0,0,56,0,0,0,0,25,0,0,0
in_front_of,14,67,22,0,40,68,10,176,0,1102,601,209,0,18,72,17,29


In [16]:
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd
 
 
def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
 
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
 
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index

matrix, vocab_index = generate_co_occurrence_matrix(np.concatenate(train_out_labels))
 
 
data_matrix = pd.DataFrame(matrix, index=vocab_index,
                             columns=vocab_index)
display(data_matrix)

# WIP 1.d

Unnamed: 0,outside_of,none,next_to,at_the_level_of,against,on,above,behind,under,beyond,around,opposite,far from,near,along,in_front_of,in
outside_of,0.0,0.0,4.0,1.0,1.0,1.0,3.0,6.0,0.0,0.0,0.0,0.0,1.0,15.0,0.0,11.0,0.0
none,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,12.0,1.0,0.0,1.0
next_to,13.0,7.0,13.0,72.0,159.0,38.0,19.0,169.0,83.0,9.0,7.0,46.0,80.0,485.0,11.0,184.0,15.0
at_the_level_of,2.0,1.0,735.0,5.0,11.0,5.0,2.0,20.0,4.0,0.0,0.0,23.0,13.0,82.0,1.0,21.0,1.0
against,1.0,1.0,29.0,52.0,11.0,185.0,3.0,49.0,129.0,2.0,5.0,4.0,9.0,72.0,1.0,39.0,1.0
on,1.0,3.0,7.0,11.0,38.0,1.0,3.0,21.0,22.0,4.0,2.0,0.0,24.0,184.0,1.0,31.0,6.0
above,0.0,0.0,13.0,5.0,11.0,5.0,0.0,23.0,3.0,0.0,1.0,5.0,3.0,36.0,0.0,11.0,1.0
behind,7.0,3.0,119.0,45.0,99.0,44.0,18.0,35.0,60.0,5.0,5.0,31.0,94.0,382.0,1.0,100.0,7.0
under,0.0,1.0,9.0,20.0,30.0,11.0,1.0,34.0,6.0,2.0,2.0,7.0,26.0,228.0,1.0,49.0,5.0
beyond,0.0,0.0,0.0,1.0,1.0,0.0,1.0,18.0,0.0,0.0,0.0,2.0,8.0,3.0,0.0,8.0,0.0
