In [75]:
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import spacy
from operator import itemgetter
import numpy as np
import io
import random
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import time
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report

# Load spaCy's 'en_core_web_sm' English pipeline.
# NOTE(review): `nlp` is not referenced by any of the visible cells — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Probe once for a usable CUDA device and report the result.
cuda = torch.cuda.is_available()
print('CUDA is', cuda)

# Worker count: 8 when CUDA is present, otherwise 0.
num_workers = 0 if not cuda else 8
print(num_workers)

CUDA is False
0


In [3]:
# Parse the GloVe text file into a {word: embedding} lookup.
# Each non-empty line is "<word> <v1> <v2> ..." separated by whitespace.
with open('../Data/glove.6B.50d.txt', 'r', encoding='utf8') as f:
    glove_file = f.read()

glove_sentences = glove_file.splitlines()
glove_vocab = {}
for sentence in glove_sentences:
    # Split once per line: the first token is the word, the rest its vector.
    fields = sentence.split()
    if not fields:
        # Tolerate blank lines instead of raising IndexError on fields[0].
        continue
    word = fields[0]
    embedding = np.array(fields[1:], dtype=float)
    glove_vocab[word] = embedding

In [4]:
def _score_adjectives(entries):
    """Map each tagged entry to ``[entry[2], score]`` keyed by ``entry[0]``.

    The score is 0 when ``entry[3] == 'a'`` and 1 otherwise. If a key occurs
    more than once, the last occurrence wins.

    NOTE(review): the meaning of the 'a' tag and of ``entry[2]`` is not visible
    in this notebook — confirm against the tagging scheme of the JSON files.
    """
    return {entry[0]: [entry[2], 0 if entry[3] == 'a' else 1] for entry in entries}


# CALCULATING AMBIGUITY SCORES IN IS ADJECTIVES (first tagged file)
# The entries to score are the last element of the JSON document.
# `with` closes the file handle as soon as the JSON has been parsed.
with open('../Data/furniture_cleaned-tagged_m.json') as f:
    data = json.load(f)
ambiguity_m = _score_adjectives(data[-1])

# CALCULATING AMBIGUITY SCORES IN IS ADJECTIVES (second tagged file)
with open('../Data/furniture_cleaned-tagged_a.json') as f:
    data = json.load(f)
ambiguity_a = _score_adjectives(data[-1])
    

In [27]:
# Merge the two annotators' labels into one {adjective: score} table.
# Manuel's labels take preference; swap the two steps below to prefer Akshat's.
# An entry is kept only if its first field is non-zero and the adjective has a
# GloVe embedding.
ambiguity = {
    adj: entry[1]
    for adj, entry in ambiguity_m.items()
    if int(entry[0]) != 0 and adj in glove_vocab
}

# Fill in adjectives that were not already taken from the first table.
ambiguity.update({
    adj: entry[1]
    for adj, entry in ambiguity_a.items()
    if int(entry[0]) != 0 and adj not in ambiguity and adj in glove_vocab
})

In [37]:
# Pair every labelled adjective with its GloVe vector: [embedding, label].
all_data = [[glove_vocab[adj], ambiguity[adj]] for adj in ambiguity]

# Shuffle with a dedicated, seeded generator so the train/test split is the
# same on every Restart & Run All, without touching the global `random` state.
SPLIT_SEED = 0
random.Random(SPLIT_SEED).shuffle(all_data)
size = len(all_data)

# Creating a balanced test set: the first `n_test` shuffled examples of each
# label go to the test set, everything else to the training set.
split = 0.1  # fraction of all examples reserved for the test set
n_test = int((size * split) / 2)
counters = [0, 0]  # test examples taken so far, indexed by label (0 or 1)
training_data = []
test_data = []

for word, label in all_data:
    if counters[label] < n_test:
        test_data.append([word, label])
        counters[label] += 1
    else:
        training_data.append([word, label])


In [38]:
class MyDataset(Dataset):
    """Dataset over a sequence of ``[features, label]`` records.

    ``features`` must be a NumPy array (it is handed to ``torch.from_numpy``);
    the label is returned unchanged.
    """

    def __init__(self, X):
        # Keep a reference to the records; nothing is converted up front.
        self.X = X

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(float32 feature tensor, label)`` for record ``index``."""
        record = self.X[index]
        features = torch.from_numpy(record[0]).float()
        return features, record[1]

In [39]:
batch_size = 8

# Training batches: shuffle=True so the loader reshuffles the data each epoch.
train_dataset = MyDataset(training_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test batches: fixed order.
test_dataset = MyDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [40]:
class My_MLP_Model(nn.Module):
    """Three-layer perceptron: 50 inputs -> 128 -> 8 -> 2 output logits."""

    def __init__(self):
        super().__init__()
        # Two hidden layers followed by the 2-way output layer.
        self.fc1 = nn.Linear(50, 128)
        self.fc2 = nn.Linear(128, 8)
        self.fc_last = nn.Linear(8, 2)

    def forward(self, x):
        """Return raw (un-normalised) scores for the two classes."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the output: the logits are returned as-is.
        return self.fc_last(hidden)

In [72]:
def train_epoch(model, train_loader, criterion, optimizer):
    """Train `model` for one pass over `train_loader` and print a summary.

    Args:
        model: network to optimise; each batch is moved to the device that the
            model's parameters live on.
        train_loader: sized iterable yielding ``(data, target)`` batches.
        criterion: loss callable invoked as ``criterion(outputs, target)``.
        optimizer: optimizer whose ``zero_grad`` / ``step`` apply the updates.

    Returns:
        The mean per-batch training loss for the epoch (float).
    """
    model.train()

    # Take the device from the model itself instead of a notebook-level global,
    # so the function does not depend on which cells were run beforehand.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0

    start_time = time.time()
    for data, target in train_loader:
        optimizer.zero_grad()   # .backward() accumulates gradients
        data = data.to(device)
        target = target.to(device)  # all data & model on same device

        outputs = model(data)
        loss = criterion(outputs, target)
        running_loss += loss.item()

        # Accuracy bookkeeping: the predicted class is the arg-max logit.
        # Only the summed count leaves the device, so this also works on CUDA.
        predicted = outputs.detach().argmax(dim=1)
        total_predictions += target.size(0)
        correct_predictions += (predicted == target).sum().item()

        loss.backward()
        optimizer.step()

    end_time = time.time()

    print('------ Training -----')
    print('')
    running_loss /= len(train_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    print('Training Accuracy: ', acc, '%')
    return running_loss

In [73]:
def validate_model(model, validate_loader, criterion):
    """Evaluate `model` on `validate_loader` without gradient tracking.

    Prints the confusion matrix, the classification report, the mean per-batch
    loss and the accuracy.

    Args:
        model: network to evaluate; it is switched to eval mode and each batch
            is moved to the device that the model's parameters live on.
        validate_loader: sized iterable yielding ``(data, target)`` batches.
        criterion: loss callable invoked as ``criterion(outputs, target)``.

    Returns:
        ``(mean_batch_loss, accuracy_percent)``.
    """
    model.eval()

    # Take the device from the model itself instead of a notebook-level global,
    # so the function does not depend on which cells were run beforehand.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0

    predictions = []
    ground_truth = []

    with torch.no_grad():
        for data, target in validate_loader:
            data = data.to(device)
            target = target.to(device)

            outputs = model(data)

            # The predicted class is the arg-max logit.
            predicted = outputs.argmax(dim=1)
            total_predictions += target.size(0)
            correct_predictions += (predicted == target).sum().item()

            # Collected for the confusion matrix; .cpu() first so this also
            # works when the batch lives on a CUDA device.
            predictions += predicted.cpu().tolist()
            ground_truth += target.cpu().tolist()

            running_loss += criterion(outputs, target).item()

    print('------ Testing -----')
    print('')
    print('Confusion Matrix')
    print(confusion_matrix(ground_truth, predictions))
    print('F1 scores')
    print(classification_report(ground_truth, predictions))
    running_loss /= len(validate_loader)
    acc = (correct_predictions/total_predictions)*100.0
    print('Testing Loss: ', running_loss)
    print('Testing Accuracy: ', acc, '%')
    return running_loss, acc






In [76]:
# Build the model, loss and optimiser, then place the model on the chosen device.
model = My_MLP_Model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Alternative optimiser: optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
device = torch.device("cuda" if cuda else "cpu")
model.to(device)

# Each epoch is one training pass followed by an evaluation on the test loader.
n_epochs = 10
for epoch in range(n_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    test_loss, test_acc = validate_model(model, test_loader, criterion)

    print('=' * 20)

------ Training -----

Training Loss:  0.47735679231277883 Time:  0.11297202110290527 s
Training Accuracy:  79.7598627787307 %
------ Testing -----

Confusion Matrix
[[ 0 32]
 [ 0 32]]
F1 scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.50      1.00      0.67        32

    accuracy                           0.50        64
   macro avg       0.25      0.50      0.33        64
weighted avg       0.25      0.50      0.33        64

Testing Loss:  0.7997210882604122
Testing Accuracy:  50.0 %
------ Training -----

Training Loss:  0.35423441517026455 Time:  0.0849299430847168 s
Training Accuracy:  79.7598627787307 %
------ Testing -----

Confusion Matrix
[[ 0 32]
 [ 0 32]]
F1 scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.50      1.00      0.67        32

    accuracy                           0.50        64
   mac

  _warn_prf(average, modifier, msg_start, len(result))


------ Testing -----

Confusion Matrix
[[ 0 32]
 [ 0 32]]
F1 scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.50      1.00      0.67        32

    accuracy                           0.50        64
   macro avg       0.25      0.50      0.33        64
weighted avg       0.25      0.50      0.33        64

Testing Loss:  0.8326409794390202
Testing Accuracy:  50.0 %
------ Training -----

Training Loss:  0.30977777366156445 Time:  0.09466385841369629 s
Training Accuracy:  79.7598627787307 %
------ Testing -----

Confusion Matrix
[[ 0 32]
 [ 0 32]]
F1 scores
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.50      1.00      0.67        32

    accuracy                           0.50        64
   macro avg       0.25      0.50      0.33        64
weighted avg       0.25      0.50      0.33        64

Testing Loss:  0.871250

In [47]:
import sklearn

In [69]:
# Scratch exploration: list the names available in sklearn.metrics.
print(dir(sklearn.metrics))

['ConfusionMatrixDisplay', 'PrecisionRecallDisplay', 'RocCurveDisplay', 'SCORERS', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_base', '_classification', '_pairwise_fast', '_plot', '_ranking', '_regression', '_scorer', 'accuracy_score', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'auc', 'average_precision_score', 'balanced_accuracy_score', 'brier_score_loss', 'calinski_harabasz_score', 'calinski_harabaz_score', 'check_scoring', 'classification_report', 'cluster', 'cohen_kappa_score', 'completeness_score', 'confusion_matrix', 'consensus_score', 'coverage_error', 'davies_bouldin_score', 'dcg_score', 'euclidean_distances', 'explained_variance_score', 'f1_score', 'fbeta_score', 'fowlkes_mallows_score', 'get_scorer', 'hamming_loss', 'hinge_loss', 'homogeneity_completeness_v_measure', 'homogeneity_score', 'jaccard_score', 'jaccard_similarity_score', 'label_ranking_average_precision_score', 'label_r

In [71]:
# Scratch exploration: show the documentation of classification_report.
# help() prints the text itself and returns None, so wrapping it in print()
# would only append a stray "None" to the output.
help(sklearn.metrics.classification_report)

Help on function classification_report in module sklearn.metrics._classification:

classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')
    Build a text report showing the main classification metrics
    
    Read more in the :ref:`User Guide <classification_report>`.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    
    target_names : list of strings
        Optional display names matching the labels (same order).
    
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    
    digits : int
        Number of digits for for