In [143]:
import pandas as pd 
import numpy as np
import seaborn as sns
import re
import os
from matplotlib import pyplot as plt
#from emfdscore.scoring import score_docs
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [68]:
df= pd.read_csv('transcripts/emfd_out.csv')
df = df[df['count']>=60]
y = df.pop('Y')
df.insert(0, 'Y', y)
data = df.drop(df.loc[:, 'care_sent':'count'].columns, axis = 1)
data.reset_index(inplace=True)
data.drop(['index'], axis = 1, inplace=True)

In [134]:
train, valid, test = np.split(data.sample(frac=1, random_state=42), [int(.7*len(data)), int(.9*len(data))])

In [140]:
Vocab_char = {}
Vocab_char[0] = '<unk>'
i=1
for char in train['Y'].unique():
    Vocab_char[i] = char
    i+=1

def get_key(val):
    for key, value in Vocab_char.items():
         if val == value:
             return key

def preprocess_data(df):
    rv = []
    records = df.loc[:, 'care_p':'sanctity_p'].to_records(index=False)
    results = list(records)
    for i in range(len(df)):
        rv.append((df['Y'].iloc[i], results[i]))
    return rv

def collate_fn(batch):
    
    speech_mf = []
    labels = []
    
    for b in batch:
        label = get_key(b[0])
        labels.append(label)
        s = [i for i in b[1]]
        speech_mf.append(s)
    print(labels)
    speech_mf = torch.tensor(speech_mf)
    labels = torch.tensor(labels)
    
    return labels, speech_mf
    

In [147]:
num_labels = len(Vocab_char)
vocab_size = 5

class NNeMFDTagger(nn.Module):
    def __init__(self, num_labels, vocab_size):

        super(NNeMFDTagger, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)


    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [150]:
loss_function = torch.nn.NLLLoss()

def train_an_epoch(dataloader):
    model.train()
    log_interval = 500

    for idx, (label, speech_mf) in enumerate(dataloader):
        model.zero_grad()
        print(speech_mf)
        probs = model(speech_mf)
        loss = loss_function(probs, label)
        loss.backward()
        optimizer.step()
        if idx % log_interval == 0 and idx > 0:
            print(f'At iteration {idx} the loss is {loss:.3f}.')

In [103]:
def get_accuracy(dataloader):
    model.eval()
    with torch.no_grad():    
        total_acc, total_count = 0, 0
        for idx, (label, speech_mf) in enumerate(dataloader):
            log_probs = model(speech_mf)
            total_acc += (log_probs.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [141]:
BATCH_SIZE = 64 
  
train_data = preprocess_data(train)
valid_data = preprocess_data(valid)
test_data = preprocess_data(test)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE,
                              shuffle=True, 
                              collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE,
                              shuffle=False, 
                              collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE,
                             shuffle=False, 
                             collate_fn=collate_fn)

In [148]:
model = NNeMFDTagger(len(Vocab_char),5)

In [151]:
import time
import matplotlib.pyplot as plt
%matplotlib inline

EPOCHS = 3 # epoch
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

accuracies=[]
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(train_dataloader)
    accuracy = get_accuracy(valid_dataloader)
    accuracies.append(accuracy)
    time_taken = time.time() - epoch_start_time
    print(f'Epoch: {epoch}, time taken: {time_taken:.1f}s, validation accuracy: {accuracy:.3f}.')
    
plt.plot(range(1, EPOCHS+1), accuracies)

[26, 1, 1, 21, 51, 10, 2, 1, 1, 10, 1, 35, 14, 21, 17, 1, 1, 9, 1, 1, 51, 10, 10, 10, 1, 1, 1, 27, 16, 22, 48, 1, 32, 1, 42, 29, 25, 5, 10, 10, 20, 39, 25, 5, 37, 1, 29, 1, 52, 10, 10, 2, 3, 12, 14, 14, 10, 42, 3, 29, 31, 25, 30, 25]
tensor([[0.1143, 0.1032, 0.0933, 0.0985, 0.0877],
        [0.0940, 0.0993, 0.0910, 0.0838, 0.0776],
        [0.1155, 0.0968, 0.1213, 0.0940, 0.0723],
        [0.1185, 0.0998, 0.1306, 0.1183, 0.0948],
        [0.0983, 0.1095, 0.0925, 0.1025, 0.0838],
        [0.1179, 0.1152, 0.0922, 0.0912, 0.0695],
        [0.1379, 0.1140, 0.1068, 0.0900, 0.1020],
        [0.0917, 0.0928, 0.0964, 0.0906, 0.0741],
        [0.0376, 0.0707, 0.1099, 0.1149, 0.0415],
        [0.1119, 0.0906, 0.1017, 0.0860, 0.0799],
        [0.0598, 0.0845, 0.0931, 0.0910, 0.0667],
        [0.1187, 0.1118, 0.0946, 0.0915, 0.0860],
        [0.0977, 0.0893, 0.0941, 0.1152, 0.0716],
        [0.0802, 0.0996, 0.0911, 0.0894, 0.0746],
        [0.1113, 0.1056, 0.1044, 0.0901, 0.0853],
        [0.1091,

RuntimeError: expected scalar type Float but found Double