In [16]:
%tensorflow_version 1.x #for preprocessing

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.x #for preprocessing`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


In [17]:
import torch
print('Version', torch.__version__)
print('CUDA enabled:', torch.cuda.is_available())
  
# Running this should then print out:
# Version 1.7.0+cu101 (or something like this)
# CUDA enabled: True

Version 1.8.0+cu101
CUDA enabled: True


In [18]:
from google.colab import drive
drive.mount('/gdrive')
!ls /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
MyDrive  Shareddrives


In [19]:
import os
BASE_PATH = '/gdrive/My Drive/colab_files/'
DATA_PATH = '/gdrive/My Drive/colab_files/nlp_project_files/'

#MODIFY FILE NAMES AND PATHS AS NEEDED

if not os.path.exists(DATA_PATH):
  os.makedirs(DATA_PATH)
  print(os.getcwd())

  os.chdir(BASE_PATH)
  !ls
  !tar -zxf nlp_project.tar.gz -C nlp_project_files
  print('Extracted all files')

os.chdir(DATA_PATH)
!ls

NLPproject


In [20]:
DATA_PATH = '/gdrive/My Drive/colab_files/nlp_project_files/NLPproject/data'
CODE_PATH = '/gdrive/My Drive/colab_files/nlp_project_files/NLPproject/HENIN-master'
PREPROCESSED_DATA = '/gdrive/My Drive/colab_files/nlp_project_files/NLPproject/HENIN-master/preprocessData'
os.chdir(CODE_PATH)

!ls

if not os.path.exists(PREPROCESSED_DATA):
  os.makedirs('preprocessData')
  !python3 preprocessing.py


 images      preprocessData	   preprocessing.py   README.md   utils.py
 layers.py  'preprocessData (1)'   __pycache__	      train.py


In [21]:
import pickle
import numpy as np

os.chdir(PREPROCESSED_DATA)
!ls

'Copy of multi_hot_usersINSTA12.pickle'   Dat4ModelUNLAB.pickle
 Dat4ModeINSTA.pickle			  Dat4ModelVINE12.pickle
 Dat4ModelINSTA12.pickle		  Dat4ModelVINE25.pickle
 Dat4ModelINSTA25.pickle		  Dat4ModelVINE.pickle
 Dat4ModelINSTA_SIXTH.pickle		  Dat4ModelVINE_SIXTH.pickle
'Dat4ModelINSTA_THIRD (1).pickle'	  Dat4ModelVINE_THIRD.pickle
 Dat4ModelINSTA_THIRD.pickle		  multi_hot_users25.pickle
 Dat4Model.pickle			  multi_hot_users.pickle


In [22]:
def get_data(file_name):
    with open(file_name, 'rb') as f:
        Dat4Model = pickle.load(f)
    textFeat_all = Dat4Model['textFeat_all']
    y_all = Dat4Model['y_all']

    return textFeat_all, y_all

In [23]:
def stratify_data(y_all, textFeat_all, num_folds):
    p = y_all.argsort()
    y_sorted = y_all[p]
    text_sorted = textFeat_all[p]

    first = np.where(y_sorted==1)
    first_idx = first[0][0] #1535

    size_zeroes = first[0][0]
    size_ones = len(y_sorted) - size_zeroes

    size_fold_z = int (size_zeroes / num_folds)
    size_fold_o = int (size_ones / num_folds)
    folds = []
    ys = []

    for i in range(num_folds):
        if i == 0:  
            start_z = 0
            start_o = size_zeroes
        else:
            start_z = (i*size_fold_z) 
            start_o = size_zeroes + (i*size_fold_o) 

        if i == num_folds - 1:
            stop_z = size_zeroes
            stop_o = len(y_sorted)
        else:
            stop_z = (i+1)*size_fold_z
            stop_o = size_zeroes + (i+1)*size_fold_o
        #print("text_sorted[{} : {}] text_sorted[{} : {}]".format(start_z, stop_z, start_o, stop_o))
        zeroes = text_sorted[start_z : stop_z]
        ones = text_sorted[start_o : stop_o]
        fold = np.concatenate((zeroes, ones), axis = 0)
        folds.append(fold)
        zeroes = y_sorted[start_z : stop_z]
        ones = y_sorted[start_o : stop_o]
        y = np.concatenate((zeroes, ones), axis = 0)
        ys.append(y)

    text_final = folds[0]
    y_final = ys[0]
    print("Length of dataset {}".format(len(y_all)))
    print("[Full dataset] Percentage of true labels " + str(np.sum(y_all) / len(y_all)))
    print("[Fold 0] Percentage of true labels " + str(np.sum(ys[0]) / len(ys[0])))
    for i in range(1, num_folds):
        text_final = np.concatenate((text_final, folds[i]), axis = 0)
        y_final = np.concatenate((y_final, ys[i]), axis = 0)
        print("[Fold {}] Percentage of true labels {}".format(i, np.sum(ys[i]) / len(ys[i])))

    return text_final, y_final

In [24]:
class textFeatDataset(torch.utils.data.Dataset):
    def __init__(self, text_data, y_data, stratify=False, num_folds=1, fold_idx=1, train=True):
        super(textFeatDataset, self).__init__()

        if stratify:
            fold_size = int(len(y_data) / num_folds)
            start = int((fold_size * fold_idx) - 1)
            stop = int(fold_size * (fold_idx + 1))

            if train:
                if fold_idx == 0:
                    data = text_data[stop:]
                    labels = y_data[stop:]
                if fold_idx == num_folds - 1:
                    data = text_data[:start]
                    labels = y_data[:start]
                if fold_idx > 0 and fold_idx < num_folds - 1:
                    a = text_data[0:start]
                    b = text_data[stop:]
                    data = np.concatenate((a, b), axis=0)
                    y_a = y_data[0:start]
                    y_b = y_data[stop:]
                    labels = np.concatenate((y_a, y_b), axis=0)
            else:
                data = text_data[start+1:stop+1]
                labels = y_data[start+1:stop+1]

            self.data = data
            self.labels = labels
        else:
            self.data = text_data
            self.labels = y_data

    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        data_item = self.data[idx]
        label_item = self.labels[idx]
        return data_item, label_item

    def seqlen():
        return self.data.shape[1]

    def embeddinglen():
        return self.data.shape[2]


In [25]:
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import csv
import sys
import pickle
import re

In [26]:
class RNNNet(nn.Module):
    def __init__(self, embed_size):
        super(RNNNet, self).__init__()
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=64, num_layers=1, batch_first=True)
        self.dense = nn.Linear(64, 2)
    
    def forward(self, x, hidden=None):
        batch_size = x.shape[0]
        sequence_length = x.shape[1]

        hidden = torch.zeros(1, batch_size, 64).to(device)

        out, hidden = self.rnn(x, hidden)
        out = hidden.contiguous().view(-1, 64)
        out = self.dense(out)

        return out, hidden

model = RNNNet(300)

#from torchsummary import summary
#summary(model.cuda(), data.shape)

In [27]:
class GRUNet(nn.Module):
    def __init__(self, embed_size):
        super(GRUNet, self).__init__()
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=128, num_layers=1, batch_first=True)
        self.dense = nn.Linear(128, 2)
    
    def forward(self, x, hidden=None):
        batch_size = x.shape[0]
        sequence_length = x.shape[1]

        hidden = torch.zeros(1, batch_size, 128).to(device)

        out, hidden = self.rnn(x, hidden)
        out = hidden.contiguous().view(-1, 128)
        out = self.dense(out)

        return out, hidden


In [28]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def train(model, device, train_loader, optimizer):
    model.train()
    losses = []
    criterion = F.cross_entropy
    hidden = None
    for batch_idx, (data, label) in enumerate(train_loader):
        
        data, label = data.to(device), label.to(device)

        if hidden is not None:
            hidden = repackage_hidden(hidden)

        optimizer.zero_grad()
        output, hidden = model(data)
        pred = output
        loss = criterion(pred, label, reduction='mean')
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    
    train_loss = np.mean(losses)
    return train_loss

def test(model, device, test_loader, traindata = False):
    model.eval()
    test_loss = 0
    correct = 0
    criterion = F.cross_entropy

    tp = 0
    fp = 0
    fn = 0
    tn = 0
    if traindata:
      name = 'Train'
    else:
      name = 'Test'
    with torch.no_grad():
        for batch_idx, (data, label) in enumerate(test_loader):
            data, label = data.to(device), label.to(device)
            output, hidden = model(data)
            test_loss_on = criterion(output, label, reduction='sum').item()
            test_loss += test_loss_on
            pred = output.max(-1)[1]
            for i in range(len(pred)):
                if pred[i] == label[i] and pred[i] == 0:
                    tn += 1
                if pred[i] == label[i] and pred[i] == 1:
                    tp += 1
                if pred[i] != label[i] and pred[i] == 0:
                    fn += 1
                if pred[i] != label[i] and pred[i] == 1:
                    fp += 1
            correct_mask = pred.eq(label.view_as(pred))
            num_correct = correct_mask.sum().item()
            correct += num_correct

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)

    if tp == 0:
        prec = 0
        recall = 0
        f1 = 0
    else:  
        prec = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2*tp) / (2*tp + fp + fn)

    #print(name + ' set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
    #    test_loss, correct, len(test_loader.dataset), test_accuracy))
    
    return test_loss, test_accuracy, prec, recall, f1

In [29]:
SEQUENCE_LENGTH = 300
BATCH_SIZE = 128
TEST_BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0
USE_CUDA = True
NUM_FOLDS = 5
NUM_ITERS = 5

use_cuda = USE_CUDA and torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")
print('Using device', device)
import multiprocessing
num_workers = multiprocessing.cpu_count()
print('num workers:', num_workers)

kwargs = {'num_workers': num_workers,
          'pin_memory': True} if use_cuda else {}

def train_model(model_type, train_file_name, test_file_name, learning_rate, weight_decay, epochs, batch_size):
    textFeat_train, y_train = get_data(train_file_name)
    textFeat_test, y_test = get_data(test_file_name)

    data_train = textFeatDataset(textFeat_train, y_train, stratify=False)
    data_test = textFeatDataset(textFeat_test, y_test, stratify=False)

    train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(data_test, batch_size=TEST_BATCH_SIZE, shuffle=False, **kwargs)

    model = model_type(SEQUENCE_LENGTH).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    lr = learning_rate
    for epoch in range(epochs):
        if epoch%20 == 0:
            lr = lr/10
            optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
        train_loss = train(model, device, train_loader, optimizer)
        test_loss, test_accuracy, prec, recall, f1 = test(model, device, test_loader, traindata = False)

    print("Accuracy = " + str(test_accuracy))
    print("Precison = " + str(prec))
    print("Recall = " + str(recall))
    print("F1 score = " + str(f1))
    print()

def cross_validate(model_type, file_name, learning_rate, weight_decay, epochs, batch_size):
    textFeat_all, y_all = get_data(file_name)
    print(textFeat_all.shape)
    for j in range(NUM_ITERS):
        sum_test_accuracy = 0
        sum_prec = 0
        sum_recall = 0
        sum_f1 = 0

        shuffler = np.random.permutation(len(y_all))
        text_shuffled = textFeat_all[shuffler]
        y_shuffled = y_all[shuffler]
        t_final, y_final = stratify_data(y_shuffled, text_shuffled, NUM_FOLDS)

        for i in range(NUM_FOLDS):
            data_train = textFeatDataset(t_final, y_final, stratify=True, num_folds=NUM_FOLDS, fold_idx=i, train=True)
            data_test = textFeatDataset(t_final, y_final, stratify=True, num_folds=NUM_FOLDS, fold_idx=i, train=False)

            train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, **kwargs)
            test_loader = torch.utils.data.DataLoader(data_test, batch_size=TEST_BATCH_SIZE, shuffle=False, **kwargs)

            model = model_type(SEQUENCE_LENGTH).to(device)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

            best_acc = None
            best_loss = None
            best_prec = None
            best_recall = None
            best_f1 = None
            lr = learning_rate
            for epoch in range(epochs):
                if epoch%20 == 0:
                    lr = lr/10
                    optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
                train_loss = train(model, device, train_loader, optimizer)
                #_ , train_accuracy, ____, __, ___ = test(model, device, train_loader, traindata = True)
                test_loss, test_accuracy, prec, recall, f1 = test(model, device, test_loader, traindata = False)

            sum_test_accuracy += test_accuracy
            sum_prec += prec
            sum_recall += recall
            sum_f1 += f1
            #print(f1)

        print("Accuracy = " + str(sum_test_accuracy/5))
        print("Precison = " + str(sum_prec/5))
        print("Recall = " + str(sum_recall/5))
        print("F1 score = " + str(sum_f1/5))
        print()

Using device cuda
num workers: 2


In [30]:
print("Crossvalidated RNN Model with Instagram Data")
cross_validate(RNNNet, 'Dat4ModeINSTA.pickle', 0.001, 0, 20, 128)

print("Crossvalidated RNN Model with Vine Data")
cross_validate(RNNNet, 'Dat4ModelVINE.pickle', 0.001, 0, 20, 64)

Crossvalidated RNN Model with Instagram Data
(2211, 75, 300)
Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 70.79006772009029
Precison = 0.7383612604365677
Recall = 0.5222657952069717
F1 score = 0.5310631844999885

Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 72.41534988713319
Precison = 0.6518693255423891
Recall = 0.4705337690631809
F1 score = 0.49373237420

In [31]:
print("Crossvalidated GRU Model with Instagram Data")
cross_validate(GRUNet, 'Dat4ModeINSTA.pickle', 0.001, 0, 20, 128)

print("Crossvalidated GRU Model with Vine Data")
cross_validate(GRUNet, 'Dat4ModelVINE.pickle', 0.001, 0, 20, 64)

Crossvalidated GRU Model with Instagram Data
(2211, 75, 300)
Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 81.17381489841986
Precison = 0.7869912784313037
Recall = 0.5250653594771242
F1 score = 0.6262959886885534

Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 80.85778781038374
Precison = 0.743381670290441
Recall = 0.5830501089324619
F1 score = 0.646221588786

In [32]:
print("Trained RNN Model with Instagram Data tested on Unlabeled data")
train_model(RNNNet, 'Dat4ModeINSTA.pickle', 'Dat4ModelUNLAB.pickle', 0.001, 0, 20, 128)

print("Trained RNN Model with Vine Data tested on Unlabeled data")
train_model(RNNNet, 'Dat4ModelVINE.pickle', 'Dat4ModelUNLAB.pickle', 0.001, 0, 20, 64)

Trained RNN Model with Instagram Data tested on Unlabeled data
Accuracy = 78.80184331797236
Precison = 0
Recall = 0
F1 score = 0

Trained RNN Model with Vine Data tested on Unlabeled data
Accuracy = 79.26267281105991
Precison = 0
Recall = 0
F1 score = 0



In [35]:
print("Trained GRU Model with Instagram Data tested on Unlabeled data")
train_model(GRUNet, 'Dat4ModeINSTA.pickle', 'Dat4ModelUNLAB.pickle', 0.01, 0, 20, 128)

print("Trained GRU Model with Vine Data tested on Unlabeled data")
train_model(GRUNet, 'Dat4ModelVINE.pickle', 'Dat4ModelUNLAB.pickle', 0.01, 0, 20, 64)

Trained GRU Model with Instagram Data tested on Unlabeled data
Accuracy = 41.935483870967744
Precison = 0.15966386554621848
Recall = 0.4222222222222222
F1 score = 0.23170731707317074

Trained GRU Model with Vine Data tested on Unlabeled data
Accuracy = 48.8479262672811
Precison = 0.125
Recall = 0.24444444444444444
F1 score = 0.16541353383458646



In [36]:
print("Crossvalidated RNN Model with THIRD Instagram Data")
cross_validate(RNNNet, 'Dat4ModelINSTA25.pickle', 0.001, 0, 20, 128)

print("Crossvalidated GRU Model with THIRD Instagram Data")
cross_validate(GRUNet, 'Dat4ModelINSTA25.pickle', 0.001, 0, 20, 128)

print("Crossvalidated RNN Model with SIXTH Instagram Data")
cross_validate(RNNNet, 'Dat4ModelINSTA12.pickle', 0.001, 0, 20, 128)

print("Crossvalidated GRU Model with SIXTH Instagram Data")
cross_validate(GRUNet, 'Dat4ModelINSTA12.pickle', 0.001, 0, 20, 128)

Crossvalidated RNN Model with THIRD Instagram Data
(2211, 25, 300)
Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 83.97291196388262
Precison = 0.7907001254386733
Recall = 0.6509041394335512
F1 score = 0.7125910520426351

Length of dataset 2211
[Full dataset] Percentage of true labels 0.30574400723654455
[Fold 0] Percentage of true labels 0.3054298642533937
[Fold 1] Percentage of true labels 0.3054298642533937
[Fold 2] Percentage of true labels 0.3054298642533937
[Fold 3] Percentage of true labels 0.3054298642533937
[Fold 4] Percentage of true labels 0.30699774266365687
Accuracy = 83.61173814898419
Precison = 0.7701921635409578
Recall = 0.6626688453159041
F1 score = 0.71116

In [37]:
print("Crossvalidated RNN Model with THIRD Vine Data")
cross_validate(RNNNet, 'Dat4ModelVINE25.pickle', 0.001, 0, 20, 64)

print("Crossvalidated GRU Model with THIRD Vine Data")
cross_validate(GRUNet, 'Dat4ModelVINE25.pickle', 0.001, 0, 20, 64)

print("Crossvalidated RNN Model with SIXTH Vine Data")
cross_validate(RNNNet, 'Dat4ModelVINE12.pickle', 0.001, 0, 20, 64)

print("Crossvalidated GRU Model with SIXTH Vine Data")
cross_validate(GRUNet, 'Dat4ModelVINE12.pickle', 0.001, 0, 20, 64)

Crossvalidated RNN Model with THIRD Vine Data
(967, 25, 300)
Length of dataset 967
[Full dataset] Percentage of true labels 0.31230610134436404
[Fold 0] Percentage of true labels 0.31088082901554404
[Fold 1] Percentage of true labels 0.31088082901554404
[Fold 2] Percentage of true labels 0.31088082901554404
[Fold 3] Percentage of true labels 0.31088082901554404
[Fold 4] Percentage of true labels 0.31794871794871793
Accuracy = 77.42268041237114
Precison = 0.6979036403305678
Recall = 0.4981967213114754
F1 score = 0.5744183367505405

Length of dataset 967
[Full dataset] Percentage of true labels 0.31230610134436404
[Fold 0] Percentage of true labels 0.31088082901554404
[Fold 1] Percentage of true labels 0.31088082901554404
[Fold 2] Percentage of true labels 0.31088082901554404
[Fold 3] Percentage of true labels 0.31088082901554404
[Fold 4] Percentage of true labels 0.31794871794871793
Accuracy = 77.01030927835052
Precison = 0.7727285566330855
Recall = 0.4331693989071038
F1 score = 0.50280