In [None]:
import numpy as np
import codecs
import os
import sys
import re
from sklearn.model_selection import train_test_split, KFold
from TurkishStemmer import TurkishStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from unicode_tr import unicode_tr
from collections import Counter
from random import shuffle
from math import log, inf
from operator import itemgetter

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

import torch
cuda = torch.cuda.is_available()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append("semi-supervised-pytorch/semi-supervised")
from models import ProdLDADeepGenerativeModel

In [None]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')

In [None]:
stemmer = TurkishStemmer()

## Data Retrieval and Preprocessing

In [None]:
def findfiles(path,flist):
    """Recursively collect the paths of all files below ``path``.

    Args:
        path: directory to walk; sub-directories are descended into.
        flist: list that is extended in place with ``path + "/" + name`` for
            every non-directory entry found.

    Returns:
        None. The result is delivered through ``flist``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the collected
    # list (and anything sliced from it afterwards) reproducible between runs
    # and machines.
    for entry in sorted(os.listdir(path)):
        full_path = path + "/" + entry
        if os.path.isdir(full_path):
            findfiles(full_path, flist)
        else:
            flist.append(full_path)

In [None]:
# Root folder of the news corpus; every category is read from its own sub-directory.
path = './data/42bin_haber/news'
# NOTE(review): this directory listing is overwritten by the hard-coded list on the
# next line, so its only effect is to fail early when `path` does not exist.
categories = os.listdir(path)[:]
categories = ['ekonomi','kultur-sanat','magazin','saglik','siyaset','spor','teknoloji']
# Maps each category name to the file paths collected below its directory.
news_files = {}
for cat in categories:
    flist = []
    findfiles(path+"/"+cat,flist)
    # NOTE(review): the first path collected by findfiles is dropped (whichever
    # file that happens to be) - TODO confirm why one entry is skipped.
    news_files[cat] = flist[1:]

In [None]:
len(categories)

In [None]:
def preprocess(words,stop_words = stopwords.words('turkish'),url_regex=None):
    """Clean, lowercase, filter and stem a list of tokens.

    Steps, in order: optional URL removal, stripping of stray characters,
    dropping tokens that start with a non-word character, Turkish-aware
    lowercasing, stop-word removal, stemming.

    Args:
        words: iterable of string tokens.
        stop_words: collection of words to discard after lowercasing. The
            default list is evaluated once, when the function is defined.
        url_regex: optional pattern; tokens matching it at their start are
            removed. Skipped when falsy.

    Returns:
        List of stemmed tokens.
    """
    # Remove URLs
    if url_regex:
        words = [word for word in words if not re.match(url_regex, word)]

    # Remove trash characters: stray control/soft-hyphen code points anywhere,
    # plus leading quotes/asterisks/dashes and trailing quotes.
    # Raw strings are used so that "\*" and "\W" reach the regex engine without
    # relying on Python's deprecated handling of invalid string escapes.
    trash_pattern = re.compile(
        r"\xad|\x95|\x80|\x82|\x93|\x94|\x91|\x92|\x96|^'+|^\*+|^-+|'+$"
    )
    words = [trash_pattern.sub("", word) for word in words]

    # Remove tokens that *start* with a non-word character (re.match anchors at
    # the beginning of the token only).
    non_word_start = re.compile(r"\W")
    words = [word for word in words if not non_word_start.match(word)]

    # Lower all words (unicode_tr handles the Turkish dotted/dotless i) and drop
    # tokens that are empty after stripping.
    words = [unicode_tr(word.strip()).lower() for word in words if word.strip() != ""]

    # Remove stopwords; a set gives constant-time membership tests.
    stop_set = frozenset(stop_words)
    words = [word for word in words if word not in stop_set]

    # Stemming
    words = [stemmer.stem(word) for word in words]

    return words

In [None]:
# NOTE(review): category_corpus, test_files and all_words are initialised here but
# never filled or read in the visible part of the notebook.
category_corpus = {}
test_files = {}
all_words = []
# X collects one list of preprocessed tokens per document, y the matching category name.
X = []
y = []
for cat in categories:
    print(cat)
    for f in news_files[cat]:
        # NOTE(review): no encoding is passed, so the platform default is used -
        # confirm it matches the encoding of the corpus files.
        with open(f) as file:
            # Read the whole file into a single string.
            lines = file.readlines()
            lines = ''.join(lines)

            # Tokenize, then clean/stem with the default stop-word list.
            words = word_tokenize(lines)
            words = preprocess(words)

            X.append(words)
            y.append(cat)

In [None]:
len(X)

In [None]:
len(y)

In [None]:
docs = [" ".join(x) for x in X]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
dtm = CountVectorizer(max_features=10000)

In [None]:
docs_all = dtm.fit_transform(docs)

In [None]:
# NOTE(review): calc_representativeness_scores is defined much further down, in the
# "Representativeness" section, so on a fresh kernel (Restart & Run All) this cell
# raises NameError unless that definition cell has been executed first.
# tokenized=True makes the function use `docs_all` (the document-term count matrix)
# directly instead of vectorizing raw text.
repr_scores = calc_representativeness_scores(docs_all, tokenized=True)

In [None]:
repr_scores.shape

In [None]:
len(dtm.vocabulary_)

In [None]:
le = LabelEncoder()
encoded_labels = le.fit_transform(y)

In [None]:
# Positions of all documents whose encoded label equals 5.
# NOTE(review): LabelEncoder numbers classes in sorted order, so with the category
# list used above 5 should correspond to 'spor' - confirm via le.classes_.
idxes, = np.where(encoded_labels == 5)

In [None]:
# Randomly pick 5000 distinct documents of that class; they are deleted from the
# data further below. No random seed is set, so the selection differs between runs,
# and np.random.choice raises ValueError if fewer than 5000 candidates exist.
idxes_to_remove = np.random.choice(idxes, 5000, replace=False)

In [None]:
plt.hist(np.delete(encoded_labels, idxes_to_remove), bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5])

In [None]:
plt.hist(encoded_labels, bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5])

In [None]:
ohe = OneHotEncoder()
labels = ohe.fit_transform(np.expand_dims(encoded_labels, -1))

In [None]:
# Convert the sparse count matrix and the sparse one-hot labels to dense form; the
# following cells remove rows with np.delete and later turn slices into torch tensors.
# NOTE(review): todense() returns np.matrix objects and materialises the complete
# document-term matrix in memory.
docs_all = docs_all.todense()
labels = labels.todense()

In [None]:
docs_all = np.delete(docs_all, idxes_to_remove, axis=0)
labels = np.delete(labels, idxes_to_remove, axis=0)

In [None]:
repr_scores = np.delete(repr_scores, idxes_to_remove, axis=0)

In [None]:
row_ids = np.arange(len(docs_all))

In [None]:
# Hold out 20% of the documents for validation. The row ids are split alongside the
# data so each document can still be matched to its entry in repr_scores later on.
# NOTE(review): no random_state is passed, so both splits change on every run.
X_train, X_valid, y_train, y_valid, row_ids_train, row_ids_valid= train_test_split(docs_all, labels, row_ids, test_size=0.20)
# Only 2% of the training part starts out as labelled data; the rest forms the unlabelled pool.
x_labelled, x_unlabelled, y_labelled, y_unlabelled, row_ids_labelled, row_ids_unlabelled = train_test_split(X_train, y_train, row_ids_train, train_size=0.02)

In [None]:
x_labelled.shape, x_unlabelled.shape, y_labelled.shape, y_unlabelled.shape

In [None]:
def create_batch(data, batch_size):
    """Sample one random mini-batch from an ``(x, y)`` pair of arrays.

    Args:
        data: tuple ``(x, y)`` of NumPy arrays with matching first dimension.
        batch_size: number of distinct rows to draw (without replacement).

    Returns:
        Tuple ``(x_batch, y_batch)`` of float tensors holding the same rows of
        ``x`` and ``y``.
    """
    features, targets = data
    # One draw of row positions is shared by both arrays so pairs stay aligned.
    picked = np.random.choice(features.shape[0], batch_size, replace=False)

    feature_tensor = torch.from_numpy(features[picked]).float()
    target_tensor = torch.from_numpy(targets[picked]).float()
    return feature_tensor, target_tensor

def create_data_sets(labelled, unlabelled, batch_size):
    """Build lists of randomly sampled training batches for both data sets.

    For each data set, ``rows // batch_size`` batches are drawn independently
    with ``create_batch``, so a row may appear in several batches and some rows
    in none.

    Args:
        labelled: tuple ``(x, y)`` of labelled arrays.
        unlabelled: tuple ``(x, y)`` of unlabelled arrays.
        batch_size: rows per batch.

    Returns:
        Tuple ``(labelled_batches, unlabelled_batches)`` of lists of tensor pairs.
    """
    def sample_batches(dataset):
        n_batches = dataset[0].shape[0] // batch_size
        return [create_batch(dataset, batch_size) for _ in range(n_batches)]

    return sample_batches(labelled), sample_batches(unlabelled)

def create_validation_set(validation, batch_size):
    """Build a list of randomly sampled validation batches.

    ``rows // batch_size`` batches are drawn independently with
    ``create_batch``; the batches are therefore random subsets rather than a
    partition of the validation data.

    Args:
        validation: tuple ``(x, y)`` of validation arrays.
        batch_size: rows per batch.

    Returns:
        List of ``(x_batch, y_batch)`` tensor pairs.
    """
    n_batches = validation[0].shape[0] // batch_size
    batches = []
    for _ in range(n_batches):
        batches.append(create_batch(validation, batch_size))
    return batches

In [None]:
batch_size = 50

In [None]:
# Initial active-learning state: the small labelled set and the unlabelled pool.
labelled, unlabelled = (x_labelled, y_labelled), (x_unlabelled, y_unlabelled)
# Validation batches are sampled once and reused by every training run. The shared
# batch_size setting is used instead of repeating its value as a literal.
validation = create_validation_set((X_valid, y_valid), batch_size)

# Generate MC samples

In [None]:
def mc_samples(num_mc_samples, model, x_batch):
    """Run ``model.classify`` several times on the same batch and stack the results.

    Args:
        num_mc_samples: number of forward passes to perform.
        model: object providing ``train()`` and ``classify(x)``.
        x_batch: input tensor handed to ``model.classify`` unchanged.

    Returns:
        Tensor with the individual outputs stacked along a new leading
        dimension of size ``num_mc_samples``. The result carries no autograd
        history.

    Side effects:
        The model is switched to (and left in) training mode.
    """
    # Training mode is enabled before sampling - presumably so that stochastic
    # layers (e.g. dropout) stay active and the passes differ; the model's
    # internals are not visible here.
    model.train()
    # The outputs are only used for scoring, so no autograd graph is recorded;
    # this keeps memory usage flat across the repeated passes.
    with torch.no_grad():
        draws = [model.classify(x_batch) for _ in range(num_mc_samples)]
    return torch.stack(draws)

In [None]:
def bald_acq(mc_samples):
    """Compute the BALD acquisition score from Monte-Carlo class probabilities.

    The score is the entropy of the mean prediction minus the mean entropy of
    the individual predictions.

    Args:
        mc_samples: NumPy array whose first axis indexes the Monte-Carlo draws
            and whose last axis indexes the classes.

    Returns:
        NumPy array with the first and last axes reduced away (one score per
        remaining position).
    """
    eps = 1e-10  # keeps log() finite for zero probabilities

    # Mean over draws of the per-draw entropy.
    per_draw_sum = np.sum(mc_samples * np.log(mc_samples + eps), axis=-1)
    expected_entropy = -np.mean(per_draw_sum, axis=0)

    # Entropy of the prediction averaged over draws.
    mean_probs = np.mean(mc_samples, axis=0)
    entropy_of_mean = - np.sum(mean_probs * np.log(mean_probs + eps), axis=-1)

    return entropy_of_mean - expected_entropy

In [None]:
def query_new_data(num_data, num_mc_samples, model, unlabelled_data, batch_size, repr_scores=None):
    """Select the unlabelled rows with the highest BALD acquisition score.

    The pool is scored in chunks of ``batch_size`` rows to bound memory usage.

    Args:
        num_data: maximum number of row positions to return.
        num_mc_samples: number of stochastic forward passes per chunk.
        model: model handed to ``mc_samples``.
        unlabelled_data: NumPy array holding one row per unlabelled document.
        batch_size: number of rows scored per forward pass.
        repr_scores: optional per-row weights (same length as the pool); when
            given, the BALD scores are multiplied by them before ranking.

    Returns:
        NumPy array with up to ``num_data`` row positions into
        ``unlabelled_data``, ordered from highest to lowest score. Empty when
        the pool is empty.
    """
    num_samples = unlabelled_data.shape[0]
    if num_samples == 0:
        # np.hstack cannot concatenate an empty list of chunks.
        return np.array([], dtype=int)

    chunk_scores = []
    for start in range(0, num_samples, batch_size):
        chunk = torch.from_numpy(unlabelled_data[start:start + batch_size]).float()
        if cuda:
            chunk = chunk.cuda()
        probs = mc_samples(num_mc_samples, model, chunk).cpu().detach().numpy()
        chunk_scores.append(bald_acq(probs))

    scores = np.hstack(chunk_scores)
    if repr_scores is not None:
        scores = scores * repr_scores

    # argsort is ascending; reverse it to rank the highest scores first.
    return scores.argsort()[::-1][:num_data]

# Model and Training

In [None]:
# Model dimensions: number of classes, latent size and hidden layer sizes.
y_dim = len(categories)
z_dim = 50
h_dim = [50, 50]

# Laplace approximation of a symmetric Dirichlet(a) prior over `num_topics`
# components:
#   mean_k = log(a_k) - mean(log(a))
#   var_k  = (1 / a_k) * (1 - 2 / K) + (1 / K^2) * sum_i (1 / a_i)
# The concentration is expanded to one entry per component so that the sum in the
# variance really runs over all K components; with a bare scalar, np.sum(1 / a)
# would contribute only a single term.
num_topics = y_dim
a = 1.0
dirichlet_alpha = np.full(num_topics, a)
# All components are identical for a symmetric prior, so the first entry is kept
# and both values stay scalars as before.
prior_mean = (np.log(dirichlet_alpha) - np.mean(np.log(dirichlet_alpha)))[0]
prior_var = (((1.0 / dirichlet_alpha) * (1 - (2.0 / num_topics))) + (1.0 / (num_topics * num_topics)) * np.sum(1.0 / dirichlet_alpha))[0]

In [None]:
def initialize_model(in_dim, y_dim, z_dim, h_dim, prior_mean, prior_var):
    """Create a fresh ``ProdLDADeepGenerativeModel``.

    Args:
        in_dim, y_dim, z_dim, h_dim: passed to the model as the list
            ``[in_dim, y_dim, z_dim, h_dim]``.
        prior_mean: forwarded unchanged as the second constructor argument.
        prior_var: forwarded unchanged as the third constructor argument.

    Returns:
        The newly constructed model.
    """
    dims = [in_dim, y_dim, z_dim, h_dim]
    return ProdLDADeepGenerativeModel(dims, prior_mean, prior_var)

In [None]:
from inference import SVI, ImportanceWeightedSampler
from itertools import cycle
from torch.autograd import Variable

def train_semi_supervised(model, labelled, unlabelled, validation, cuda, epochs=4):
    """Train ``model`` on labelled and unlabelled batches, validating after every epoch.

    Each step minimises ``J = L + alpha * CE + U`` where ``L`` and ``U`` are the
    negated ``SVI`` bounds for a labelled and an unlabelled batch and ``CE`` is
    the cross entropy of ``model.classify`` on the labelled batch. The labelled
    batches are cycled so that every unlabelled batch is visited once per epoch.

    Args:
        model: model exposing ``classify(x)`` and usable with ``SVI``.
        labelled: sequence of ``(x, y)`` tensor batches; ``y`` is compared via
            arg-max, i.e. treated as one-hot.
        unlabelled: sequence of ``(u, _)`` tensor batches; the second element is
            ignored.
        validation: sequence of ``(x, y)`` tensor batches.
        cuda: when truthy, the model is moved to the current CUDA device.
        epochs: number of passes over ``unlabelled``.

    Returns:
        Tuple ``(loss, accuracy)`` of the last epoch's validation pass, averaged
        over the validation batches. ``loss`` is a float, ``accuracy`` a 0-dim
        tensor (callers may use ``.item()`` on it).

    Raises:
        ValueError: if ``labelled``, ``unlabelled`` or ``validation`` is empty
            (previously a ZeroDivisionError surfaced later on).
    """
    if len(labelled) == 0 or len(unlabelled) == 0 or len(validation) == 0:
        raise ValueError("labelled, unlabelled and validation must each contain at least one batch")

    # You can use importance weighted samples [Burda, 2015] to get a better estimate
    # on the log-likelihood.
    sampler = ImportanceWeightedSampler(mc=1, iw=1)

    def binary_cross_entropy(r, x):
        return -torch.sum(x * torch.log(r + 1e-8) + (1 - x) * torch.log(1 - r + 1e-8), dim=-1)

    if cuda:
        model = model.cuda()
    # Batches follow the model's device instead of a hard-coded GPU index, so the
    # two can never end up on different devices.
    device = next(model.parameters()).device

    elbo = SVI(model, likelihood=binary_cross_entropy, sampler=sampler)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.999))

    # Weight of the auxiliary classification loss.
    alpha = 1.0 * len(unlabelled) / len(labelled)

    def objective(x, y, u):
        """Return the combined loss and the class probabilities for ``x``."""
        L = -elbo(x, y)
        U = -elbo(u)

        # Auxiliary classification loss q(y|x): regular cross entropy.
        probs = model.classify(x)
        classification_loss = -torch.sum(y * torch.log(probs + 1e-8), dim=1).mean()

        return L + alpha * classification_loss + U, probs

    def batch_accuracy(probs, y):
        """Fraction of rows whose arg-max prediction matches the arg-max label."""
        return torch.mean((torch.max(probs, 1)[1] == torch.max(y, 1)[1]).float())

    for epoch in range(epochs):
        model.train()
        total_loss, accuracy = (0, 0)
        for (x, y), (u, _) in zip(cycle(labelled), unlabelled):
            x, y, u = x.to(device), y.to(device), u.to(device)

            J_alpha, probs = objective(x, y, u)

            J_alpha.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += J_alpha.item()
            accuracy += batch_accuracy(probs, y)

        m = len(unlabelled)
        print("Epoch: {}".format(epoch))
        print("[Train]\t\t J_a: {:.2f}, accuracy: {:.2f}".format(total_loss / m, accuracy / m))

        model.eval()
        total_loss, accuracy = (0, 0)
        # No parameter update happens here, so gradient tracking is switched off.
        with torch.no_grad():
            for x, y in validation:
                x, y = x.to(device), y.to(device)

                # The validation batch serves as both the labelled and the
                # unlabelled input of the objective.
                J_alpha, probs = objective(x, y, x)

                total_loss += J_alpha.item()
                accuracy += batch_accuracy(probs, y)

        m = len(validation)
        print("[Validation]\t J_a: {:.2f}, accuracy: {:.2f}".format(total_loss / m, accuracy / m))

    return total_loss / m, accuracy / m

# Active Learning

In [None]:
NUM_MC_SAMPLES = 10
NUM_QUERY = 100

In [None]:
def rearange_datasets(labelled, unlabelled, new_data):
    """Move the selected rows from the unlabelled pool to the labelled set.

    Args:
        labelled: tuple ``(x, y)`` of the current labelled arrays.
        unlabelled: tuple ``(x, y)`` of the current unlabelled arrays.
        new_data: row positions in the unlabelled arrays that become labelled.

    Returns:
        Tuple ``((labelled_x, labelled_y), (unlabelled_x, unlabelled_y))`` of
        new arrays; the inputs are not modified. The moved rows are appended to
        the labelled arrays in the order given by ``new_data``.
    """
    known_x, known_y = labelled
    pool_x, pool_y = unlabelled

    grown_labelled = (
        np.append(known_x, pool_x[new_data], axis=0),
        np.append(known_y, pool_y[new_data], axis=0),
    )
    shrunk_pool = (
        np.delete(pool_x, new_data, axis=0),
        np.delete(pool_y, new_data, axis=0),
    )
    return grown_labelled, shrunk_pool

In [None]:
# Make GPU index 1 the current CUDA device and display the active index.
# NOTE(review): the index is hard-coded; presumably this fails on machines with
# fewer than two GPUs - confirm the target hardware.
torch.cuda.set_device(1)
torch.cuda.current_device()

In [None]:
# Input width of the models = number of columns (vocabulary terms) of the
# document-term matrix. It is read from the data instead of repeating the
# CountVectorizer max_features value, which is only an upper bound on the
# vocabulary size.
x_dim = docs_all.shape[1]

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class ConvolutionalClassifier(nn.Module):
    """1-D convolutional classifier over flat feature vectors.

    Architecture: Conv1d(1->64, k=3) -> ReLU -> MaxPool(4) -> Conv1d(64->32, k=3)
    -> ReLU -> MaxPool(4) -> Linear(->50) -> Linear(->num_classes) -> softmax.

    Args:
        in_dim: length of each input vector. Defaults to the notebook-level
            ``x_dim`` so that ``ConvolutionalClassifier()`` keeps working.
        num_classes: number of output classes (default 7).
    """

    def __init__(self, in_dim=None, num_classes=7):
        super(ConvolutionalClassifier, self).__init__()
        self.in_dim = x_dim if in_dim is None else in_dim

        self.conv1 = nn.Conv1d(1, 64, kernel_size=3)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=4)

        # Sequence length after each (conv with kernel 3, pool with kernel 4) stage.
        size = int((self.in_dim - 3) + 1)//4
        size = int((size - 3) + 1)//4

        self.fc1 = nn.Linear(32*size, 50)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, num_classes)``."""
        batch = x.size(0)
        # Add the single input channel expected by Conv1d.
        x = x.view(-1, 1, self.in_dim)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        # Flatten channels and positions. The pooled activations are already
        # non-negative, so no further ReLU is needed here.
        x = x.view(batch, -1)
        x = self.fc1(x)
        x = self.fc2(x)
        return F.softmax(x, dim=-1)

# NOTE(review): this instance is not referenced by any executed code in the visible
# cells, so the models appear to be trained with their built-in classifier.
classifier = ConvolutionalClassifier()

In [None]:
torch.cuda.empty_cache()

In [None]:
# Active-learning experiment with plain BALD acquisition: 50 rounds, each training
# a fresh model and then moving NUM_QUERY pool rows into the labelled set.
errors = []
batch_size=50
# Start from the initial split every time this cell runs. Without the reset the loop
# would continue from whatever labelled/unlabelled state an earlier run left behind,
# while `errors` is started afresh.
labelled, unlabelled = (x_labelled, y_labelled), (x_unlabelled, y_unlabelled)
for i in range(50):
    model = initialize_model(x_dim, y_dim, z_dim, h_dim, prior_mean, prior_var)
    #classifier = ConvolutionalClassifier()
    #model.classifier = classifier
    train_labelled, train_unlabelled = create_data_sets(labelled, unlabelled, batch_size)

    print(labelled[0].shape[0], unlabelled[0].shape[0])
    error, acc = train_semi_supervised(model, train_labelled, train_unlabelled, validation, cuda, epochs=10)
    # One record per round: set sizes plus last-epoch validation loss and accuracy.
    errors.append((labelled[0].shape[0], unlabelled[0].shape[0], error, acc))

    # Pick the most informative pool rows and move them to the labelled set.
    new_data = query_new_data(NUM_QUERY, NUM_MC_SAMPLES, model, unlabelled[0], batch_size)
    labelled, unlabelled = rearange_datasets(labelled, unlabelled, new_data)
    torch.cuda.empty_cache()

In [None]:
# Unpack the per-round records into parallel lists for plotting.
num_labelled = []
num_unlabelled = []
cost = []
acc = []

for n_l, n_u, c, a in errors:
    num_labelled.append(n_l)
    num_unlabelled.append(n_u)
    cost.append(c)
    # The accuracy comes back from training as a 0-dim torch tensor (on the GPU when
    # CUDA is used), which matplotlib cannot convert; store a plain float instead.
    acc.append(float(a))

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, cost)
plt.xlabel("Number of labelled data")
plt.ylabel("Cost")
plt.savefig("n_labelled_vs_cost-turkish_news.png")

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, acc)
plt.xlabel("Number of labelled data")
plt.ylabel("Acc")
plt.savefig("n_labelled_vs_acc-turkish_news.png")

In [None]:
errors_2 = [(n_l, n_u, c, a.item()) for n_l, n_u, c, a in errors]

In [None]:
import pickle

In [None]:
with open("TN-data-erros.pkl", "wb") as f:
    pickle.dump(errors_2, f)

# Representativeness

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize


def add_vector_sparse(X,v):
    """Add ``v[row]`` in place to every stored entry of each row of ``X``.

    Only explicitly stored values in ``X.data`` are changed; implicit zeros stay
    zero. Consecutive pairs of ``X.indptr`` are taken as the (start, stop)
    bounds of a row, i.e. a row-compressed (CSR) layout is assumed.
    NOTE(review): calc_representativeness_scores passes the result of
    ``.transpose().copy()`` - confirm that matrix is in the expected layout.

    Args:
        X: sparse matrix exposing ``shape``, ``indptr`` and ``data``.
        v: indexable with one offset per row of ``X``.

    Returns:
        None. ``X.data`` is modified in place.
    """
    n_rows, _ = X.shape
    row_starts = X.indptr[:n_rows]
    row_stops = X.indptr[1:n_rows + 1]
    for row, (start, stop) in enumerate(zip(row_starts, row_stops)):
        # The slice is a view, so the in-place addition writes into X.data.
        segment = X.data[start:stop]
        segment += v[row]

def calc_representativeness_scores(corpus, lambda_=0.9, B=0.3, tokenized=False):
    """Compute one score per document from smoothed word distributions.

    Args:
        corpus: raw documents that are vectorized with ``CountVectorizer`` when
            ``tokenized`` is False; otherwise used directly as the count matrix
            ``X`` with one row per document (presumably a scipy sparse matrix,
            since ``.data`` of the normalized matrix is accessed below).
        lambda_: weight applied to the per-document word probabilities.
        B: factor that, divided by the number of documents, scales the exponent.
        tokenized: despite its name, signals that ``corpus`` already is a
            document-term matrix.

    Returns:
        ``z_i = exp((B / D) * kl_sum)`` with one entry per document.
    """
    
    if tokenized:
        X = corpus
    else:
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
    
    # Number of documents (rows of X).
    D = X.shape[0]
    
    # Corpus-wide word distribution: column sums divided by the total count.
    p_w = X.sum(axis=0) / X.sum()
    # Per-document word distribution: every row L1-normalized.
    p_w_given_d_i = normalize(X, norm='l1', axis=1)
    
    # Per document: sum_w p(w|d_i) * (1 - lambda_) * log p(w).
    log_p_w = np.log(p_w)
    dist_disjoint = p_w_given_d_i.dot((1-lambda_) * log_p_w.T)
    
    # Flatten p_w to a 1-D array so it can be indexed per row in add_vector_sparse.
    p_w = np.squeeze(np.asarray(p_w))
    
    # NOTE(review): transposing a scipy CSR matrix yields a CSC matrix; if that is the
    # case here, `indptr` inside add_vector_sparse delimits documents rather than
    # words, so the per-word offsets below may be applied along the wrong axis - confirm.
    log_p_w_given_d_i = p_w_given_d_i.transpose().copy()

    # Stored entries become lambda_ * p(w|d_i) plus an offset of lambda_ * p_w.
    # NOTE(review): linear-interpolation smoothing would normally weight the corpus
    # term with (1 - lambda_) rather than lambda_ - confirm the intended formula.
    log_p_w_given_d_i.data = lambda_*log_p_w_given_d_i.data 
    add_vector_sparse(log_p_w_given_d_i, lambda_*p_w)

    # Take the log of the stored entries, then subtract (1 - lambda_) * log p_w.
    log_p_w_given_d_i.data = np.log(log_p_w_given_d_i.data)
    add_vector_sparse(log_p_w_given_d_i, -(1-lambda_)*np.log(p_w))
    
    log_p_w_given_d_i = log_p_w_given_d_i.transpose()
    
    # Sum the log terms over all documents (axis 0), then weight them with each
    # document's own word distribution.
    dist_common = p_w_given_d_i.dot(log_p_w_given_d_i.sum(axis=0).T)
    
    dist_common = np.squeeze(np.asarray(dist_common))
    dist_disjoint = np.squeeze(np.asarray(dist_disjoint))
    
    dist_all = dist_common + dist_disjoint
    
    # Per document: sum_w p(w|d_i) * log p(w|d_i) over stored entries, i.e. the
    # negative of the Shannon entropy despite the variable name.
    entropy = p_w_given_d_i.copy()
    entropy.data = p_w_given_d_i.data*np.log(p_w_given_d_i.data)

    entropy = np.squeeze(np.asarray(entropy.sum(axis=1)))
    
    # Combine both parts, scale by B / D and exponentiate.
    kl_sum = dist_all - D * entropy
    kl_sum *= (B / D) 
    z_i = np.exp(kl_sum)
    
    return z_i

In [None]:
def query_new_data(num_data, num_mc_samples, model, unlabelled_data, repr_scores=None, batch_size=None):
    """Select the unlabelled rows with the highest (optionally weighted) BALD score.

    Args:
        num_data: maximum number of row positions to return.
        num_mc_samples: number of stochastic forward passes.
        model: model handed to ``mc_samples``.
        unlabelled_data: NumPy array holding one row per unlabelled document.
        repr_scores: optional per-row weights (same length as the pool); when
            given, the BALD scores are multiplied by them before ranking.
        batch_size: optional number of rows scored per forward pass. ``None``
            (the default) scores the whole pool in a single pass.

    Returns:
        NumPy array with up to ``num_data`` row positions into
        ``unlabelled_data``, ordered from highest to lowest score. Empty when
        the pool is empty.
    """
    num_samples = unlabelled_data.shape[0]
    if num_samples == 0:
        return np.array([], dtype=int)

    # Chunked scoring bounds peak memory; one chunk spanning the whole pool
    # reproduces the single-pass behaviour.
    step = num_samples if batch_size is None else batch_size
    chunk_scores = []
    for start in range(0, num_samples, step):
        chunk = torch.from_numpy(unlabelled_data[start:start + step]).float()
        if cuda:
            chunk = chunk.cuda()
        probs = mc_samples(num_mc_samples, model, chunk).cpu().detach().numpy()
        chunk_scores.append(bald_acq(probs))
    scores = np.hstack(chunk_scores)

    if repr_scores is not None:
        scores = scores * repr_scores

    # argsort is ascending; reverse it to rank the highest scores first.
    return scores.argsort()[::-1][:num_data]

In [None]:
# Active-learning experiment with BALD scores weighted by representativeness.
errors = []
batch_size=50
# Start from the initial split so this experiment is comparable to (and independent
# of) any earlier run; otherwise the pool left behind by a previous loop no longer
# lines up with the row ids used to look up repr_scores.
labelled, unlabelled = (x_labelled, y_labelled), (x_unlabelled, y_unlabelled)
# Work on a copy of the pool's row ids so the ids of the original split stay intact
# and the cell can be re-run.
pool_row_ids = np.array(row_ids_unlabelled)
for i in range(50):
    model = initialize_model(x_dim, y_dim, z_dim, h_dim, prior_mean, prior_var)
    #classifier = ConvolutionalClassifier()
    #model.classifier = classifier
    train_labelled, train_unlabelled = create_data_sets(labelled, unlabelled, batch_size)

    print(labelled[0].shape[0], unlabelled[0].shape[0])
    error, acc = train_semi_supervised(model, train_labelled, train_unlabelled, validation, cuda, epochs=5)
    # One record per round: set sizes plus last-epoch validation loss and accuracy.
    errors.append((labelled[0].shape[0], unlabelled[0].shape[0], error, acc))

    # pool_row_ids[k] is the repr_scores index of the k-th row of the current pool.
    new_data = query_new_data(NUM_QUERY, NUM_MC_SAMPLES, model, unlabelled[0], repr_scores=repr_scores[pool_row_ids])
    #new_data = np.random.choice(unlabelled[0].shape[0], NUM_QUERY)
    # Remove the same positions from the ids and from the pool to keep them aligned.
    pool_row_ids = np.delete(pool_row_ids, new_data, axis=0)
    labelled, unlabelled = rearange_datasets(labelled, unlabelled, new_data)
    torch.cuda.empty_cache()

In [None]:
# Unpack the per-round records into parallel lists for plotting.
num_labelled = []
num_unlabelled = []
cost = []
acc = []

for n_l, n_u, c, a in errors:
    num_labelled.append(n_l)
    num_unlabelled.append(n_u)
    cost.append(c)
    # The accuracy comes back from training as a 0-dim torch tensor (on the GPU when
    # CUDA is used), which matplotlib cannot convert; store a plain float instead.
    acc.append(float(a))

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, cost)
plt.xlabel("Number of labelled data")
plt.ylabel("Cost")
plt.savefig("n_labelled_vs_cost-turkish_news-represent.png")

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, acc)
plt.xlabel("Number of labelled data")
plt.ylabel("Acc")
plt.savefig("n_labelled_vs_acc-turkish_news-represent.png")

In [None]:
errors_2 = [(n_l, n_u, c, a.item()) for n_l, n_u, c, a in errors]

In [None]:
import pickle

In [None]:
# Persist the (labelled, unlabelled, loss, accuracy) records of this experiment.
# NOTE(review): unlike the earlier dump, this file is written into "plots/", which
# must already exist or open() raises FileNotFoundError.
with open("plots/TN-data-erros-represent.pkl", "wb") as f:
    pickle.dump(errors_2, f)