In [18]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data
import string

import pandas as pd
import gzip

sys.path.append("..") 
print(torch.__version__)

1.0.0


In [275]:
# Please unzip all the zip file before runing this cell

# IMDB movie reviews
imdb = pd.read_csv('imdb_master2.csv')
imdb_text = imdb['review'].tolist()

In [235]:
np.unique(imdb['label'])

array(['neg', 'pos', 'unsup'], dtype=object)

In [23]:
# recent news
news = pd.read_csv('news_summary/news_summary2.csv')

# content
news_ctext = news['ctext'].tolist()

# summaries
news_text = news['text'].tolist()

In [24]:
# novel "the Hunger Games"
with open('the_hunger_games.txt', 'r') as f:
    lines_hg = f.readlines()

In [25]:
# novel "Catching Fire"
with open('catching_fire.txt', 'r') as g:
    lines_cf = g.readlines()

In [28]:
def clean_novel(lines):
    import re
    import string
    txt = ''.join(lines)
    sentences = [x for x in map(str.strip, re.split(',|\.|\n|\?|;|!', txt)) if x]
    pre_words = [x.translate(str.maketrans('', '', string.punctuation)).lower().split() for x in sentences]
    
    # only select sentences that have more than 3 words
    words = list(filter(lambda x: len(x) > 3, pre_words))
    return(words)

In [196]:
thg = clean_novel(lines_hg)
cf = clean_novel(lines_cf)

hg = thg + cf
len(hg)

21377

In [198]:
news_sum = clean_novel(news_text)
len(news_sum)

20366

In [253]:
reviews =  clean_novel(imdb_text)[:20000]
len(reviews)

20000

## Pytorch Framework

In [116]:
def discard(idx):
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)

In [117]:
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

In [118]:
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

In [119]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])

    def __len__(self):
        return len(self.centers)

In [120]:
def batchify(data):
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

In [121]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_and_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1))
    return pred

In [122]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self): # none mean sum
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        """
        input – Tensor shape: (batch_size, len)
        target – Tensor of the same shape as input
        """
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none", weight=mask)
        return res.mean(dim=1)

In [123]:
def sigmd(x):
    return - math.log(1 / (1 + math.exp(-x)))

In [124]:
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]

            pred = skip_gram(center, context_negative, net[0], net[1])

            l = (loss(pred.view(label.shape), label, mask) *
                 mask.shape[1] / mask.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        if (epoch + 1) % 5 == 0:
            print('epoch %d, loss %.2f, time %.2fs' % (epoch + 1, l_sum / n, time.time() - start))

In [184]:
def get_similar_tokens(query_token, k, embed, n_components=3, random_state=1):
    W = torch.tensor(PCA(n_components=n_components, random_state=random_state).fit_transform(embed.weight.data))
    x = W[token_to_idx[query_token]]
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

In [186]:
def triple_similar_tokens(one, two, three, k, embed, n_components=3, random_state=1):
    W = torch.tensor(PCA(n_components=n_components, random_state=random_state).fit_transform(embed.weight.data))
    x = W[token_to_idx[one]] - W[token_to_idx[two]] + W[token_to_idx[three]]
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

### IMDB movie review

In [254]:
counter = collections.Counter([tk for st in reviews for tk in st])
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

In [255]:
idx_to_token = [tk for tk, _ in counter.items()]
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in reviews]
num_tokens = sum([len(st) for st in dataset])
'# tokens: %d' % num_tokens

'# tokens: 208910'

In [256]:
np.random.seed(1)

subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 72941'

In [257]:
words = set([idx_to_token[x] for y in subsampled_dataset for x in y])
len(words)

3850

In [258]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [259]:
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [260]:
batch_size = 512
num_workers = 0 if sys.platform.startswith('win32') else 4

dataset = MyDataset(all_centers, 
                    all_contexts, 
                    all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify, 
                            num_workers=num_workers)

In [261]:
loss = SigmoidBinaryCrossEntropyLoss()

In [262]:
embed_size = 100
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
)

In [263]:
train(net, 0.01, 50)

train on cpu
epoch 5, loss 0.52, time 24.72s
epoch 10, loss 0.28, time 24.77s
epoch 15, loss 0.22, time 24.58s
epoch 20, loss 0.19, time 26.69s
epoch 25, loss 0.17, time 25.19s
epoch 30, loss 0.15, time 24.79s
epoch 35, loss 0.15, time 24.93s
epoch 40, loss 0.14, time 24.78s
epoch 45, loss 0.14, time 24.74s
epoch 50, loss 0.13, time 25.39s


## Sentiment Analysis with Logistic Regression

In [276]:
imdb_text_clean = ["".join(y for y in x.translate(str.maketrans('', '', string.punctuation)).lower() if y not in string.digits)
                   for x in imdb_text]

In [279]:
len(imdb_text_clean)

100000

In [278]:
label = imdb["label"]

100000

In [289]:
def transform(sentence):
    res = np.zeros(embed_size)
    num = 0
    for word in sentence.split():
        if word in words:
            res += np.array(net[0].weight.data[token_to_idx[word]])
            num += 1
    return res/num if num > 0 else res

In [294]:
from tqdm import tqdm

imdb_text_transform = np.zeros(shape=(len(imdb_text_clean), embed_size))

with tqdm(total=len(imdb_text_clean)) as pbar:
    for i in range(len(imdb_text_clean)):
        imdb_text_transform[i] = transform(imdb_text_clean[i])
        pbar.update(1)

100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [04:27<00:00, 374.11it/s]


In [299]:
from sklearn.linear_model import LogisticRegression

In [309]:
imdb_text_transform = imdb_text_transform[label!="unsup"]
label = label[label!="unsup"]
print(imdb_text_transform.shape)
print(label.shape)

(50000, 100)
(50000,)


In [315]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(imdb_text_transform, label, random_state=0)

In [318]:
lr = LogisticRegression(solver="lbfgs", max_iter=1000)
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.70768
0.70048


In [330]:
from sklearn.metrics import confusion_matrix

confusion_matrix(lr.predict(imdb_text_transform), label)

array([[17550,  7256],
       [ 7450, 17744]], dtype=int64)

In [329]:
neg_prob = lr.predict_proba(imdb_text_transform[0:1000])[:, 0]
print(imdb_text[np.argmax(neg_prob)])
print(imdb_text[np.argmin(neg_prob)])

I thought this movie was horrible. I was bored and had to use all the self control I have to not scream at the screen. Mod Squad was beyond cheesy, beyond cliche, and utterly predictable.
Sorry to disagree with you, but I found the DKC series to be quite engaging. So much so that I invested in the SNES system and my own copies of the games. This is, mind you, almost ten years after the initial release of DKC 1. The graphics were ground-breaking for their time, the first vector graphics games for home systems. The music and characters are all memorable, and the games brought myself and my girlfriend dozens of hours of entertainment. True, the second game was better than the first, and the third was perhaps lacking the 'edge' of the second installment. But all three offered different play, and I enjoy them to this day. By the way, I'm old enough to remember when there were NO video games whatsoever (and TVs were black and white!).
