In [None]:
import os
from zipfile import ZipFile

# Folder of the course repository that hosts both data files.
base_url = "https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a01"


def _download(filename):
    """Fetch `filename` from the assignment repository into the working directory.

    Nothing is downloaded when a file with that name already exists locally.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved in place of the real data.
    """
    if os.path.exists(filename):
        return
    # Imported here so that `requests` is available for every download
    # (not only the first one) and is only required when a file is missing.
    import requests
    with requests.get(f"{base_url}/{filename}", timeout=60) as response:
        response.raise_for_status()
        with open(filename, "wb") as fp:
            fp.write(response.content)


# Movie review archive.
download_name = "imdb.zip"
_download(download_name)

# Unpack the archive once; an existing "imdb" directory is left untouched.
name = "imdb"
if not os.path.exists(name):
    with ZipFile(download_name) as zf:
        zf.extractall(path=name)

# Stop-word list.
name = "stopwords.txt"
_download(name)

In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from itertools import chain

# 1. Read data

Path "data/imdb/pos" has 1000 txt files with positive movie reviews.  
Path "data/imdb/neg" has 1000 txt files with negative movie reviews.  
Each filename includes a counter, for example "data/imdb/pos/pos009_29592.txt".  
Read the files into a dictionary `data` with keys "pos" and "neg", where each value is the list of file contents ordered by filename.

In [None]:
# Both sentiment classes must contain exactly 1000 reviews.
for label in ("neg", "pos"):
    assert len(data[label]) == 1000

# Spot-check the content: the first positive review must start, and the
# last negative review must end, with a known 30-character snippet.
first_positive = data["pos"][0]
assert first_positive[:30] == "films adapted from comic books"
last_negative = data["neg"][-1]
assert last_negative[-30:] == "left with exactly the same . \n"

# 1.  Read stop words

File "stopwords.txt" (downloaded by the first cell) contains English stop words.  
Read the stop words from the file and store them in a set named `stopwords`.

In [None]:
# The stop-word collection must hold exactly 318 entries.
assert len(stopwords) == 318

# The stop words starting with "so" must be exactly this set.
so_words = set(filter(lambda word: word.startswith("so"), stopwords))
assert so_words == {'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere'}

# 2. Tokenization

Create function *tokenize* which: 
* for a given text 
* returns a list of lowercase tokens (a token may contain only letters)

In [None]:
# Number of tokens expected in the first review of each class.
expected_token_counts = {"pos": 649, "neg": 669}
for label, n_tokens in expected_token_counts.items():
    assert len(list(tokenize(data[label][0]))) == n_tokens

# First ten tokens of the first positive review.
expected_head = ['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success']
assert list(tokenize(data["pos"][0]))[:10] == expected_head

# Last ten tokens of the first negative review.
expected_tail = ['crow', 'salvation', 'lost', 'highway', 'memento', 'the', 'others', 'stir',  'of', 'echoes']
assert list(tokenize(data["neg"][0]))[-10:] == expected_tail

# 3. Word counter

Create function *make_counter* which: 
* receives a corpus (a list of strings) and a set of stopwords,
* tokenizes each string, 
* removes stop words, and 
* returns a `Counter` (a dictionary subclass) whose keys are tokens and whose values are their frequencies.

Note: `Counter` class from `collections` module can be used  
`from collections import Counter`

In [None]:
# Distinct tokens in the first two negative reviews, stop words removed.
neg_counter = make_counter(data["neg"][:2], stopwords)
assert len(neg_counter) == 284

# Distinct tokens in the last two positive reviews, with an empty stop-word collection.
pos_counter = make_counter(data["pos"][-2:], stopwords=[])
assert len(pos_counter) == 613

# 4. Term frequency - inverse document frequency

Create function *make_tfidf* which: 
* receives corpus and stopwords, and
* returns tensor whose size is #doc x #words

Note: Use `TfidfVectorizer` from `sklearn.feature_extraction.text`.  
Instantiate `TfidfVectorizer` by setting `stop_words` and `tokenizer` arguments.


In [None]:
# Expected vocabulary size and sum of all TF-IDF weights when each class
# is vectorized on its own.
expected_tfidf = {
    "neg": (26764, 11988.8478),
    "pos": (28699, 12237.4325),
}

for label, (n_words, total) in expected_tfidf.items():
    # Build the matrix once per class and reuse it for both checks instead of
    # re-running the vectorizer for every assertion.
    matrix = make_tfidf(data[label], stopwords)
    assert matrix.shape == (1000, n_words), \
        f"{label}: shape is {tuple(matrix.shape)}, but must be (1000, {n_words})"
    assert torch.isclose(torch.sum(matrix), torch.tensor(total, dtype=torch.float64)), \
        f"{label}: sum of TF-IDF values is {float(torch.sum(matrix)):.4f}, but must be close to {total}"

## 5.1. Most common words

Count words from both positive and negative reviews by using `make_counter`.  
List top ten most common words.

In [None]:
# Count tokens over the whole corpus: positive reviews followed by negative ones.
all_reviews = data["pos"] + data["neg"]
counter = make_counter(all_reviews, stopwords)

# The final expression is displayed as a table of the ten most frequent words.
top_ten = counter.most_common(10)
pd.DataFrame(top_ten, columns=["word", "freq"])


## 5.2. Plot word frequencies

Make scatter plot where:
* x axis represents words sorted by frequencies and
* y axis are frequencies (use log scale for this axis)

In [None]:
# Frequencies in ascending order; a word's x position is its rank in that order.
freqs = list(counter.values())
freqs.sort()
ranks = range(len(freqs))

# Draw on the current axes, with a logarithmic frequency axis.
ax = plt.gca()
ax.scatter(ranks, freqs, marker=".")
ax.set_ylabel("freq")
ax.set_xlabel("words")
ax.set_yscale("log")

plt.show()

## 6.1. TF-IDF tensor

Make a single TF-IDF tensor `tfidf` for the positive and negative reviews together. The tensor must have 2000 rows: the 1000 positive reviews first, followed by the 1000 negative reviews.  
Calculate the ratio of non-zero values in the TF-IDF tensor.

In [None]:
# The first 1000 rows and the remaining rows are checked separately
# for their number of non-zero entries.
first_half = tfidf[:1000]
assert first_half.count_nonzero() == torch.tensor(242389)
second_half = tfidf[1000:]
assert second_half.count_nonzero() == torch.tensor(218810)

# Sum of every value in the joint tensor.
expected_total = torch.tensor(24080.7049, dtype=torch.float64)
assert torch.isclose(torch.sum(tfidf), expected_total)


## 6.2. TF-IDF tensor slicing

Create two tensors, one for positive and one for negative reviews by slicing previously made TF-IDF tensor of both reviews.

In [None]:
# Expected number of non-zero entries in each slice.
expected_nonzero_pos = torch.tensor(242389)
assert tfidf_pos.count_nonzero() == expected_nonzero_pos
expected_nonzero_neg = torch.tensor(218810)
assert tfidf_neg.count_nonzero() == expected_nonzero_neg

## 6.3. Similarity

Create a similarity tensor `sim` whose $(i, j)$ value is the cosine similarity of the $i$-th positive review and the $j$-th negative review.  
The cosine similarity between two vectors $u$ and $v$ is 
$$\cos(u, v) = \frac{u \cdot v}{\lVert u \rVert \lVert v \rVert} = \frac{\sum_{i=1}^{n} u_i v_i}{\sqrt{\sum_{i=1}^{n} u_i^2} \sqrt{\sum_{i=1}^{n} v_i^2}}$$

In [None]:
# One row per positive review, one column per negative review.
expected_shape = (tfidf_pos.shape[0], tfidf_neg.shape[0])
assert sim.shape == expected_shape, f"shape of sim tensor is {tuple(sim.shape)}, but must be (1000, 1000)"

# Reference values for the top-left 3x3 corner, compared with an absolute tolerance.
expected_corner = torch.tensor(
    [[0.0492, 0.0326, 0.0477],
     [0.0463, 0.0240, 0.0539],
     [0.0462, 0.0409, 0.0445]],
    dtype=torch.float64,
)
assert torch.allclose(sim[:3, :3], expected_corner, atol=1e-04)

## 6.4. Most similar positive and negative review

Find the most similar pair of one positive and one negative review, and store their indices in `pos_i` and `neg_i`. 
Print their sorted tokenized text without stopwords.

In [None]:
# Indices (positive review, negative review) expected for the most similar pair.
expected_pair = (789, 697)
assert (pos_i, neg_i) == expected_pair

In [None]:
# Tokens of the selected positive review, stop words dropped, in alphabetical order.
pos_tokens = [tok for tok in tokenize(data["pos"][pos_i]) if tok not in stopwords]
pos_tokens.sort()

print(" ".join(pos_tokens))

In [None]:
# Tokens of the selected negative review, stop words dropped, in alphabetical order.
neg_tokens = [tok for tok in tokenize(data["neg"][neg_i]) if tok not in stopwords]
neg_tokens.sort()

print(" ".join(neg_tokens))