# The dataset
---
Downloaded from https://dumps.wikimedia.org/enwiki/ and used this method https://github.com/markriedl/WikiPlots to extract the info and help recreate the corpus.

The raw data is a Wikipedia dump of English articles that contains a sub-header that contains the word **"plot"** (e.g., "Plot", "Plot Summary", etc.).

When the corpus is recreated we have two files:

* plots: a text file containing all story plots. Each story plot is given with one sentence per line. Each story is followed by **`<EOS>`** on a line by itself.
* titles: a text file containing a list of titles for each article in whih a story plot was found and extracted.

The dataset used was uploaded in 23-Mar-2017 14:24 and can be found [here](https://dumps.wikimedia.org/enwiki/). It's a 56Gb `.xml` file zipped as `.bz2` with 14Gb size.

When extracted the articles are separated by folders (i.e. "AA/", "AB/", "AC/"), with several stories in it.

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import numpy as np

# Stemming

***

In [6]:
stemmer = PorterStemmer()

def stem(text):
    text_stem = [stemmer.stem(token) for token in text.split(' ')]
    text_stem_join = ' '.join(text_stem)
    return text_stem_join

# Reading the dataset
***

For this example I used the story plots from the "AB/" folder.

In [7]:
dataset_dir = 'dataset/'
plots_filename = 'plots_full.txt'
titles_filename = 'titles_full.txt'
separator = '<EOS>'

with open(dataset_dir + plots_filename, 'r') as file:
    corpus = file.readlines()
    corpus = corpus[:-1]
    corpus = ''.join(corpus)
    corpus = corpus.split(separator)

with open(dataset_dir + titles_filename, 'r') as file:
    titles = file.readlines()

## Extracted stories
---

In [8]:
N = len(titles)
print('Total of extracted stories:', N)
# titles

Total of extracted stories: 28149


## Plot example (A Song of Ice and Fire)
---

In [9]:
# corpus[43]

# Fitting and transforming the stemmed corpus
---

In [10]:
corpus_stem = list(map(stem, corpus))
tfidf = TfidfVectorizer(norm='l2', use_idf=True, stop_words='english')

X = tfidf.fit_transform(corpus_stem)

# Extracted features
---

In [11]:
print('Total features extracted:', len(tfidf.get_feature_names()))
# print(tfidf.get_feature_names())

# print(X.toarray())

Total features extracted: 159562


# Cosine Similarities of NxN articles
---

In [None]:
def calculate_cosine_similarity(i, j):
    if (j > i):
        return
    a[i][j] = cosine_similarity(X[i].toarray(), X[j].toarray())[0][0]

In [None]:
# from itertools import product
from concurrent.futures import ThreadPoolExecutor

a = np.zeros(shape=(N,N))
print('Creating sparse matrix...')
# for i,j in zip(range(N), range(N)):
#     calculate_cosine_similarity(i, j)
# for i,j in product(range(N), range(N)):
with ThreadPoolExecutor(max_workers=2) as executor:
    for i in range(N):
        for j in range(N):
            executor.submit(calculate_cosine_similarity, i, j)
    
print('Done.')
print('Creating DataFrame...')
df = pd.DataFrame(data=a)
print('Done')
# df

Creating sparse matrix...


# Setting 1's to 0's and getting most similar articles
---

In [None]:
df[df == df.max()] = 0

most_similar = np.array(df.apply(lambda x: df.columns[x.argmax()], axis = 1))
most_similar_values = np.array(df.apply(lambda x: x.max(), axis = 1))

In [None]:
# most_similar_values
# np.sort(most_similar_values)[-1]

# Most similar articles
---

In [None]:
# for i in range(len(most_similar)):
#     print('Most similar article to', titles[i], 'is:', titles[most_similar[i]], 'with cosine similarity:', df[most_similar[i]].max())
#     print()

# Most similar articles

In [None]:
most_similar_articles_indexes = np.where(most_similar_values == np.sort(most_similar_values)[-1])[0]
top_plot_one_index = most_similar_articles_indexes[0]
top_plot_two_index = most_similar[most_similar_articles_indexes[0]]

top_title_one = titles[top_plot_one_index]
top_title_two = titles[most_similar[most_similar_articles_indexes[0]]]

top_corpus_one = corpus[most_similar_articles_indexes[0]]
top_corpus_two = corpus[most_similar[most_similar_articles_indexes[0]]]
print('Most similar articles:')
print(top_title_one, 'and', top_title_two, 'cos:', df.iloc[most_similar_articles_indexes[0]].max())
print()
print('Plot of', top_title_one)
print()
print(top_corpus_one)
print()
print()
print('Plot of', top_title_two)
print()
print(top_corpus_two)

# Wordcloud of most similar articles

In [None]:
# !pip install wordcloud
%matplotlib
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wc = WordCloud(background_color='white', max_words=30, stopwords=tfidf.get_stop_words())
wc_ = WordCloud(background_color='white', max_words=30, stopwords=tfidf.get_stop_words())

wc.generate(top_corpus_one)
wc_.generate(top_corpus_two)


fig = plt.figure()
a = fig.add_subplot(1,2,1)
imgplot = plt.imshow(wc)
plt.title(top_title_one)
plt.axis("off")

a = fig.add_subplot(1,2,2)
imgplot = plt.imshow(wc_)
plt.title(top_title_two)
plt.axis("off")

plt.show()

# Sparse matrix of similarity

In [None]:
import seaborn as sns

sns.heatmap(df)