In [188]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import numpy as np

# Stemming

In [189]:
stemmer = PorterStemmer()

def stem(text):
    text_stem = [stemmer.stem(token) for token in text.split(' ')]
    text_stem_join = ' '.join(text_stem)
    return text_stem_join

# Reading the dataset

In [190]:
dataset_dir = 'dataset/'
plots_filename = 'plots_AB.txt'
titles_filename = 'titles_AB.txt'
separator = '<EOS>'

with open(dataset_dir + plots_filename, 'r') as file:
    corpus = file.readlines()
    corpus = corpus[:-1]
    corpus = ''.join(corpus)
    corpus = corpus.split(separator)

with open(dataset_dir + titles_filename, 'r') as file:
    titles = file.readlines()

# Tf-idf of NxN articles

In [191]:
corpus_stem = list(map(stem, corpus))
tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False, stop_words='english')

X = tfidf.fit_transform(corpus_stem)

# print(tfidf.get_feature_names())
# print(X.toarray())

N = len(X.toarray())
a = np.zeros(shape=(N,N))
for i in range(N):
    for j in range(N):
        a[i][j] = cosine_similarity(X[i].toarray(), X[j].toarray())[0][0]
        
df = pd.DataFrame(data=a)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,1.000000,0.019076,0.052666,0.019383,0.018025,0.017022,0.017283,0.031032,0.017844,0.036315,...,0.023715,0.012530,0.024409,0.025717,0.020528,0.008696,0.015482,0.030056,0.024627,0.028876
1,0.019076,1.000000,0.012574,0.013552,0.017013,0.012699,0.017425,0.029323,0.013013,0.034748,...,0.002590,0.006913,0.010661,0.012588,0.012650,0.008580,0.011932,0.014111,0.013457,0.026432
2,0.052666,0.012574,1.000000,0.015598,0.022047,0.014120,0.016235,0.032556,0.020003,0.025494,...,0.008874,0.007232,0.016076,0.024340,0.018774,0.020582,0.020474,0.005352,0.024299,0.024871
3,0.019383,0.013552,0.015598,1.000000,0.098799,0.019975,0.118243,0.048638,0.044088,0.039705,...,0.075489,0.054940,0.060496,0.027327,0.031043,0.014032,0.014049,0.025536,0.023548,0.029104
4,0.018025,0.017013,0.022047,0.098799,1.000000,0.017013,0.049068,0.016696,0.026077,0.038070,...,0.060280,0.043570,0.031656,0.030876,0.017845,0.009372,0.017669,0.040640,0.020823,0.031497
5,0.017022,0.012699,0.014120,0.019975,0.017013,1.000000,0.023281,0.021658,0.015673,0.030075,...,0.010919,0.010860,0.011762,0.020275,0.121161,0.007213,0.016966,0.015378,0.018558,0.020720
6,0.017283,0.017425,0.016235,0.118243,0.049068,0.023281,1.000000,0.029798,0.044874,0.037248,...,0.031765,0.022020,0.039962,0.025330,0.028138,0.087500,0.010802,0.033993,0.055216,0.031981
7,0.031032,0.029323,0.032556,0.048638,0.016696,0.021658,0.029798,1.000000,0.067117,0.048999,...,0.019561,0.007172,0.010669,0.023863,0.027917,0.016749,0.016473,0.016990,0.020706,0.025728
8,0.017844,0.013013,0.020003,0.044088,0.026077,0.015673,0.044874,0.067117,1.000000,0.058202,...,0.016763,0.013562,0.013155,0.018196,0.036257,0.018661,0.017496,0.011704,0.028741,0.035001
9,0.036315,0.034748,0.025494,0.039705,0.038070,0.030075,0.037248,0.048999,0.058202,1.000000,...,0.029420,0.024605,0.021786,0.028417,0.021205,0.014584,0.039619,0.053496,0.043064,0.059454


# Setting 1's to 0's

In [192]:
df[df == df.max()] = 0 # 1's become 0's

most_similar = np.array(df.apply(lambda x: df.columns[x.argmax()], axis = 1)) # column name of the highest value
most_similar_values = np.array(df.apply(lambda x: x.max(), axis = 1))
# df.max(axis=1)
# print(df[5][0])
# print(df[20][0])

In [207]:
print(np.where(most_similar_values == np.sort(most_similar_values)[-1])[0])

46


# Most similar articles

In [194]:
for i in range(len(most_similar)):
    print('Most similar article to', titles[i], 'is:', titles[most_similar[i]], 'with cosine similarity:', df[most_similar[i]].max())
    print()

Most similar article to Day of the Tentacle
 is: Gunpowder Plot
 with cosine similarity: 0.605628311185

Most similar article to Doraemon
 is: Fawlty Towers
 with cosine similarity: 0.0562546063426

Most similar article to Dressed to Kill (1980 film)
 is: Glen or Glenda
 with cosine similarity: 0.0849503514096

Most similar article to Doom (1993 video game)
 is: Duke Nukem 3D
 with cosine similarity: 0.176464982712

Most similar article to Diablo II
 is: Doom (1993 video game)
 with cosine similarity: 0.118242877494

Most similar article to Dune Messiah
 is: Heretics of Dune
 with cosine similarity: 0.169355400427

Most similar article to Duke Nukem 3D
 is: Escape from New York
 with cosine similarity: 0.176464982712

Most similar article to Dr. Strangelove
 is: The Return of Godzilla
 with cosine similarity: 0.268947976008

Most similar article to Das Boot
 is: Galaxy Quest
 with cosine similarity: 0.104497704058

Most similar article to Death of a Hero
 is: Glen or Glenda
 with cosin

# Most similar articles

In [211]:
most_similar_articles_indexes = np.where(most_similar_values == np.sort(most_similar_values)[-1])[0]
print(most_similar_articles_indexes)
print('Most similar articles:')
print(titles[most_similar_articles_indexes[0]], 'and', titles[most_similar_articles_indexes[1]], 'cos:', df[most_similar_articles_indexes[0]].max())
print('Plot of', titles[most_similar_articles_indexes[0]])
print()
print(corpus[most_similar.argmax()])
print()
print()
print('Plot of', titles[most_similar_articles_indexes[1]])
print()
print(corpus[most_similar.max()])

[46 50]
Most similar articles:
Guy Fawkes
 and Gunpowder Plot
 cos: 0.605628311185
Plot of Guy Fawkes



Twelve years after the events described in Dune (1965), Paul "Muad'Dib" Atreides rules as Emperor.
By accepting the role of messiah to the Fremen, Paul had unleashed a jihad which conquered most of the known universe.
While Paul is the most powerful emperor ever known, he is powerless to stop the lethal excesses of the religious juggernaut he has created.
Although 61 billion people have perished, Paul's prescient visions indicate that this is far from the worst possible outcome for humanity.
Motivated by this knowledge, Paul hopes to set humanity on a course that will not inevitably lead to stagnation and destruction, while at the same time acting as ruler of the empire and focal point of the Fremen religion.
The Bene Gesserit, Spacing Guild, and Tleilaxu enter into a conspiracy to dethrone Paul, and the Bene Gesserit Reverend Mother Gaius Helen Mohiam enlists Paul's own consort Pri

In [213]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wc = WordCloud(background_color='white', max_words=30, stopwords=tfidf.get_stop_words())
wc_ = WordCloud(background_color='white', max_words=30, stopwords=tfidf.get_stop_words())

wc.generate(corpus[most_similar_articles_indexes[0]])
wc_.generate(corpus[most_similar_articles_indexes[1]])


fig = plt.figure()
a = fig.add_subplot(1,2,1)
imgplot = plt.imshow(wc)
plt.title(titles[most_similar_articles_indexes[0]])
plt.axis("off")

a = fig.add_subplot(1,2,2)
imgplot = plt.imshow(wc_)
plt.title(titles[most_similar_articles_indexes[1]])
plt.axis("off")

plt.show()

In [202]:
import seaborn as sns
%matplotlib

sns.heatmap(df)

Using matplotlib backend: TkAgg


<matplotlib.axes._subplots.AxesSubplot at 0x117e19da0>