In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import numpy as np

In [2]:
# list_s = ["This concept of distance is not restricted to two dimensions.", "This concept of distance is not restricted to two dimensions."]
corpus = [
        'This concept of distance is not restricted to two dimension.',
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?',
        'It is not difficult to imagine the figure above translated into three dimensions.',
        'We can persuade ourselves that the measure of distance extends to an arbitrary number of dimensions;',
        ]

In [3]:
stemmer = PorterStemmer()

def stem(text):
    text_stem = [stemmer.stem(token) for token in text.split(' ')]
    text_stem_join = ' '.join(text_stem)
    return text_stem_join

corpus_stem = list(map(stem, corpus))

## TODO - Show the tfidf.get_stop_words()

In [4]:
tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False, stop_words='english')

X = tfidf.fit_transform(corpus_stem)
print(tfidf.get_feature_names())
print(X.toarray())

['abov', 'arbitrari', 'concept', 'difficult', 'dimension', 'dimensions', 'distanc', 'document', 'extend', 'figur', 'imagin', 'measur', 'number', 'ourselv', 'persuad', 'restrict', 'second', 'thi', 'translat']
[[ 0.          0.          0.50865318  0.          0.50865318  0.
   0.38897149  0.          0.          0.          0.          0.          0.
   0.          0.          0.50865318  0.          0.26928979  0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.7640961   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.64510243  0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.29006559  0.          0.          0.          0.          0.          0.
   0.          0.          0.92514281  0.2448933   0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.    

In [5]:
print('cosine distance 0 -> 1: ', cosine(X[0].toarray(), X[1].toarray()))
print('cosine distance 0 -> 2: ', cosine(X[0].toarray(), X[2].toarray()))
print('cosine similarity 0 -> 1: ', cosine_similarity(X[0].toarray(), X[1].toarray()))
print('cosine similarity 0 -> 2: ', cosine_similarity(X[0].toarray(), X[2].toarray()))
print('cosine similarity 0 -> 3: ', cosine_similarity(X[0].toarray(), X[3].toarray()))
print('cosine similarity 0 -> 4: ', cosine_similarity(X[0].toarray(), X[4].toarray()))
print('cosine similarity 0 -> 5: ', cosine_similarity(X[0].toarray(), X[5].toarray()))
print('cosine similarity 0 -> 6: ', cosine_similarity(X[0].toarray(), X[6].toarray()))
# print(X)

cosine distance 0 -> 1:  0.826280498813
cosine distance 0 -> 2:  0.934052732742
cosine similarity 0 -> 1:  [[ 0.1737195]]
cosine similarity 0 -> 2:  [[ 0.06594727]]
cosine similarity 0 -> 3:  [[ 0.]]
cosine similarity 0 -> 4:  [[ 0.1737195]]
cosine similarity 0 -> 5:  [[ 0.]]
cosine similarity 0 -> 6:  [[ 0.11108811]]


# TODO - Create a sparse matrix of similarities between every article

# TODO - Plot the sparse matrix of similarities using a heatmap

# TODO - Create a wordcloud of the feature_names

In [6]:
dataset_dir = 'dataset/'
plots_filename = 'story_plots_AA.txt'
titles_filename = 'story_titles_AA.txt'
separator = '<EOS>'

with open(dataset_dir + plots_filename, 'r') as file:
    corpus = file.readlines()
    corpus = corpus[:-1]
    corpus = ''.join(corpus)
    corpus = corpus.split(separator)

with open(dataset_dir + titles_filename, 'r') as file:
    titles = file.readlines()

In [7]:
corpus_stem = list(map(stem, corpus))
tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False, stop_words='english')

X = tfidf.fit_transform(corpus_stem)
# print(tfidf.get_feature_names())
# print(X.toarray())

N = len(X.toarray())
a = np.zeros(shape=(N,N))
for i in range(N):
    for j in range(N):
        a[i][j] = cosine_similarity(X[i].toarray(), X[j].toarray())[0][0]
        
# df = pd.DataFrame(data=a, columns=titles, index=titles)
df = pd.DataFrame(data=a)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,1.0,0.01462,0.03213,0.032563,0.017297,0.01507,0.035305,0.016205,0.019628,0.019549,...,0.019277,0.011591,0.019662,0.015654,0.024832,0.018403,0.019891,0.019999,0.011042,0.024427
1,0.01462,1.0,0.029778,0.030944,0.02072,0.022432,0.036014,0.0182,0.019568,0.023523,...,0.034594,0.015246,0.020325,0.016751,0.023272,0.016368,0.018685,0.018161,0.020093,0.035046
2,0.03213,0.029778,1.0,0.066226,0.036943,0.031403,0.065506,0.028701,0.039239,0.030849,...,0.03317,0.018685,0.032358,0.025155,0.038645,0.024923,0.028372,0.035315,0.024307,0.040421
3,0.032563,0.030944,0.066226,1.0,0.044317,0.039795,0.11338,0.072492,0.058612,0.03995,...,0.052293,0.028155,0.053332,0.038874,0.055759,0.049453,0.03797,0.042873,0.058735,0.054051
4,0.017297,0.02072,0.036943,0.044317,1.0,0.01594,0.047575,0.02291,0.050548,0.036349,...,0.034595,0.013046,0.037634,0.031151,0.025615,0.021572,0.026737,0.029003,0.012626,0.030277


In [8]:
# %%bash
# pip install nltk
# pip install seaborn

In [9]:
df[df == df.max()] = 0 # 1's become 0's
most_similar = np.array(df.apply(lambda x: df.columns[x.argmax()], axis = 1)) # column name of the highest value

In [10]:
for i in range(len(most_similar)):
    print('Most similar article to', titles[i], 'is:', titles[most_similar[i]])

Most similar article to Citizen Kane
 is: All Quiet on the Western Front

Most similar article to Animal Farm
 is: Batman (1966 film)

Most similar article to A Clockwork Orange (novel)
 is: The Big O

Most similar article to The Plague
 is: All Quiet on the Western Front

Most similar article to Actaeon
 is: A Funny Thing Happened on the Way to the Forum

Most similar article to A Fire Upon the Deep
 is: Babylon 5

Most similar article to All Quiet on the Western Front
 is: The Plague

Most similar article to Anyone Can Whistle
 is: The Plague

Most similar article to A Funny Thing Happened on the Way to the Forum
 is: The Plague

Most similar article to Army of Darkness
 is: All Quiet on the Western Front

Most similar article to The Birth of a Nation
 is: All Quiet on the Western Front

Most similar article to Blade Runner
 is: Blade Runner 2: The Edge of Human

Most similar article to Blazing Saddles
 is: A Clockwork Orange (novel)

Most similar article to Blue Velvet (film)
 is: B

In [11]:
print('Most similar articles:', titles[most_similar.argmax()], 'and', titles[most_similar.max()])
print('Plot of', titles[most_similar.argmax()])
print()
print(corpus[most_similar.argmax()])
print()
print()
print('Plot of', titles[most_similar.max()])
print()
print(corpus[most_similar.max()])

Most similar articles: Chapterhouse: Dune
 and Vladimir Harkonnen

Plot of Chapterhouse: Dune



The Bene Gesserit still find themselves questioning the Golden Path of humanity set by the God Emperor Leto II.
Now they must survive the Honored Matres, whose reckless conquest of the Old Empire threatens Bene Gesserit survival.
The Sisters must reassess their timeless methods: does ultimate survival go beyond calculated manipulation.
Is there greater purpose to life than consolidating power.
The situation is desperate for the Bene Gesserit as they find themselves the targets of the Honored Matres, whose conquest of the Old Empire is almost complete.
The Matres are seeking to assimilate the technology and developed methods of the Bene Gesserit and exterminate the Sisterhood itself.
Now in command of the Bene Gesserit, Mother Superior Darwi Odrade continues to develop her drastic, secret plan to overcome the Honored Matres.
The Bene Gesserit are also terraforming the planet Chapterhouse to 

In [13]:
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

ngram_vectorizer = CountVectorizer(analyzer='word', tokenizer=word_tokenize, ngram_range=(1, 1), min_df=1)
X = ngram_vectorizer.fit_transform(corpus[most_similar.max()])
vocab = list(ngram_vectorizer.get_feature_names())
counts = X.sum(axis=0).A1
freq_distribution = Counter(dict(zip(vocab, counts)))
print(freq_distribution.most_common(10))

LookupError: 
**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/bmartins/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************

In [None]:
import seaborn as sns
%matplotlib

sns.heatmap(df)