In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import numpy as np

In [2]:
# list_s = ["This concept of distance is not restricted to two dimensions.", "This concept of distance is not restricted to two dimensions."]
corpus = [
        'This concept of distance is not restricted to two dimension.',
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?',
        'It is not difficult to imagine the figure above translated into three dimensions.',
        'We can persuade ourselves that the measure of distance extends to an arbitrary number of dimensions;',
        ]

In [3]:
stemmer = PorterStemmer()

def stem(text):
    text_stem = [stemmer.stem(token) for token in text.split(' ')]
    text_stem_join = ' '.join(text_stem)
    return text_stem_join

corpus_stem = list(map(stem, corpus))

## TODO - Show the tfidf.get_stop_words()

In [20]:
tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False, stop_words='english')

X = tfidf.fit_transform(corpus_stem)
print(tfidf.get_feature_names())
print(X.toarray())

['abov', 'arbitrari', 'concept', 'difficult', 'dimension', 'dimensions', 'distanc', 'document', 'extend', 'figur', 'imagin', 'measur', 'number', 'ourselv', 'persuad', 'restrict', 'second', 'thi', 'translat']
[[ 0.          0.          0.50865318  0.          0.50865318  0.
   0.38897149  0.          0.          0.          0.          0.          0.
   0.          0.          0.50865318  0.          0.26928979  0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.7640961   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.64510243  0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.29006559  0.          0.          0.          0.          0.          0.
   0.          0.          0.92514281  0.2448933   0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.    

In [21]:
print('cosine distance 0 -> 1: ', cosine(X[0].toarray(), X[1].toarray()))
print('cosine distance 0 -> 2: ', cosine(X[0].toarray(), X[2].toarray()))
print('cosine similarity 0 -> 1: ', cosine_similarity(X[0].toarray(), X[1].toarray()))
print('cosine similarity 0 -> 2: ', cosine_similarity(X[0].toarray(), X[2].toarray()))
print('cosine similarity 0 -> 3: ', cosine_similarity(X[0].toarray(), X[3].toarray()))
print('cosine similarity 0 -> 4: ', cosine_similarity(X[0].toarray(), X[4].toarray()))
print('cosine similarity 0 -> 5: ', cosine_similarity(X[0].toarray(), X[5].toarray()))
print('cosine similarity 0 -> 6: ', cosine_similarity(X[0].toarray(), X[6].toarray()))
# print(X)

cosine distance 0 -> 1:  0.826280498813
cosine distance 0 -> 2:  0.934052732742
cosine similarity 0 -> 1:  [[ 0.1737195]]
cosine similarity 0 -> 2:  [[ 0.06594727]]
cosine similarity 0 -> 3:  [[ 0.]]
cosine similarity 0 -> 4:  [[ 0.1737195]]
cosine similarity 0 -> 5:  [[ 0.]]
cosine similarity 0 -> 6:  [[ 0.11108811]]


# TODO - Create a sparse matrix of similarities between every article

# TODO - Plot the sparse matrix of similarities using a heatmap

# TODO - Create a wordcloud of the feature_names

In [93]:
dataset_dir = 'dataset/'
filename = 'story_plots_AA.txt'
separator = '<EOS>'

with open(dataset_dir + filename, 'r') as file:
    corpus = file.readlines()
    corpus = corpus[:-1]
    corpus = ''.join(corpus)
    corpus = corpus.split("<EOS>")


corpus_stem = list(map(stem, corpus))
tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=False, stop_words='english')

X = tfidf.fit_transform(corpus_stem)
# print(tfidf.get_feature_names())
# print(X.toarray())

N = len(X.toarray())
a = np.zeros(shape=(N,N))
for i in range(N):
    for j in range(N):
        a[i][j] = cosine_similarity(X[i].toarray(), X[j].toarray())[0][0]
        
df = pd.DataFrame(data=a)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,1.0,0.01462,0.03213,0.032563,0.017297,0.01507,0.035305,0.016205,0.019628,0.019549,...,0.019277,0.011591,0.019662,0.015654,0.024832,0.018403,0.019891,0.019999,0.011042,0.024427
1,0.01462,1.0,0.029778,0.030944,0.02072,0.022432,0.036014,0.0182,0.019568,0.023523,...,0.034594,0.015246,0.020325,0.016751,0.023272,0.016368,0.018685,0.018161,0.020093,0.035046
2,0.03213,0.029778,1.0,0.066226,0.036943,0.031403,0.065506,0.028701,0.039239,0.030849,...,0.03317,0.018685,0.032358,0.025155,0.038645,0.024923,0.028372,0.035315,0.024307,0.040421
3,0.032563,0.030944,0.066226,1.0,0.044317,0.039795,0.11338,0.072492,0.058612,0.03995,...,0.052293,0.028155,0.053332,0.038874,0.055759,0.049453,0.03797,0.042873,0.058735,0.054051
4,0.017297,0.02072,0.036943,0.044317,1.0,0.01594,0.047575,0.02291,0.050548,0.036349,...,0.034595,0.013046,0.037634,0.031151,0.025615,0.021572,0.026737,0.029003,0.012626,0.030277
5,0.01507,0.022432,0.031403,0.039795,0.01594,1.0,0.041069,0.023001,0.023062,0.027489,...,0.053923,0.044018,0.023832,0.027862,0.031136,0.01421,0.039855,0.028813,0.016181,0.034415
6,0.035305,0.036014,0.065506,0.11338,0.047575,0.041069,1.0,0.042164,0.054645,0.06506,...,0.075082,0.03454,0.066066,0.034369,0.058226,0.050295,0.051479,0.094498,0.028591,0.05301
7,0.016205,0.0182,0.028701,0.072492,0.02291,0.023001,0.042164,1.0,0.028098,0.01756,...,0.033278,0.021962,0.027428,0.06328,0.038752,0.023998,0.01894,0.019808,0.048559,0.021329
8,0.019628,0.019568,0.039239,0.058612,0.050548,0.023062,0.054645,0.028098,1.0,0.041815,...,0.036695,0.021657,0.034741,0.021248,0.038229,0.027615,0.029964,0.034603,0.009737,0.028781
9,0.019549,0.023523,0.030849,0.03995,0.036349,0.027489,0.06506,0.01756,0.041815,1.0,...,0.045642,0.014664,0.026214,0.024857,0.027257,0.023456,0.030429,0.030202,0.009839,0.033818
