# PART 4 - COSINE SIMILARITY
This notebook finds the cosine similarity between a sample excerpt and the books in our corpus. The logic of this notebook is what is used in the flask app.

Key point of difference: this notebook uses an excerpt stored as a file in the ../data/samples/ folder, while the Flask app picks up the excerpt from the text area in the web browser.

<font color ='blue'> runs fairly fast.. </font>


In [31]:
from __future__ import print_function
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill

from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [32]:
# Run configuration. The string settings below are concatenated into the file
# names of the pickled vectorizer and topic model loaded later, so each value
# has to correspond to an artifact stored under ../data/vectors/.
# RSEED is defined here but is not referenced anywhere else in this notebook.
RSEED = 0
bow = 'tf' # bag-of-words weighting; options: tf,tfid
stem_type = 'lemma' # word normalisation; options: snow, lemma
n_gram = '1gm' # n-gram setting; options: 1gm or 2gm
topic_model = 'lda' # topic model; options: lda, nmf

In [33]:
# Path of the sample excerpt to match against the corpus. It is handed to the
# vectorizer and opened again by the sentiment / word-count cell.
file_link = '../data/samples/isacc.txt'

### 1. PIPELINE 1 - vectorize and topic model 

In [34]:
# Load the pickled vectorizer selected by the config cell (bow / stem_type / n_gram).
# NOTE: dill, like pickle, can execute arbitrary code while loading - only use
# artifact files that come from a trusted source.
vectorizer_path = '../data/vectors/'+bow+'_vectorizer_'+ stem_type + '_' + str(n_gram)
with open(vectorizer_path, 'rb') as vectorizer_file:  # handle is closed once loaded
    vectorizer = dill.load(vectorizer_file)

# Lemmatizer / stemmer instances kept as module-level names; presumably the
# unpickled vectorizer's tokenizer resolves them at transform time -
# TODO confirm before removing either of them.
wordnet_lemmatizer = WordNetLemmatizer()
snow = SnowballStemmer('english')



In [35]:
# Vectorize the excerpt. Note that the file *path* is passed, not the file's
# text; this presumably relies on the pickled vectorizer having been created
# with input='filename' so that scikit-learn opens the file itself - TODO confirm.
vector = vectorizer.transform([file_link])

In [36]:
# Load the fitted topic model chosen by the config cell and project the
# excerpt's term vector into topic space.
# NOTE: dill, like pickle, can execute arbitrary code while loading - only use
# artifact files that come from a trusted source.
model_path = '../data/vectors/'+ topic_model +'_'+ stem_type + '_' + n_gram
with open(model_path, 'rb') as model_file:  # handle is closed once loaded
    model = dill.load(model_file)
topic_vector = model.transform(vector)

In [37]:
# place in panda for easy manipulation
df_excerpt_a = pd.DataFrame(topic_vector, columns=['topic_'+ str(i)for i in range(1,21)])
df_excerpt_a

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,0.000172,0.000172,0.08393,0.127902,0.000172,0.000172,0.000172,0.000172,0.336561,0.000172,0.223055,0.000172,0.000172,0.163666,0.062472,0.000172,0.000172,0.000172,0.000172,0.000172


### 2. PIPELINE 2 - sentiment analysis and word counts

In [38]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize

# Read the excerpt inside a context manager so the file handle is closed as
# soon as the text has been read.
with open(file_link) as excerpt_file:
    t = excerpt_file.read()

# TextBlob supplies the word / sentence split and the sentiment scores.
book_excerpt = TextBlob(t)
word_count = len(book_excerpt.words)
sentence_count = len(book_excerpt.sentences)
avg_len = word_count/sentence_count  # average number of words per sentence

# One row in the order the statistics frame expects:
# word count, sentence count, sentence length, polarity, subjectivity.
sentiment_excerpt = [[word_count,sentence_count,avg_len,book_excerpt.sentiment[0],book_excerpt.sentiment[1]]]

In [39]:
# Show the raw statistics row: word count, sentence count, average sentence
# length, polarity, subjectivity.
sentiment_excerpt

[[1027, 55, 18.672727272727272, 0.04467500734861846, 0.41442166372721917]]

In [40]:
# Single-row frame holding the excerpt's length and sentiment statistics.
stat_columns = ['word_count', 'sentence_count', 'sentence_length', 'polarity', 'subjectivity']
df_excerpt_b = pd.DataFrame(sentiment_excerpt, columns=stat_columns)

In [41]:
# Display the excerpt statistics frame.
df_excerpt_b

Unnamed: 0,word_count,sentence_count,sentence_length,polarity,subjectivity
0,1027,55,18.672727,0.044675,0.414422


### 3. Pipeline 3 - join and transform 

In [42]:
# Put the topic weights and the text statistics side by side in a single row.
excerpt_parts = [df_excerpt_a, df_excerpt_b]
df_excerpt = pd.concat(excerpt_parts, axis='columns')

In [43]:
# Inspect the combined single-row feature frame before any transformation.
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,0.000172,0.000172,0.08393,0.127902,0.000172,0.000172,0.000172,0.000172,0.336561,0.000172,...,0.000172,0.000172,0.000172,0.000172,0.000172,1027,55,18.672727,0.044675,0.414422


In [44]:
# transform
from sklearn import preprocessing

# NOTE: this cell rewrites df_excerpt in place, so running it a second time
# applies the log transforms twice. Rebuild df_excerpt first if it must be re-run.

# log transform the counts
# Whole columns are reassigned (df[cols] = ...) rather than written through
# .loc[:, cols]: word_count and sentence_count hold integers, and writing float
# logs into integer columns via .loc triggers pandas' incompatible-dtype
# upcast warning in recent versions.
column_names_to_log_1 = ['word_count', 'sentence_count', 'sentence_length']
df_excerpt[column_names_to_log_1] = df_excerpt[column_names_to_log_1].apply(np.log)

# normalize subjectivity and polarity
column_names_to_normalize = ['subjectivity', 'polarity']

# load the scaler
# NOTE: dill, like pickle, can execute arbitrary code while loading - only use
# artifact files that come from a trusted source.
with open('../data/vectors/scaler', 'rb') as scaler_file:  # closed once loaded
    min_max_scaler = dill.load(scaler_file)

x = df_excerpt[column_names_to_normalize].values
x_scaled = min_max_scaler.transform(x) # only transform, never re-fit on the single excerpt
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df_excerpt.index)
df_excerpt[column_names_to_normalize] = df_temp

# log transform topics
# Topic columns are found by their 'topic_' prefix so the cell does not depend
# on there being exactly 20 topics.
topic_columns = [name for name in df_excerpt.columns if str(name).startswith('topic_')]
df_excerpt[topic_columns] = df_excerpt[topic_columns].apply(np.log)

In [45]:
# Inspect the transformed features (logged counts and topics, scaled sentiment).
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,-8.665613,-8.665613,-2.477773,-2.05649,-8.665613,-8.665613,-8.665613,-8.665613,-1.088975,-8.665613,...,-8.665613,-8.665613,-8.665613,-8.665613,-8.665613,6.934397,4.007333,2.927064,0.330301,0.257028


### 4. LOAD CORPUS

In [46]:
# load the corpus vector from final_full
corpus = pd.read_csv('../data/final_full.csv')
corpus = corpus.drop(columns ='Unnamed: 0')
corpus.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,-12.002609,-12.002609,-12.002609,-3.502501,-6.075743,-12.002609,-1.857118,...,-2.071984,-12.002609,-2.278422,-12.002609,-1.830291,10.312314,7.71913,2.593184,0.463344,0.43898
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,-12.689609,-3.643232,-12.689609,-12.689609,-12.689609,-12.689609,-3.300271,...,-2.972156,-1.706956,-12.689609,-12.689609,-2.023786,10.936636,8.395477,2.541159,0.481264,0.534175
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,-7.94893,-3.740827,-2.716502,-2.234353,-12.31824,-3.017201,-1.583218,...,-2.269965,-2.922566,-12.31824,-12.31824,-1.854198,10.587266,8.02027,2.566996,0.506857,0.48805
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,-11.631952,-11.631952,-0.835847,-2.470491,-2.973906,-11.631952,-3.732235,...,-11.631952,-11.631952,-11.631952,-11.631952,-5.846835,9.817221,7.224753,2.592468,0.39793,0.594432
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,-11.630886,-3.383129,-5.526673,-2.738946,-11.630886,-11.630886,-0.65501,...,-4.042037,-4.050035,-3.668777,-4.354673,-2.294989,9.96086,6.907755,3.053105,0.40481,0.465239


In [47]:
# Word and sentence counts only reflect how much text the user supplied, so
# they carry no meaning for an excerpt and are removed from both frames
# before similarity is computed.
drop_cols = ['word_count', 'sentence_count']

corpus = corpus.drop(drop_cols, axis='columns')
df_excerpt = df_excerpt.drop(drop_cols, axis='columns')

In [48]:
# Sanity check: report the dimensions of the corpus and excerpt frames.
for frame in (corpus, df_excerpt):
    print(frame.shape)

(3035, 26)
(1, 23)


In [49]:
# shape em up to numpy arrays
given_excerpt = np.array(df_excerpt)

# Select the corpus features by name, in the excerpt's own column order, rather
# than by position (iloc[:, 3:]). A positional slice would silently compare
# mismatched features if the column layout of either frame changed; selecting
# by name raises KeyError when a feature is missing from the corpus.
feature_columns = list(df_excerpt.columns)
search_in = np.array(corpus[feature_columns])

In [50]:
# Sanity check: both arrays must have the same number of feature columns.
for feature_array in (given_excerpt, search_in):
    print(feature_array.shape)

(1, 23)
(3035, 23)


In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every corpus row against the single excerpt row.
results = cosine_similarity(X=search_in, Y=given_excerpt)

In [52]:
# Rank the corpus rows by their similarity score (column 0), best first,
# and keep the five closest matches.
similarity_scores = pd.DataFrame(results)
ranked_scores = similarity_scores.sort_values(by=0, ascending=False)
show_me = ranked_scores.head(5)
show_me

Unnamed: 0,0
2930,0.963346
1954,0.95289
198,0.948199
2497,0.946536
301,0.939464


In [53]:
# Look up book title and author (the first two corpus columns) for the
# top-ranked rows; the frame index of show_me gives their row positions.
top_rows = list(show_me.index)
title = corpus.iloc[top_rows, 0:2].values
print(title)

# The first row is the best match.
best_title, best_author = title[0][0], title[0][1]
similar_book = best_title + ' by ' + best_author
print(similar_book)

[["Sentence Deferred, Sailor's Knots, Part 4" 'William Wymark Jacobs']
 ['A Spirit of Avarice, Odd Craft, Part 11' 'William Wymark Jacobs']
 ["The Boatswain's Mate, Captains All, Book 2" 'William Wymark Jacobs']
 ['Mr. Spaceship' 'Philip Kindred Dick']
 ['The Chemical History Of A Candle' 'Michael Faraday']]
Sentence Deferred, Sailor's Knots, Part 4 by William Wymark Jacobs


50 shades of gray<br>
Anna Christie by Eugene O'Neill<br>
Anna Christie is the story of a former prostitute who falls in love, but runs into difficulty in turning her life around.

The Blue Djinn of Babylon<br>
The Parasite	Sir Arthur Conan Doyle	<br>
The Parasite makes use of a form of mind control similar to the mesmerism of the Victorian era; it works on some hosts but not others.


### End of notebooks -> check out app.py