In [78]:
from __future__ import print_function
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill

from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [79]:
# Pipeline configuration: the string values below are spliced into the artifact
# file names that later cells load from ../data/vectors/.
topic_model = 'nmf'   # topic model artifact: 'lda' or 'nmf'
n_gram = '1gm'        # n-gram variant: '1gm' or '2gm'
stem_type = 'lemma'   # stemming variant: 'snow' or 'lemma'
bow = 'tfid'          # vectorizer variant: 'tf' or 'tfid'
RSEED = 0             # seed constant; not referenced by any visible cell

In [80]:
# Path of the text excerpt that both pipelines below consume.
file_link = '../data/samples/fifty_shades.txt'

### 1. PIPELINE 1 - vectorize and topic model 

In [81]:
# Load the fitted vectorizer selected by the config strings (bow / stem_type / n_gram).
# NOTE(review): dill.load can execute arbitrary code while unpickling -- only load
# artifacts that this project produced itself.
vectorizer_path = '../data/vectors/' + bow + '_vectorizer_' + stem_type + '_' + str(n_gram)
with open(vectorizer_path, 'rb') as vectorizer_file:  # handle is closed once loading finishes
    vectorizer = dill.load(vectorizer_file)

# Not used directly in the visible cells; presumably the pickled vectorizer's tokenizer
# looks these objects up by name at transform time -- TODO confirm before removing.
wordnet_lemmatizer = WordNetLemmatizer()
snow = SnowballStemmer('english')

In [82]:
# NOTE(review): the vectorizer is given the file *path*, not the file's text. That is only
# correct if the pickled vectorizer was built with input='filename' -- TODO confirm.
vector = vectorizer.transform([file_link])

In [83]:
# Load the fitted topic model chosen by the config strings, then project the
# vectorized excerpt onto its topics.
# NOTE(review): dill.load can execute arbitrary code -- only load trusted artifacts.
model_path = '../data/vectors/' + topic_model + '_' + stem_type + '_' + n_gram
with open(model_path, 'rb') as model_file:  # handle is closed once loading finishes
    model = dill.load(model_file)
topic_vector = model.transform(vector)

In [84]:
# Put the topic weights in a DataFrame for easy manipulation.
# Column names are derived from the width of the model output instead of a hard-coded 15,
# so a model fitted with a different number of topics does not fail with a length mismatch.
n_topics = topic_vector.shape[1]
df_excerpt_a = pd.DataFrame(
    topic_vector,
    columns=['topic_' + str(i) for i in range(1, n_topics + 1)],
)
df_excerpt_a

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,0.029896,0.001828,0.003948,0.003178,0.0,0.001237,0.0,0.000559,0.0,0.010945,0.0,0.0,0.00205,0.0,0.005529


### 2. PIPELINE 2 - sentiment analysis and word counts

In [85]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize

# Read the excerpt inside a context manager so the file handle is closed
# (previously it stayed open for the life of the kernel).
with open(file_link) as file:
    t = file.read()

book_excerpt = TextBlob(t)
word_count = len(book_excerpt.words)
sentence_count = len(book_excerpt.sentences)
avg_len = word_count / sentence_count  # raises ZeroDivisionError when no sentence is found

# Read the sentiment once and use its named fields rather than positional indexes.
sentiment = book_excerpt.sentiment

# One feature row: [word_count, sentence_count, avg sentence length, polarity, subjectivity]
sentiment_excerpt = [[word_count, sentence_count, avg_len,
                      sentiment.polarity, sentiment.subjectivity]]

In [86]:
# Show the raw feature row: [word_count, sentence_count, avg_len, polarity, subjectivity].
sentiment_excerpt

[[590, 47, 12.553191489361701, 0.007202380952380952, 0.4518614718614719]]

In [87]:
# Wrap the single feature row in a DataFrame with named columns.
sentiment_columns = ['word_count', 'sentence_count', 'sentence_length', 'polarity', 'subjectivity']
df_excerpt_b = pd.DataFrame(sentiment_excerpt, columns=sentiment_columns)

In [88]:
# Show the length / sentiment feature frame built in the previous cell.
df_excerpt_b

Unnamed: 0,word_count,sentence_count,sentence_length,polarity,subjectivity
0,590,47,12.553191,0.007202,0.451861


In [89]:
# join the topic features and the length/sentiment features column-wise

In [90]:
# Put the topic weights and the length/sentiment features side by side in a single row.
df_excerpt = pd.concat((df_excerpt_a, df_excerpt_b), axis='columns')

In [91]:
# Show the combined (still untransformed) feature row.
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,word_count,sentence_count,sentence_length,polarity,subjectivity
0,0.029896,0.001828,0.003948,0.003178,0.0,0.001237,0.0,0.000559,0.0,0.010945,0.0,0.0,0.00205,0.0,0.005529,590,47,12.553191,0.007202,0.451861


In [92]:
# transform the excerpt features in place
# (presumably mirrors the preprocessing applied to final_train.csv -- TODO confirm)
# NOTE: this cell is not idempotent -- running it twice applies log/exp twice. Re-run the
# pd.concat cell that builds df_excerpt before re-running this one.
from sklearn import preprocessing



# log transform the counts
column_names_to_log_1 = ['word_count', 'sentence_count', 'sentence_length']

# Assign whole columns rather than `.loc[:, cols] = ...`: the count columns hold integers,
# and writing floats into them through .loc relies on silent upcasting, which newer
# pandas versions deprecate.
df_excerpt[column_names_to_log_1] = df_excerpt[column_names_to_log_1].apply(np.log)


# normalize subjectivity and polarity
column_names_to_normalize = ['subjectivity', 'polarity']

# load the previously fitted scaler; the context manager closes the file handle
# NOTE(review): dill.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/vectors/scaler', 'rb') as scaler_file:
    min_max_scaler = dill.load(scaler_file)

x = df_excerpt[column_names_to_normalize].values
x_scaled = min_max_scaler.transform(x)  # only transform -- never re-fit on the excerpt
df_excerpt[column_names_to_normalize] = x_scaled

# exponentiate the topic weights (np.exp, not a log transform)
topic_columns = [name for name in df_excerpt.columns if name.startswith('topic_')]
df_excerpt[topic_columns] = df_excerpt[topic_columns].apply(np.exp)

In [93]:
# Show the feature row after the log / scaling / exp transforms of the previous cell.
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,word_count,sentence_count,sentence_length,polarity,subjectivity
0,1.030348,1.001829,1.003956,1.003183,1.0,1.001237,1.0,1.000559,1.0,1.011005,1.0,1.0,1.002052,1.0,1.005544,6.380123,3.850148,2.529975,0.227907,0.393448


In [94]:
# Read the training-corpus feature table and discard its 'Unnamed: 0' column
# (presumably an index that was written out with the CSV).
corpus = (
    pd.read_csv('../data/final_train.csv')
    .drop(columns='Unnamed: 0')
)
corpus.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_11,topic_12,topic_13,topic_14,topic_15,word_count,sentence_count,sentence_length,polarity,subjectivity
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,1.050991,1.0,1.013402,1.031533,1.098676,1.000042,1.0,...,1.0,1.0,1.024405,1.016348,1.037875,10.312314,7.71913,2.593184,0.463344,0.43898
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,1.002099,1.0,1.008128,1.075021,1.031305,1.0,1.0,...,1.0,1.0,1.280568,1.000047,1.0,10.936636,8.395477,2.541159,0.481264,0.534175
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,1.060401,1.0,1.011625,1.012453,1.114378,1.0,1.014051,...,1.0,1.004718,1.016062,1.0,1.0,10.587266,8.02027,2.566996,0.506857,0.48805
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,1.048025,1.000477,1.0,1.021319,1.0,1.0,1.003149,...,1.0,1.023535,1.0,1.0,1.008584,9.817221,7.224753,2.592468,0.39793,0.594432
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,1.056361,1.001288,1.02447,1.011103,1.13393,1.0,1.008089,...,1.030547,1.0,1.0,1.0,1.005419,9.96086,6.907755,3.053105,0.40481,0.465239


In [95]:
# Remove the word_count and sentence_count columns from both frames before
# measuring similarity.
drop_cols = ['word_count', 'sentence_count']
df_excerpt = df_excerpt.drop(drop_cols, axis=1)
corpus = corpus.drop(drop_cols, axis=1)

In [96]:
# Report (rows, columns) of the corpus frame, then of the excerpt frame.
for frame in (corpus, df_excerpt):
    print(frame.shape)

(2428, 21)
(1, 18)


In [97]:
# Convert both feature sets to NumPy arrays for the similarity computation.
given_excerpt = np.array(df_excerpt)
# Select the corpus features by the excerpt's column names (in the excerpt's order) instead of
# by position, so the two arrays are guaranteed to be column-aligned. A column missing from
# the corpus now raises KeyError rather than silently shifting features.
search_in = np.array(corpus[list(df_excerpt.columns)])

In [98]:
# Report the array shapes: excerpt first, then the corpus matrix.
for feature_array in (given_excerpt, search_in):
    print(feature_array.shape)

(1, 18)
(2428, 18)


In [99]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every corpus row against the excerpt row.
results = cosine_similarity(X=search_in, Y=given_excerpt)

In [100]:
# Rank the corpus rows by similarity (column 0), highest first, and keep the top one.
similarity_scores = pd.DataFrame(results)
ranked_scores = similarity_scores.sort_values(by=0, ascending=False)
show_me = ranked_scores.head(1)
show_me

Unnamed: 0,0
1793,0.999788


In [101]:
# show_me's index holds the row position of the best match; the first two corpus
# columns are read positionally and joined as "<first> by <second>".
title = corpus.iloc[list(show_me.index), 0:2].values
best_title, best_author = title[0]
similar_book = best_title + ' by ' + best_author
similar_book

'The Well, The Lady of the Barge and Others, Part 4 by William Wymark Jacobs'

Fifty Shades of Grey<br>
Anna Christie by Eugene O'Neill<br>
Anna Christie is the story of a former prostitute who falls in love, but runs into difficulty in turning her life around.

The Blue Djinn of Babylon<br>
The Parasite by Sir Arthur Conan Doyle<br>
The Parasite makes use of a form of mind control similar to the mesmerism of the Victorian era; it works on some hosts but not others.
