In [1]:
from __future__ import print_function
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# LDA
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF


# bring in my pickled vectorizers
import pickle
import dill

from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Experiment configuration.
# bow, stem_type and n_gram are joined into the file name of the pickled
# vectorizer; topic_model, stem_type and n_gram into the file name of the
# pickled topic model.
RSEED = 0  # presumably a random seed; not referenced in the cells shown here
bow = "tf"  # tf,tfid
stem_type = "lemma"  # snow, lemma
n_gram = "1gm"  # 1gm or 2gm
topic_model = "lda"  # lda, nmf

In [3]:
# Path of the plain-text excerpt for which a similar book is looked up.
file_link = "../data/samples/the_blue_djinn_of_babylon.txt"

### 1. PIPELINE 1 - vectorize and topic model 

In [4]:
# Load the fitted vectorizer that matches the configuration cell.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only load artefacts that were produced by this project.
vectorizer_path = '../data/vectors/' + bow + '_vectorizer_' + stem_type + '_' + str(n_gram)
with open(vectorizer_path, 'rb') as vectorizer_file:  # handle is closed once loaded
    vectorizer = dill.load(vectorizer_file)

# Lemmatizer / stemmer instances.
# NOTE(review): no cell shown here calls these directly; presumably the
# unpickled vectorizer's tokenizer looks them up by name -- confirm before removing.
wordnet_lemmatizer = WordNetLemmatizer()
snow = SnowballStemmer('english')



In [5]:
# Vectorize the excerpt. Note that the path string itself is what gets passed in.
# NOTE(review): this yields the excerpt's term vector only if the pickled
# vectorizer was built with input='filename'; otherwise the text of the path
# would be vectorized instead -- confirm against the notebook that fitted it.
vector = vectorizer.transform([file_link])

In [6]:
# Load the fitted topic model for the chosen configuration and project the
# excerpt's term vector onto its topics.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only load artefacts that were produced by this project.
model_path = '../data/vectors/' + topic_model + '_' + stem_type + '_' + n_gram
with open(model_path, 'rb') as model_file:  # handle is closed once loaded
    model = dill.load(model_file)
topic_vector = model.transform(vector)



In [7]:
# Place the topic weights in a DataFrame for easy manipulation.
# Column names follow the width of the model output rather than a fixed 20,
# so the frame also builds for a model with a different number of topics
# (with 20 topics the names are unchanged: topic_1 .. topic_20).
topic_columns = ['topic_' + str(i) for i in range(1, topic_vector.shape[1] + 1)]
df_excerpt_a = pd.DataFrame(topic_vector, columns=topic_columns)
df_excerpt_a

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20
0,0.000229,0.000229,0.045523,0.000229,0.000229,0.000229,0.000229,0.000229,0.29005,0.020638,0.000229,0.000229,0.000229,0.000229,0.089696,0.149312,0.186939,0.000229,0.116019,0.099071


### 2. PIPELINE 2 - sentiment analysis and word counts

In [8]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize

# Read the excerpt inside a context manager so the file handle is closed
# as soon as the text is in memory.
# NOTE(review): no encoding is passed, so the platform default is used.
with open(file_link) as excerpt_file:
    t = excerpt_file.read()

book_excerpt = TextBlob(t)
word_count = len(book_excerpt.words)
sentence_count = len(book_excerpt.sentences)
# Words per sentence; raises ZeroDivisionError if the text has no sentences.
avg_len = word_count / sentence_count

# One feature row. The two sentiment entries are labelled 'polarity' and
# 'subjectivity' by the DataFrame that is built from this row.
excerpt_sentiment = book_excerpt.sentiment
sentiment_excerpt = [[word_count, sentence_count, avg_len,
                      excerpt_sentiment[0], excerpt_sentiment[1]]]

In [9]:
# Show the raw feature row: [word_count, sentence_count, avg_len, sentiment[0], sentiment[1]].
sentiment_excerpt

[[988, 52, 19.0, 0.03766065103727441, 0.568975000468507]]

In [10]:
# Wrap the single feature row in a DataFrame with named columns.
sentiment_columns = ['word_count', 'sentence_count', 'sentence_length',
                     'polarity', 'subjectivity']
df_excerpt_b = pd.DataFrame(data=sentiment_excerpt, columns=sentiment_columns)

In [11]:
# Display the word-count / sentence / sentiment features of the excerpt.
df_excerpt_b

Unnamed: 0,word_count,sentence_count,sentence_length,polarity,subjectivity
0,988,52,19.0,0.037661,0.568975


In [12]:
# join the topic features and the sentiment / length features column-wise

In [13]:
# Put topic weights and sentiment / length features side by side in one row.
excerpt_parts = [df_excerpt_a, df_excerpt_b]
df_excerpt = pd.concat(excerpt_parts, axis='columns')

In [14]:
# Display the combined (still untransformed) feature row.
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,0.000229,0.000229,0.045523,0.000229,0.000229,0.000229,0.000229,0.000229,0.29005,0.020638,...,0.149312,0.186939,0.000229,0.116019,0.099071,988,52,19.0,0.037661,0.568975


In [15]:
# transform the excerpt features (log of counts and topics, scaled sentiment);
# presumably this mirrors how the training corpus was prepared -- confirm.
# NOTE: this cell rewrites df_excerpt. Re-run the concat cell before running
# it a second time, otherwise the transforms are applied twice.
from sklearn import preprocessing  # NOTE(review): not used in this cell as shown



# log transform the counts
column_names_to_log_1 = ['word_count', 'sentence_count', 'sentence_length']

# Replace the columns outright instead of writing through .loc[:, cols]:
# word_count / sentence_count hold integers, and writing float logs into them
# in place relies on implicit upcasting, which newer pandas deprecates.
df_excerpt[column_names_to_log_1] = np.log(df_excerpt[column_names_to_log_1])


# normalize subjectivity and polarity
# The values are handed to the scaler as a plain array, so this column order
# matters; presumably it is the order the scaler was fitted with -- confirm.
column_names_to_normalize = ['subjectivity', 'polarity']

# load the scaler
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only load artefacts that were produced by this project.
with open('../data/vectors/scaler', 'rb') as scaler_file:  # handle is closed once loaded
    min_max_scaler = dill.load(scaler_file)

x = df_excerpt[column_names_to_normalize].values
x_scaled = min_max_scaler.transform(x)  # only transform, never re-fit on the excerpt
df_excerpt[column_names_to_normalize] = x_scaled

# log transform topics
# Every topic_* column is picked by name rather than through the fixed
# 'topic_1':'topic_20' slice, so the step does not depend on the topic count.
# NOTE(review): np.log yields -inf for a weight of exactly 0 -- confirm the
# chosen topic model never outputs zeros.
topic_columns = [name for name in df_excerpt.columns if name.startswith('topic_')]
df_excerpt[topic_columns] = np.log(df_excerpt[topic_columns])



In [16]:
# Display the feature row after the log transforms and sentiment scaling.
df_excerpt

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,-8.380227,-8.380227,-3.089533,-8.380227,-8.380227,-8.380227,-8.380227,-8.380227,-1.237702,-3.880644,...,-1.90172,-1.676971,-8.380227,-2.153999,-2.311917,6.895683,3.951244,2.944439,0.311135,0.820178


In [17]:
# Read the training-corpus feature table and discard its 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV).
corpus = pd.read_csv('../data/final_train.csv').drop(columns='Unnamed: 0')
corpus.head()

Unnamed: 0,book_title,author_name,book_location,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_16,topic_17,topic_18,topic_19,topic_20,word_count,sentence_count,sentence_length,polarity,subjectivity
0,Behind the Beyond,Stephen Leacock,../data/gutenberg/Stephen Leacock___Behind the...,-12.002609,-12.002609,-12.002609,-2.871504,-3.153252,-12.002609,-1.804827,...,-2.601882,-12.002609,-2.533365,-12.002609,-1.715871,10.312314,7.71913,2.593184,0.463344,0.43898
1,Tommy and Co,Jerome Klapka Jerome,../data/gutenberg/Jerome Klapka Jerome___Tommy...,-12.689609,-3.822413,-12.689609,-12.689609,-12.689609,-12.689609,-3.369898,...,-2.813341,-1.759709,-12.689609,-12.689609,-2.188462,10.936636,8.395477,2.541159,0.481264,0.534175
2,Winsome Winnie and other New Nonsense Novels,Stephen Leacock,../data/gutenberg/Stephen Leacock___Winsome Wi...,-12.31824,-4.697728,-2.575599,-2.31933,-4.82236,-2.884098,-1.610666,...,-2.252721,-3.056927,-12.31824,-12.31824,-2.031256,10.587266,8.02027,2.566996,0.506857,0.48805
3,The Moccasin Ranch,Hamlin Garland,../data/gutenberg/Hamlin Garland___The Moccasi...,-11.631952,-11.631952,-0.884569,-1.790058,-4.037696,-11.631952,-3.698611,...,-11.631952,-11.631952,-11.631952,-11.631952,-4.415382,9.817221,7.224753,2.592468,0.39793,0.594432
4,Three Ghost Stories,Charles Dickens,../data/gutenberg/Charles Dickens___Three Ghos...,-5.966696,-2.489925,-3.869492,-2.632903,-11.630886,-11.630886,-0.820747,...,-3.382406,-4.625073,-4.409436,-4.862522,-2.233253,9.96086,6.907755,3.053105,0.40481,0.465239


In [18]:
# Features left out of the similarity search; they are removed from both
# frames so the corpus and the excerpt keep the same feature set.
drop_cols = ['word_count', 'sentence_count']
corpus = corpus.drop(drop_cols, axis=1)
df_excerpt = df_excerpt.drop(drop_cols, axis=1)

In [19]:
# Print the (rows, columns) of both frames after the drop, one per line.
print(corpus.shape, df_excerpt.shape, sep='\n')

(2428, 26)
(1, 23)


In [20]:
# Shape both frames into numpy arrays for the similarity search.
given_excerpt = np.array(df_excerpt)

# Take the corpus features by the excerpt's column names instead of the
# positional slice iloc[:, 3:]. Both arrays then hold the same features in the
# same order regardless of how many leading metadata columns the corpus has,
# and a feature missing from the corpus raises KeyError instead of silently
# comparing mismatched columns.
feature_columns = list(df_excerpt.columns)
search_in = np.array(corpus[feature_columns])

In [21]:
# Print both array shapes, one per line, to compare their feature counts.
print(given_excerpt.shape, search_in.shape, sep='\n')

(1, 23)
(2428, 23)


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every corpus row against the excerpt row.
results = cosine_similarity(X=search_in, Y=given_excerpt)

In [23]:
# Rank corpus rows by similarity (column 0), highest first, and keep the top one.
ranked = pd.DataFrame(results).sort_values(by=0, ascending=False)
show_me = ranked.head(1)
show_me

Unnamed: 0,0
1393,0.954045


In [24]:
# Fetch the first two corpus columns (book_title, author_name) of the best
# match and format them as "<title> by <author>".
best_rows = list(show_me.index)
title = corpus.iloc[best_rows, 0:2].values
similar_book = ' by '.join(title[0])
similar_book

'The Parasite by Sir Arthur Conan Doyle'

Fifty Shades of Grey<br>
Anna Christie by Eugene O'Neill<br>
Anna Christie is the story of a former prostitute who falls in love, but runs into difficulty in turning her life around.

The Blue Djinn of Babylon<br>
The Parasite by Sir Arthur Conan Doyle<br>
The Parasite makes use of a form of mind control similar to the mesmerism of the Victorian era; it works on some hosts but not others.
