# Imports

In [1]:
#imports basics
import pandas as pd
import numpy as np
from tqdm import tqdm

#loading data
from surprise import Dataset, Reader

#imports for training
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate, KFold
from surprise.model_selection.search import GridSearchCV

#models
from surprise import SVD
from surprise import NMF
from surprise import KNNBasic
from surprise import KNNWithMeans

#metrics
from surprise import accuracy

# Read Preprocessed Data

In [2]:
#load data
df =  pd.read_csv('../data/processed/preprocessed-data.csv')

In [3]:
#perform train test split
reader = Reader(rating_scale=(1, 10))
trainset, testset = train_test_split(df[['user_id','isbn','rating']], test_size=0.2, random_state=42, shuffle=True)
trainset.index = range(len(trainset))
testset.index = range(len(testset))
trainset = Dataset.load_from_df(trainset, reader)
testset = testset.values.tolist()

In [11]:
print(df.shape[0])
print(df.groupby(['user_id','isbn']).publisher.sum())

89905
user_id  isbn      
114      0446608653              Warner Books
         0446612545              Warner Books
         0446612618              Warner Books
         0451208080               Signet Book
         0553584383              Bantam Books
                                ...          
278843   0553274503                    Bantam
         059035342X    Arthur A. Levine Books
         0679412956           Alfred A. Knopf
         0767902890                  Broadway
         0786881852                  Hyperion
Name: publisher, Length: 89905, dtype: object


# Define Models

In [27]:
# Define the models and their hyperparameters
algorithms = []

algorithms.append(
    {
        "description": 'SVD()',
        "model": SVD,
        "param_grid": {
            'n_factors': [100,150,275],
            'n_epochs': [10, 25, 50],
            'lr_all': [0.002, 0.005, 0.01],
            'reg_all': [0.02, 0.05, 0.13],
            'random_state': [42]
        }
    }
)

algorithms.append(
    {
        "description": 'kNN-basic',
        "model": KNNBasic,
        "param_grid": {
            'k': [2, 3, 5, 7],
            'sim_options': {
                'name': ['msd', 'cosine', 'pearson'],
                'min_support': range(1,3),
                'user_based': [False, True]}
        }
    }
)
algorithms.append(
    {
        "description": 'kNN-with-Means',
        "model": KNNWithMeans,
        "param_grid": {
            'k': [2, 3, 5,7],
            'sim_options': {
                'name': ['msd', 'cosine', 'pearson'],
                'min_support': range(1,3),
                'user_based': [False, True]}
        }
    }
)


k_fold = KFold(
    n_splits=4,
    random_state=42,
    shuffle=True
)

# Run Cross-Validation with Different Models

In [28]:
result_list = []

In [30]:
print(parameter_grid)


{'n_factors': [100, 150, 275], 'n_epochs': [10, 25, 50], 'lr_all': [0.002, 0.005, 0.01], 'reg_all': [0.02, 0.05, 0.13], 'random_state': 42}


In [31]:
algo = algorithms[0]['model']
print(type(algo))
parameter_grid = algorithms[0]['param_grid']
print(parameter_grid)
gs = GridSearchCV(
    algo,
    param_grid=parameter_grid,
    measures=['rmse', 'mae'],
    cv=k_fold
)
gs.fit(trainset)

result_list.append({
    "algorithm":        algorithms[0]['description'],
    "rmse_best_score":  gs.best_score['rmse'],
    "rmse_best_params": gs.best_params['rmse'],
    "mae_best_score":   gs.best_score['mae'],
    "rmse_best_params": gs.best_params['mae'],
})
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
result_list.append(gs.cv_results)

<class 'type'>
{'n_factors': [100, 150, 275], 'n_epochs': [10, 25, 50], 'lr_all': [0.002, 0.005, 0.01], 'reg_all': [0.02, 0.05, 0.13], 'random_state': [42]}
1.5610370527042674
{'n_factors': 100, 'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.13, 'random_state': 42}


In [32]:
gs = GridSearchCV(
    algorithms[1]['model'],
    param_grid=algorithms[1]['param_grid'],
    measures=['rmse', 'mae'],
    cv=k_fold
)
gs.fit(trainset)

result_list.append({
    "algorithm":        algorithms[1]['description'],
    "rmse_best_score":  gs.best_score['rmse'],
    "rmse_best_params": gs.best_params['rmse'],
    "mae_best_score":   gs.best_score['mae'],
    "rmse_best_params": gs.best_params['mae'],
})
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
result_list.append(gs.cv_results)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [33]:
gs = GridSearchCV(
    algorithms[2]['model'],
    param_grid=algorithms[2]['param_grid'],
    measures=['rmse', 'mae'],
    cv=k_fold
)
gs.fit(trainset)

result_list.append({
    "algorithm":        algorithms[2]['description'],
    "rmse_best_score":  gs.best_score['rmse'],
    "rmse_best_params": gs.best_params['rmse'],
    "mae_best_score":   gs.best_score['mae'],
    "rmse_best_params": gs.best_params['mae'],
})
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
result_list.append(gs.cv_results)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [38]:
result_list[4]

{'algorithm': 'kNN-with-Means',
 'rmse_best_score': 1.7422495762183148,
 'rmse_best_params': {'k': 7,
  'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': True}},
 'mae_best_score': 1.297301479670614}

In [None]:
result_list = []

i = 0

for algorithm in algorithms:
    # Perform cross validation

    gs = GridSearchCV(
        algorithm['model'],
        param_grid=algorithm['param_grid'],
        measures=['rmse', 'mae'],
        cv=k_fold,
        n_jobs=-1
    )
    gs.fit(trainset)

    result_list.append({
        "algorithm":        algorithm['description'],
        "rmse_best_score":  gs.best_score['rmse'],
        "rmse_best_params": gs.best_params['rmse'],
        "mae_best_score":   gs.best_score['mae'],
        "rmse_best_params": gs.best_params['mae'],
    })
    # best RMSE score
    print(gs.best_score['rmse'])
    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    result_list.append(gs.cv_results)


    #results = cross_validate(algorithm['model'], trainset, measures=['RMSE','MAE'], cv=5, verbose=False)

    # Get results & append algorithm name
    #tmp = pd.DataFrame.from_dict(results)
    #tmp = tmp.append(pd.Series([algorithm['description']], index=['Algorithm']))
    #benchmark.append(tmp)
    #i+=1

#benchmark_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

1.5610370527042674
{'n_factors': 100, 'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.13, 'random_state': 42}


print the results

In [7]:
#train the best model from the benchmark (SVD(n_factors=50)) on the full trainset and evaluate on the testset
#
trainset = trainset.build_full_trainset()
algorithm = SVD(n_factors=50)
model_svd = algorithm.fit(trainset)
predictions = model_svd.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 1.6196
1.6196192492983177
MAE:  1.2523
1.2523249502799758


# Content Based Approach

In [54]:
#load data
df =  pd.read_csv('../data/processed/preprocessed-data.csv')

'remove_trash' is a function that gets passed a list of substrings and a summary text. It replaces all occurences of each substring in the list by ''.
In the case of the summaries it is text that is polluted with characters that were not properly decoded. Here is an example

Also there are some books where the summary contains only the string '9', which indicates a default value for missing summaries. We drop those books.

In [73]:
df.Summary[32223]

'Praise for The Diary of a Young Girl &quot;A truly remarkable book.'

In [74]:
def remove_trash(text, trash_list):
    for trash in trash_list:
        text = text.replace(trash, ' ')
    return text

In [75]:
df['Summary'] = df['Summary'].map(lambda x: remove_trash(x,['\n', '&quot;','&#39;']))
ratings_with_summary = df[ (df['Summary']!= '9')]


grouping by isbn to get each summary, book title, and isbn only once


In [76]:
books_with_summary = ratings_with_summary.groupby('isbn').apply(lambda df: df.iloc[0]).loc[:,['isbn', 'book_title', 'Summary']]

In [77]:
books_with_summary.loc[:,['isbn', 'book_title','Summary']]

Unnamed: 0_level_0,isbn,book_title,Summary
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0007110928,0007110928,Billy,"Biography of Billy Connolly, written by his wi..."
0020442203,0020442203,"Lion, the Witch and the Wardrobe","The best-selling rack edition of The Lion, the..."
002542730X,002542730X,Politically Correct Bedtime Stories: Modern Ta...,A whimsical adaptation of classic fairy tales ...
0060008776,0060008776,Pagan Babies,Father Terry Dunn thought he d seen everything...
006001203X,006001203X,Dr. Atkins' New Diet Revolution,With Dr. Atkins New Diet Revolution you get e...
...,...,...,...
3548359698,3548359698,Warum MÃ?ÃÂ¤nner nicht zuhÃ?ÃÂ¶ren und Fra...,Die Autoren werfen einen amÃ¼santen Blick auf ...
3596150655,3596150655,Generation Golf. Eine Inspektion,"Ein aus Erinnerungen, Analysen und Impressione..."
8445071416,8445071416,El Hobbit,Sacado de su comodo agujero-hobbit por Gandalf...
8807813025,8807813025,"Novocento, Un Monologo","Il libro racchiude la storia, raccontata dall ..."


In [85]:
books_with_summary.groupby('Summary').count().sort_values(by='isbn',ascending=False)
books_with_summary.groupby('Summary').filter(lambda df: len(df.index) > 1 )

Unnamed: 0_level_0,isbn,book_title,Summary
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0060512806,0060512806,Cryptonomicon,With this extraordinary first volume in what p...
0061015725,0061015725,Prey,"Deep in the remote Nevada desert, eight people..."
0064407667,0064407667,The Bad Beginning (A Series of Unfortunate Eve...,"In the first two books alone, the three youngs..."
0064407675,0064407675,The Reptile Room (A Series of Unfortunate Even...,"In the first two books alone, the three youngs..."
0064472795,0064472795,Princess in the Spotlight (The Princess Diarie...,She s just a New York City girl living with he...
...,...,...,...
087113795X,087113795X,Peace Like a River,Eleven-year-old Reuben shares the story of how...
0877017883,0877017883,Griffin & Sabine: An Extraordinary Correspondence,"Don t miss The Pharos Gate, the final volume i..."
1573225517,1573225517,High Fidelity,Follows the love affairs and belated growing u...
1573228214,1573228214,High Fidelity,Follows the love affairs and belated growing u...


In [95]:
text_corpus = [text for text in books_with_summary.Summary.unique()]

['Biography of Billy Connolly, written by his wife Pamela Stephenson.',
 'The best-selling rack edition of The Lion, the Witch and the Wardrobe now has a movie still cover and an eight-page movie still insert!  Excellent for Homeschool Use ',
 'A whimsical adaptation of classic fairy tales and bedtime stories removes all kinds of bias and objective language from such traditional tales as  Chicken Little,   Rapunzel,   The Three Little Pigs,   Cinderella,  and many others.',
 'Father Terry Dunn thought he d seen everything on the mean streets of Detroit, but that was before he went on a little retreat to Rwanda to evade a tax-fraud indictment.',
 'With Dr. Atkins  New Diet Revolution you get everything you need: essential medical and nutritional information, a helpful carbohydrate gram counter, low-carb meal plans and dozens of mouth-watering recipes for everything from appetizers to ...',
 'February the fifteenth is a very special day for me.',
 'As this spellbinding novel unfolds, it 

In [13]:
len(text_corpus)

1363

In [14]:
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models.tfidfmodel import TfidfModel
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string
from gensim.corpora import Dictionary


In [52]:
text_corpus[1000]

'Caterer and amateur sleuth Goldy B. Schulz gets caught between a ruthless cosmetic company and a violent animal rights group Featuring Original Recipes for Such Luscious (and Lowfat) Dishes as Fettuccine Alfredo with Asparagus, Decadent ...'

In [16]:
#gensim preprocess_documents() function applies basic text preprocessing, like removing stop words, punctuation, stemming etc.
#it gets a list of strings. Each string in the list is the summary for one book
text_corpus_preprocessed = preprocess_documents(text_corpus)
dct = Dictionary(text_corpus_preprocessed) #this only build a mapping of an integer to each word in the corpus
corpus = [dct.doc2bow(summary_of_book) for summary_of_book in text_corpus_preprocessed]
#than the doc2bow() function is applied to replace the summary which is a list of word to a list of tuples like (int, int). the first integers refers to the words key in the dct and the second integer how often that word occured in that particular summary
#e.g. dct.doc2bow(['happy','summer', 'day', 'summer']) >> [(234,1),(65,2),(23,1)]
tfidf = TfidfModel(corpus)

[['suicid',
  'billionair',
  'burnt',
  'washington',
  'litig',
  'woman',
  'forsaken',
  'technolog',
  'work',
  'wild',
  'brazil',
  'brought',
  'astound',
  'mysteri',
  'testament'],
 ['stare',
  'unflinchingli',
  'abyss',
  'slaveri',
  'novel',
  'transform',
  'histori',
  'stori',
  'power',
  'exodu',
  'intim',
  'lullabi'],
 ['wild',
  'animu',
  'search',
  'primordi',
  'test',
  'human',
  'foundat',
  'journei',
  'break',
  'point'],
 ['fatal',
  'mid',
  'air',
  'collis',
  'involv',
  'commerci',
  'airlin',
  'prompt',
  'frantic',
  'desper',
  'investig',
  'caus',
  'accid',
  'thriller',
  'explor',
  'issu',
  'safeti',
  'secur',
  'aircraft',
  'industri'],
 ['quantum',
  'time',
  'machin',
  'group',
  'young',
  'historian',
  'sent',
  'year',
  'rescu',
  'trap',
  'project',
  'leader'],
 ['unforgett',
  'novel',
  'childhood',
  'sleepi',
  'southern',
  'town',
  'crisi',
  'conscienc',
  'rock',
  'kill',
  'mockingbird',
  'instant',
  'bests

In [17]:
dct[2]

'brazil'

In [18]:
len(dct)

5979

In [19]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(12, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1)],
 [(36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1)],
 [(56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1)],
 [(20, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1)],
 [(24, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1)],
 [(

In [20]:
def compute_similarity(text, sim_matrix):
    preprocessed_text = preprocess_string(text)
    encoded_text = dct.doc2bow(preprocessed_text)
    similarities = sim_matrix[encoded_text]
    sorted_similarities = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    for i in range(5):
        print(sorted_similarities[i])
        print(f'text: {text_corpus[sorted_similarities[i][0]]}')
        print('---------------')
    print(f'original text: {text}\n\n')

In [21]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=5979)

In [53]:
compute_similarity(text_corpus[1000],index)

(1000, 0.979887)
text: Caterer and amateur sleuth Goldy B. Schulz gets caught between a ruthless cosmetic company and a violent animal rights group Featuring Original Recipes for Such Luscious (and Lowfat) Dishes as Fettuccine Alfredo with Asparagus, Decadent ...
---------------
(915, 0.26370758)
text: Caterer and occasional sleuth Goldy B. Schulz sets off on the trail of the killer of her ex-husband s girlfriend As much as Goldy would like to see her ex get his just desserts, could he really be a killer?
---------------
(429, 0.1774226)
text: A Golden Opportunity For caterer Goldy Schulz, who s been worrying about her cash flow, the chance to work for a high-powered company like Prospect Financial Partners seems like a dream come true.
---------------
(1077, 0.1679584)
text: Caterer Goldy Bear eagerly prepares her wedding feast, until word comes from the groom that the wedding is off because of a killer Featuring Original Recipes for Such Heavenly Delights as Monster Cinnamon Rolls, S

In [88]:
%pip install -U sentence-transformers



Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 1.1 MB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 1.3 MB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 1.3 MB/s eta 0:00:01
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.4 MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-macosx_10_11_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 706 kB/s eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... 

In [90]:
%pip install sentence_transformer
#Write some lines to encode (sentences 0 and 2 are both ideltical):
sen = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell.",
    "Jupyter is a god the ancients greeks used to believe in",
    "The planet jupyter is one of the biggest in our solar system",
    "Venus is also part of the old religions"
]
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')
#Encoding:
sen_embeddings = model.encode(sen)
sen_embeddings.shape

[31mERROR: Could not find a version that satisfies the requirement sentence_transformer (from versions: none)[0m
[31mERROR: No matching distribution found for sentence_transformer[0m
Note: you may need to restart the kernel to use updated packages.


Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

(4, 768)

In [97]:
sen = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell.",
    "Jupyter is a god the ancients greeks used to believe in",
    "The planet jupyter is one of the biggest in our solar system",
    "Venus is also part of the old religions"
]
sen_embeddings = model.encode(text_corpus)
sen_embeddings.shape

(1363, 768)

In [130]:
from sklearn.metrics.pairwise import cosine_similarity
#let's calculate cosine similarity for sentence 0:
outcome = cosine_similarity(
    [sen_embeddings[654]],
    sen_embeddings[0:]
)

In [131]:
outcome

array([[0.29222   , 0.4056784 , 0.35955143, ..., 0.30101427, 0.26883286,
        0.5146457 ]], dtype=float32)

In [132]:
for i,j in enumerate(sorted(outcome[0])[-10:]):
    print(f'Similarity of {j}: {text_corpus[i]}\n')


Similarity of 0.7007507085800171: Biography of Billy Connolly, written by his wife Pamela Stephenson.

Similarity of 0.7062540054321289: The best-selling rack edition of The Lion, the Witch and the Wardrobe now has a movie still cover and an eight-page movie still insert!  Excellent for Homeschool Use 

Similarity of 0.7065611481666565: A whimsical adaptation of classic fairy tales and bedtime stories removes all kinds of bias and objective language from such traditional tales as  Chicken Little,   Rapunzel,   The Three Little Pigs,   Cinderella,  and many others.

Similarity of 0.7069470882415771: Father Terry Dunn thought he d seen everything on the mean streets of Detroit, but that was before he went on a little retreat to Rwanda to evade a tax-fraud indictment.

Similarity of 0.7101007103919983: With Dr. Atkins  New Diet Revolution you get everything you need: essential medical and nutritional information, a helpful carbohydrate gram counter, low-carb meal plans and dozens of mouth

In [134]:
text_corpus[1000]

'Top forensic sculptor Eve Duncan is forced into a deadly game of terror with a twisted murderer who uses the mysterious circumstances surrounding her daughter s death as a way to lure her to him.'