# Initialize

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [4]:
dfJokes = pd.read_csv('JokeText.csv')
dfJokes.dropna()
dfJokes.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [7]:
dfReviews = pd.read_csv('UserRatings1.csv')
dfReviews.dropna()
dfReviews.head()

Unnamed: 0,JokeId,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User36701,User36702,User36703,User36704,User36705,User36706,User36707,User36708,User36709,User36710
0,0,5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,-3.64,...,,,,,,,,,2.91,
1,1,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,-3.35,...,,,,-5.63,,-6.07,,-1.6,-4.56,
2,2,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,-6.46,...,,,,,,4.08,,,8.98,
3,3,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,-3.4,...,,,,,,,,,,
4,4,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,1.26,...,2.28,-0.49,5.1,-0.29,-3.54,-1.36,7.48,-5.78,0.73,2.62


# 1. Content based filtering

In [8]:
dfJokes.shape

(100, 2)

In [11]:
dfJokes.drop_duplicates(subset ='JokeText', keep = 'first', inplace = True)
dfJokes.shape

(100, 2)

Build model

In [13]:
# Generate a matrix of common terms that show up in each joke

from sklearn.feature_extraction.text import TfidfVectorizer
mdlJokes = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = mdlJokes.fit_transform(dfJokes['JokeText'])
tfidf_matrix.shape

(100, 3774)

In [14]:
# Calculate cosine similarity between each pair of jokes as a function of the similarity of the common terms

from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(100, 100)

# Predict

In [26]:
# Prepare recommendation function (build code from scratch and then package as function for ease of understanding)

jokes = dfJokes['JokeText']
indices = pd.Series(dfJokes.index, index=dfJokes['JokeText'])

def get_recommendations(joke):
    idx = indices[joke]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    joke_indices = [i[0] for i in sim_scores]
    return jokes.iloc[joke_indices]

In [27]:
get_recommendations("A man visits the doctor. The doctor says ""I have bad news for you.You have cancer and Alzheimer's disease"". The man replies ""Well,thank God I don't have cancer!""").head(10)

KeyError: "A man visits the doctor. The doctor says I have bad news for you.You have cancer and Alzheimer's disease. The man replies Well,thank God I don't have cancer!"