**Global imports**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.spatial.distance import cosine, correlation

pd.options.display.float_format='{:,.3f}'.format

**Import from Surprise**

In [2]:
from surprise import Dataset, Reader
from surprise import SVD, KNNWithMeans, SlopeOne
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:
import re
import nltk      
nltk.download()  ### This will let you download various NLTK packages; it's only necessary to do this once

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Part 1. [50 pts] Content-Based Joke Recommendation

Read in the joke ratings data as well as joke text data into appropriate data structures. First, preprocess the joke text data (tokenization, stop word removal, stemming) and create feature vector representation of jokes with TFxIDF weights. Note that for some of the preprocessing steps, you may need to install NLTK package if not already installed with your Anaconda distribution. For the vectorization, you can use functions from Scikit-learn's feature_extraction.text module. Show the feature set or the vocabulary for the item vectors.

In [4]:
joke_ratings = pd.read_csv("joke-ratings.csv", header=None)
joke_ratings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,3,19,1,2,3,2,1,15,2,6,...,13,0,0,0,0,0,5,0,0,0
1,15,10,17,15,8,1,10,5,19,20,...,13,6,10,18,10,8,14,11,6,12
2,0,0,0,0,20,20,20,20,0,0,...,0,0,0,20,0,0,0,0,0,0
3,0,19,0,0,12,19,8,17,0,12,...,0,0,0,11,0,0,0,0,0,0
4,19,15,6,5,12,12,18,15,10,16,...,16,16,15,16,16,12,14,17,12,12


In [5]:
joke_text = pd.read_csv("jokes.csv",header=None, dtype=str, index_col=0, names=["Joke Text"])
joke_text.head()

Unnamed: 0,Joke Text
0,"A man visits the doctor. The doctor says ""I ha..."
1,This couple had an excellent relationship goin...
2,Q. What's 200 feet long and has 4 teeth? A. Th...
3,Q. What's the difference between a man and a t...
4,Q. What's O. J. Simpson's Internet address? A....


In [6]:
items = {}
users = {}
iid = 0
uid = 0
for i in joke_ratings.columns:
    items[i] = iid
    iid += 1
for u in joke_ratings.index:
    users[u] = uid
    uid += 1

In [7]:
ratings = pd.DataFrame([],columns=['user_id', 'joke_id', 'rating'], dtype=int)

for u in joke_ratings.index:
    for i in joke_ratings.columns:
        if joke_ratings[i][u] > 0:
            new_rating = {"user_id":users[u], "joke_id":items[i], "rating":joke_ratings[i][u]}
            ratings = ratings.append(new_rating, ignore_index=True)

ratings.sample(10)

Unnamed: 0,user_id,joke_id,rating
13422,192,85,7
69632,985,2,20
52880,742,81,8
48226,681,10,19
46628,657,55,14
36805,513,24,1
8248,114,7,16
61760,872,41,12
70673,999,76,14
1214,15,50,14


In [8]:
#First, preprocess the joke text data (tokenization, stop word removal, stemming) 
#and create feature vector representation of jokes with TFxIDF weights.

#code from class notebook
def normalize_document(doc):
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words('english')
    ps = nltk.stem.PorterStemmer()
    # convert to lower case, and remove special characters and white space
    doc = re.sub(r'[^a-zA-Z0-9_\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize the document
    tokens = wpt.tokenize(doc)
    # remove stopwords
    filtered_tokens = [token for token in tokens if (token not in stop_words and token not in ["."])]
    # put the filtered document back together
    doc = ' '.join([ps.stem(token) for token in filtered_tokens])
    return doc

In [9]:
vectorzer = TfidfVectorizer(preprocessor=normalize_document, norm=None, max_df = 0.8, min_df = 3)

In [10]:
vectorzer.fit(joke_text['Joke Text'])

TfidfVectorizer(max_df=0.8, min_df=3, norm=None,
                preprocessor=<function normalize_document at 0x000001F24D57BB70>)

In [11]:
joke_mat = vectorzer.transform(joke_text['Joke Text'])

In [12]:
joke_mat

<100x186 sparse matrix of type '<class 'numpy.float64'>'
	with 1005 stored elements in Compressed Sparse Row format>

In [13]:
features = vectorzer.get_feature_names()
print(features[100:120])

['morn', 'mother', 'much', 'must', 'need', 'never', 'new', 'news', 'next', 'notic', 'oh', 'okay', 'old', 'one', 'open', 'order', 'paid', 'pass', 'person', 'phone']


In [14]:
vectorzer.vocabulary_

{'man': 91,
 'doctor': 38,
 'say': 137,
 'bad': 11,
 'news': 107,
 'repli': 130,
 'well': 173,
 'coupl': 32,
 'go': 60,
 'one': 113,
 'day': 33,
 'came': 23,
 'home': 73,
 'work': 181,
 'find': 53,
 'ask': 8,
 'told': 163,
 'thing': 157,
 'could': 30,
 'possibl': 122,
 'said': 136,
 'make': 90,
 'big': 16,
 'word': 180,
 'year': 184,
 'old': 112,
 'what': 175,
 'feet': 51,
 'long': 86,
 'differ': 37,
 'around': 6,
 'use': 169,
 'bill': 17,
 'hillari': 72,
 'back': 10,
 'pull': 125,
 'run': 135,
 'school': 138,
 'car': 24,
 'window': 178,
 'good': 62,
 'turn': 166,
 'marri': 93,
 'wife': 177,
 'would': 182,
 'presid': 123,
 'mani': 92,
 'take': 154,
 'screw': 139,
 'light': 82,
 'funni': 56,
 'hear': 70,
 'countri': 31,
 'guy': 67,
 'goe': 61,
 'bar': 12,
 'demand': 35,
 'got': 63,
 'like': 83,
 'return': 133,
 'okay': 111,
 'resourc': 131,
 'come': 27,
 'start': 149,
 'anyth': 5,
 'two': 167,
 'eat': 42,
 'someon': 146,
 'seem': 142,
 'look': 87,
 'measur': 94,
 'hell': 71,
 'answer': 

 -----
 Create a non-personalized recommender that, given a target/query item, returns the top k most similar items to the target item. Your function/program should use as input a pre-computed item-item similarity matrix, with Cosine similarity as metric, based on feature representations of items. Your recommender should output the texts of the query joke and the top k recommended jokes along with the similarity values to the query joke. Show the output of your recommender on jokes 19 and 46.

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

sim_mat = cosine_similarity(joke_mat)
sim_mat.shape

(100, 100)

In [16]:
np.set_printoptions(linewidth=120, precision=2, edgeitems=10)

In [17]:
#print(sim_mat)

In [18]:
def content_based_sim(dataMat, simMatrix, item, k):
    sims = simMatrix[item,:]
    idx = np.argsort(sims)
    idx = idx[::-1]
    ### Need to remove the item itself since it has the highest similarity to itself
    idx = np.array([i for i in idx if i != item])
    neigh_idx = idx[:k]
    neigh_sims = sims[neigh_idx]
    return neigh_idx, neigh_sims

In [19]:
joke = 19

In [20]:
def get_joke_text(jokes, id):
    return jokes.iloc[id][0]

In [21]:
get_joke_text(joke_text,3)

"Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it."

In [31]:
joke = 19
joke_text.iloc[19]['Joke Text']

"What's the difference between a MacIntosh and anEtch-A-Sketch? You don't have to shake the Mac to clear the screen. "

In [23]:
nidx, nsims = content_based_sim(joke_mat, sim_mat, joke, 10)

In [24]:
print(nidx)

[77  4 29 99  3 20  2 72 74 38]


In [25]:
print(nsims)

[0.91 0.75 0.66 0.65 0.61 0.47 0.38 0.37 0.33 0.13]


In [26]:
#finding top 10 similar jokes and their similarity
k=10

nidx, nsims =content_based_sim(joke_mat, sim_mat, joke, k)
print('Top {} similar jokes for joke {}:\n'.format(k, joke))
for n,s in zip(nidx,nsims):
    print('Joke # {}, similarity rating: {}'.format(n, s))
    print(get_joke_text(joke_text, n),"\n")

Top 10 similar jokes for joke 19:

Joke # 77, similarity rating: 0.9053038607511614
Q: What's the difference between the government  and  the Mafia? A: One of them is organized. 

Joke # 4, similarity rating: 0.746102659945633
Q. What's O. J. Simpson's Internet address? A.	Slash slash backslash slash slash escape. 

Joke # 29, similarity rating: 0.6566652022371093
Q: What's the difference between a Lawyer and a Plumber? A: A Plumber works to unclog the system. 

Joke # 99, similarity rating: 0.6538919150447557
Q: What's the difference between greeting a Queen and greeting thePresident of the United  States?A: You only have to get on one knee to greet the queen. 

Joke # 3, similarity rating: 0.6053304895090087
Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it. 

Joke # 20, similarity rating: 0.46764791494117813
What's the difference between a used tire and 365 used condoms?One's a Goodyear the other's a great year. 

Joke # 2, s

In [27]:
#second joke
get_joke_text(joke_text,46)

'There was an engineer who had an exceptional gift for fixing all things mechanical.  After serving his company loyally for over 30 years he happily retired.  Several years later the company contacted him regarding a seemingly impossible problem they were having with one of their multi-million dollar machines. They had tried everything and everyone else to get the machine fixed but to no avail.  In desperation they called on the retired engineer who had solved so many of their problems in the past.The engineer reluctantly took the challenge.  He spent a day studying the huge machine.  At the end of the day he marked a small "x" in chalk on a particular component of the machine and proudly stated "This is where your problem is".The part was replaced and the machine worked perfectly again. The company received a bill for $50000 from the engineer for his service.They demanded an itemized accounting of his charges. The engineer responded briefly:One chalk mark                       $1 Know

In [28]:
joke=46
nidx, nsims = content_based_sim(joke_mat, sim_mat, joke, 10)
print(nidx)
print(nsims)

[83 92 27 48 61 89 70 47 62 54]
[0.5  0.49 0.47 0.43 0.28 0.24 0.23 0.2  0.15 0.15]


In [29]:
#finding top 10 similar jokes and their similarity
nidx, nsims =content_based_sim(joke_mat, sim_mat, joke, k)
print('Top {} similar jokes for joke {}:\n'.format(k, joke))
for n,s in zip(nidx,nsims):
    print('Joke # {}, similarity rating: {}'.format(n, s))
    print(get_joke_text(joke_text, n),"\n")

Top 10 similar jokes for joke 46:

Joke # 83, similarity rating: 0.49910014923215623
Q: What is the difference between Mechanical Engineers and Civil Engineers? A: Mechanical Engineers build weapons Civil Engineers build targets. 

Joke # 92, similarity rating: 0.48758008243753925
Reaching the end of a job interview the human resources person asked a young engineer fresh out of Stanford"And what starting salary were you looking for?"The engineer said "In the neighborhood of $125000 a year depending on the benefits package."The interviewer said "Well what would you say to a package of 5-weeks vacation 14 paid holidays full medical and dental company matching retirement fund to 50% of salary and a company car leased every 2 years - say a red Corvette?"The Engineer sat up straight and said "Wow! Are you kidding?"And the interviewer replied "Yeah but you started it." 

Joke # 27, similarity rating: 0.46970973368996066
A mechanical electrical and a software engineer from Microsoft were driv

________
Next, for a personalized version of the content-based recommendation, you'll need to create user profiles for each user based on their past ratings on jokes. Use the following procedure for creating user profiles.

We are going to assume that "liked" items by a user u are those jokes whose rating by u is greater than u's mean rating on all jokes rated by u (note that zeros don't count as ratings). These are the jokes from which user profiles for each user will be constructed. So, for each user you need to collect all jokes that satisfy this condition.

In [32]:
#get mean rating for each user
ratings['avg_rating'] = ratings.groupby('user_id')['rating'].transform('mean')
ratings.sample(10)

Unnamed: 0,user_id,joke_id,rating,avg_rating
11979,167,46,14,16.608
19419,275,43,11,9.99
67815,956,40,11,10.789
60793,856,48,15,13.928
43237,610,61,15,12.18
68423,966,27,11,12.343
28309,394,11,8,7.34
52656,739,35,11,8.851
57451,807,52,14,12.216
57176,803,52,3,7.972


In [33]:
#if rating is greater than user's avg rating then "like"
def myFunc(x,y):
    if x < y:
        return 'liked'
    else:
        return 'not liked'

In [34]:
ratings['liked'] = ratings.apply(lambda x: myFunc(x.avg_rating, x.rating), axis=1)

In [35]:
ratings.tail()

Unnamed: 0,user_id,joke_id,rating,avg_rating,liked
70670,999,68,15,12.507,liked
70671,999,69,11,12.507,not liked
70672,999,74,8,12.507,not liked
70673,999,76,14,12.507,liked
70674,999,94,12,12.507,not liked


Split the liked items for each user into a training and a test portions. Normally, we would use random splits, however, for consistency across different answers, in this case you should select the first half of items as the user profile to be used for training, and the second half to be set aside for testing purposes. For each user, you should maintain a list of indices of the items in the user profile (the known "liked" items used for training) as well as indices of items in the test portion of user's record (the withheld test "liked" items).

In [36]:
liked_jokes = ratings.loc[ratings['liked'] == 'liked']
liked_jokes.head()

Unnamed: 0,user_id,joke_id,rating,avg_rating,liked
1,0,1,19,7.014,liked
7,0,7,15,7.014,liked
13,0,13,19,7.014,liked
21,0,21,12,7.014,liked
25,0,25,12,7.014,liked


In [None]:
j_text = pd.read_csv("jokes.csv",header=None, dtype=str, index_col=0, names=["Joke Text"])
j_text.reset_index(level=0, inplace=True)
j_text.rename(columns={'index':'joke_id'},inplace=True)
j_text['joke_id']=j_text['joke_id'].astype(int)

In [46]:
joke_text.reset_index(level=0, inplace=True)
joke_text.rename(columns={'index':'joke_id'},inplace=True)
joke_text['joke_id']=joke_text['joke_id'].astype(int)
#joke_text['Joke Text']=joke_text['Joke Text'].astype(str)

In [47]:
ui_merge = pd.merge(liked_jokes, joke_text)
ui_merge.head()

Unnamed: 0,user_id,joke_id,rating,avg_rating,liked,Joke Text
0,0,1,19,7.014,liked,This couple had an excellent relationship goin...
1,3,1,19,13.125,liked,This couple had an excellent relationship goin...
2,4,1,15,13.747,liked,This couple had an excellent relationship goin...
3,5,1,7,5.64,liked,This couple had an excellent relationship goin...
4,7,1,14,8.1,liked,This couple had an excellent relationship goin...


In [72]:
ui_merge['Joke Text']=ui_merge['Joke Text'].astype(str)

In [48]:
ui_merge.drop(columns=['avg_rating', 'liked'], inplace=True)

In [73]:
uid=0
p=ui_merge[ui_merge['user_id']==uid]
p

Unnamed: 0,user_id,joke_id,rating,Joke Text
0,0,1,19,This couple had an excellent relationship goin...
430,0,7,15,Q. Did you hear about the dyslexic devil worsh...
829,0,13,19,The father was very anxious to marry off his o...
1314,0,21,12,A duck walks into a pharmacy and asks for a co...
1755,0,25,12,A guy walks into a bar and sits down next to a...
2334,0,26,18,Clinton returns from a vacation in Arkansas an...
3081,0,27,13,A mechanical electrical and a software enginee...
3636,0,28,20,An old Scotsmen is sitting with a younger Scot...
4404,0,30,13,President Clinton looks up from his desk in t...
5088,0,33,18,Out in the backwoods of some midwestern state ...


In [161]:
train_merge = ui_merge[:19266]
test_merge = ui_merge[19266:]

In [143]:
user_texts = pd.DataFrame([], columns = ['user_id', 'Joke Text'])
user_content = {}
for pid in set(ui_merge['user_id']):
    p = ui_merge[ui_merge['user_id'] == pid]
    all_text = ' '.join(p['Joke Text'])
    user_content[pid] = [a for a in p['joke_id']]
    row = pd.DataFrame([[pid, all_text]], columns=['user_id', 'Joke Text'])
    user_texts = user_texts.append(row, ignore_index = True)

Unnamed: 0,user_id,joke_id,rating,Joke Text
19261,947,11,14,A guy stood over his tee shot for what seemed ...
19262,948,11,12,A guy stood over his tee shot for what seemed ...
19263,951,11,10,A guy stood over his tee shot for what seemed ...
19264,953,11,13,A guy stood over his tee shot for what seemed ...
19265,956,11,10,A guy stood over his tee shot for what seemed ...


In [None]:
#user_content[1]

In [145]:
user_profiles = vectorzer.transform(user_texts['Joke Text'])

In [146]:
user_profiles

<1000x186 sparse matrix of type '<class 'numpy.float64'>'
	with 166960 stored elements in Compressed Sparse Row format>

In [147]:
print(user_profiles[0:2])

  (0, 185)	4.228826155721369
  (0, 184)	22.520576708300222
  (0, 183)	15.293444190452817
  (0, 182)	36.05114343966443
  (0, 181)	31.30213867053259
  (0, 180)	8.457652311442738
  (0, 179)	7.071357950322847
  (0, 178)	12.686478467164108
  (0, 177)	7.6467220952264086
  (0, 176)	4.228826155721369
  (0, 173)	17.0894796975252
  (0, 172)	15.651069335266294
  (0, 171)	26.16363284165144
  (0, 170)	16.915304622885476
  (0, 169)	10.60703692548427
  (0, 168)	4.005682604407159
  (0, 167)	14.21265897300739
  (0, 166)	7.6467220952264086
  (0, 165)	8.457652311442738
  (0, 164)	15.293444190452817
  (0, 163)	16.022730417628637
  (0, 162)	7.6467220952264086
  (0, 161)	18.781283202319553
  (0, 160)	4.228826155721369
  (0, 159)	8.457652311442738
  :	:
  (1, 27)	10.60703692548427
  (1, 26)	19.11680523806602
  (1, 25)	11.470083142839613
  (1, 24)	14.676841471143785
  (1, 23)	4.005682604407159
  (1, 22)	16.56267711923607
  (1, 21)	25.372956934328215
  (1, 20)	8.457652311442738
  (1, 19)	8.457652311442738
  (1

In [148]:
def content_based_recommend(dataMat, user, rated_items_indicies, k):
    from sklearn.metrics.pairwise import cosine_similarity
    sims = cosine_similarity(dataMat, user)
    sims = sims.flatten() # cosine_similarity returns a nx1 array; flatten it into a 1d numpy array
    idx = np.argsort(sims)
    # Make sure we don't add items that are already rated by the user
    idx = np.array([i for i in idx if not (i in rated_items_indicies)])
    idx = idx[::-1] 
    neigh_idx = idx[:k]
    neigh_sims = sims[neigh_idx]
    return neigh_idx, neigh_sims

In [149]:
u = 0
user = user_profiles[u]
pid = user_texts.iloc[u]["user_id"]
rated_items =joke_text[joke_text["joke_id"].isin(user_content[pid])]

In [150]:
rated_items_idx = np.array(rated_items.index)
rated_items_idx

array([ 1,  7, 13, 21, 25, 26, 27, 28, 30, 33, 34, 35, 46, 47, 48, 49, 52, 53, 55, 58, 60, 61, 64, 65, 67, 68, 90],
      dtype=int64)

In [151]:
n_index, n_sims = content_based_recommend(joke_mat, user, rated_items_idx, 10)

In [152]:
n_index

array([92, 45, 83, 93,  5, 37, 80, 75, 62, 36], dtype=int64)

In [157]:
recs = joke_text.iloc[n_index]
recs

Unnamed: 0,joke_id,Joke Text
92,92,Reaching the end of a job interview the human ...
45,45,A couple has been married for 75 years. For th...
83,83,Q: What is the difference between Mechanical E...
93,93,Two atoms are walking down the street when one...
5,5,Bill & Hillary are on a trip back to Arkansas....
37,37,"May I take your order? the waiter asked. ""Yes ..."
80,80,An Asian man goes into a New York CityBank to ...
75,75,There once was a man and a woman that both go...
62,62,An engineer a physicist and a mathematician ar...
36,36,A Jewish young man was seeing a psychiatrist f...
