# Make a function to build an inverted index and a tfidf vectorizer given tokenized text.

In [6]:
from collections import defaultdict
from scipy.sparse import csr_matrix
import json

In [7]:
with open('Jokes.json') as json_data:
    jokes = json.load(json_data)    

In [8]:
# TODOS
# n-grams for tfidf-vectorizer?

In [9]:
def get_inverted_index(list_of_jokes, include_title, include_post):
    data = [(dt['title'] + ' ' + dt['selftext'] if include_post else dt['title']) if include_title else 
            (dt['selftext'] if include_post else '') for dt in list_of_jokes]
    tok_data = [dt.split(' ') for dt in data]
    return build_inverted_index(tok_data)

def build_inverted_index(list_of_toks_lists):
    """ Builds an inverted index from the messages.
    
    Arguments
    =========
     
    msgs: list of dicts.
        Each message in this list already has a 'toks'
        field that contains the tokenized message.
    
    Returns
    =======
    
    index: dict
        For each term, the index contains a list of
        tuples (doc_id, count_of_term_in_doc):
        index[term] = [(d1, tf1), (d2, tf2), ...]
        
    Example
    =======
    
    >> test_idx = build_inverted_index([
    ...    {'toks': ['to', 'be', 'or', 'not', 'to', 'be']},
    ...    {'toks': ['do', 'be', 'do', 'be', 'do']}])
    
    >> test_idx['be']
    [(0, 2), (1, 2)]
    
    >> test_idx['not']
    [(0, 1)]
    
    """
    # term --> tuple
    index = defaultdict(list)
    #for m in msgs:
    for doc_id in range(0, len(list_of_toks_lists)):
        term_to_count = defaultdict(int)
        for tok in list_of_toks_lists[doc_id]:
            term_to_count[tok] += 1
        for t, cnt in term_to_count.iteritems():
            index[t].append((doc_id, cnt))
    return index    

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# input: list of dictionaries, where each dict is a joke.  The dict must have 'text' as a key. 
# include_title and include_text are boolean flags -- important because need to determine whether we want to include 
# just title for the text of the joke, or just the actual post, or both. (obviously at least one of the two needs to be 
# True)
def build_tfidf(list_of_jokes, include_title, include_post, n_feats, min_df = 10, max_df = 0.8):
    tfidf_vec = TfidfVectorizer(input='content', decode_error=u'ignore', strip_accents=u'unicode',
                                analyzer=u'word',max_features=n_feats,stop_words='english',
                                norm=u'l2',min_df=min_df,max_df=max_df,lowercase=True,vocabulary=None)
    data = [(dt['title'] + ' ' + dt['selftext'] if include_post else dt['title']) if include_title else 
            (dt['selftext'] if include_post else '') for dt in list_of_jokes]
    doc_by_vocab_sparse = tfidf_vec.fit_transform(data)
    # doc_by_vocab = doc_by_vocab_sparse.toarray()   # <-- Need?

    # Construct a inverted map from feature index to feature value (word) for later use
    index_to_vocab = {i:v for i, v in enumerate(tfidf_vec.get_feature_names())}
    
    # return sparse tfidf matrix, and mapping showing the word's index to the word itself
    return (doc_by_vocab_sparse, index_to_vocab)

In [11]:
# Not used yet.
def filter_title_post(list_of_jokes):
    return [(dt['title'] + ' ' + dt['selftext'] if include_post else dt['title']) if include_title else 
            (dt['selftext'] if include_post else '') for dt in list_of_jokes]


In [12]:
#TESTING
j = jokes
# print get_inverted_index(j, False, True)

In [36]:
get_inverted_index(j, True, True)['black']

[(47, 1),
 (64, 1),
 (85, 1),
 (208, 2),
 (260, 1),
 (316, 1),
 (339, 1),
 (400, 1),
 (475, 3),
 (479, 1),
 (504, 1),
 (507, 1),
 (611, 2),
 (647, 1),
 (682, 1),
 (834, 1),
 (881, 1),
 (891, 1),
 (898, 1),
 (901, 1),
 (910, 1),
 (966, 1),
 (971, 1),
 (972, 1),
 (979, 1),
 (981, 1),
 (986, 1),
 (1008, 1),
 (1039, 1),
 (1079, 1),
 (1224, 2),
 (1284, 1),
 (1290, 1),
 (1397, 1),
 (1421, 1),
 (1450, 1),
 (1488, 1),
 (1505, 1),
 (1520, 2),
 (1523, 1),
 (1540, 2),
 (1647, 1),
 (1776, 1),
 (1794, 1),
 (1800, 1),
 (1811, 1),
 (1911, 2),
 (1923, 1),
 (1928, 1),
 (1957, 1),
 (1985, 1),
 (2098, 1),
 (2158, 1),
 (2205, 1),
 (2210, 1),
 (2283, 2),
 (2320, 1),
 (2337, 1),
 (2389, 1),
 (2440, 1),
 (2504, 2),
 (2701, 1),
 (2726, 1),
 (2798, 1),
 (2807, 2),
 (2952, 1),
 (3029, 1),
 (3032, 1),
 (3057, 1),
 (3097, 1),
 (3146, 1),
 (3192, 1),
 (3333, 1),
 (3340, 1),
 (3450, 1),
 (3466, 2),
 (3475, 1),
 (3519, 1),
 (3552, 1),
 (3697, 1),
 (3699, 2),
 (3806, 1),
 (3866, 3),
 (3887, 1),
 (4040, 1),
 (4125, 1)

In [80]:
# Precompute and save all of this information
n_feats = 5000
# j = [{'id':1, u'title': u'this is a title', u'selftext': u'punchline_e'},{'id':2,u'title': u'this is a title2', u'selftext': u'punchline_d'},
#      {'id':3, u'title': u'this is a title3', u'selftext': u'punchline_c'},
#      {'id':4, u'title': u'this is a title4', u'selftext': u'punchline_b'},{'id':5, u'title': u'this is a title5', u'selftext': u'punchline_a'}]

j = jokes

#here, we will assign an index for each joke id. This index will help us access data in numpy matrices.
joke_id_to_index = {joke_id:index for index, joke_id in enumerate([d['id'] for d in j])}

#we will also need a dictionary mapping joke titles to joke ids
joke_title_to_id = {name:jid for name, jid in zip([d['title'] for d in j],
                                                     [d['id'] for d in j])}
joke_id_to_title = {v:k for k,v in joke_title_to_id.iteritems()}

#and because it might be useful...
joke_title_to_index = {title:joke_id_to_index[joke_title_to_id[title]] for title in [d['title'] for d in j]}
joke_index_to_title = {v:k for k,v in joke_title_to_index.iteritems()}

# maps joke id to the dictionary representing the joke post
joke_id_to_joke = {joke_id:joke for joke_id, joke in [(d['id'], d) for d in j]}


#printing
# print joke_id_to_index
# print joke_title_to_id
# print joke_id_to_title
# print joke_title_to_index
# print joke_index_to_title

# I think order of rows in tfidf is same as order of docs in the list j.
# I think order of cols in tfidf is same as order of what get_feature_names, which I think is in alpha order.
tfidf, feat_names = build_tfidf(j, True, True, n_feats, min_df = 0, max_df = 1)
# List of tuples. 0th entry in tuple is the joke_id
inv_idx = get_inverted_index(j, True, True)

print(tfidf, feat_names)


# sims, like tfidf, is a sparse (CSR) matrix
tfidf_t = tfidf.transpose()
# print tfidf.shape
# print tfidf_t.shape

row1 = tfidf.getrow(0)
row2 =  tfidf.getrow(1)

# sims is sparse
sims = tfidf * tfidf_t


#print a[0].toarray()
# print a[1]
# print '\n'
# b =  build_tfidf(j, True, False, n_feats, min_df = 0, max_df = 1)
# print b[0].toarray()
# print b[1]
# print '\n'
# c = build_tfidf(j, False, True, n_feats, min_df = 0, max_df = 1)
# print c[0].toarray()
# print c[1]
# print '\n'
# Causes runtime error b/c there is clearly no text to consider at all.
#print build_tfidf(j, False, False, n_feats, min_df = 0, max_df = 1) 

(<25000x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 5000 stored elements in Compressed Sparse Row format>, {0: u'00000000', 1: u'00100111', 2: u'00101110', 3: u'00101111', 4: u'01010001', 5: u'01100010', 6: u'01100110', 7: u'01110000', 8: u'01110101', 9: u'01110111', 10: u'01111001', 11: u'112lbs', 12: u'1216', 13: u'1217', 14: u'132', 15: u'156', 16: u'1922', 17: u'1fuckingprettyrose', 18: u'217', 19: u'25years', 20: u'274', 21: u'2m', 22: u'2n', 23: u'3500', 24: u'3m', 25: u'4563', 26: u'486', 27: u'5ft', 28: u'5l', 29: u'7182', 30: u'71828', 31: u'96fm', 32: u'985', 33: u'__', 34: u'____', 35: u'____________________________________________________________________', 36: u'aaaaaaa', 37: u'abbey', 38: u'abe', 39: u'abhijeet', 40: u'abstain', 41: u'adders', 42: u'adel', 43: u'adjutant', 44: u'admin', 45: u'adventist', 46: u'affer', 47: u'afuka', 48: u'afulabi', 49: u'aguilera', 50: u'ahmad', 51: u'airhorn', 52: u'alexis', 53: u'aliquam', 54: u'alloy', 55: u'amet', 56: u'am

In [81]:

def get_sim(title1, title2, sims_mat):
    """
    Arguments:
        title1: The title of the first joke we are looking for.
        title2: The title of the second joke we are looking for.
        sims_mat: calculated as XX^T, where x is doc-by-vocab sparse matrix.
    Returns:
        similarity: Cosine similarity of the two movie transcripts.
    """
    #Code completion 1.2
    
    # Cannot do a simple np.dot on vectors in sparse matrix.
    #return np.dot(doc_by_vocab[movie_name_to_index[mov1],:],
                      #doc_by_vocab[movie_name_to_index[mov2],:])
    
    idx1 = joke_title_to_index[title1]
    idx2 = joke_title_to_index[title2]
    return sims_mat[idx1, idx2]



In [82]:
get_sim('What are minorities?', 'I\'m Trying to Remember The Name of A Song', sims)

0.0

In [83]:
# I wanted to buy an Audi.', 15689: u"I'm Trying to Remember The Name of A Song", 15690: u'What are minorities?', 15691: u'Did you hear that Donald Trump is technically a plant?', 15692: u'Yo mama is so ugly, when she was born the doctor wrapped the afterbirth in a blanket and threw her in the trash.', 15693: u'i had trouble swallowing a viagra last night', 15694: u'What 

In [84]:
print sims[12,14]

0.0


In [85]:
print sims.nonzero()

(array([   11,    13,    18, ..., 24980, 24981, 24993], dtype=int32), array([   11,    13,    18, ..., 24980, 24981, 24993], dtype=int32))


In [86]:
print len(sims.nonzero()[0])

3375


In [87]:
nonzero_elems = zip(sims.nonzero()[0],sims.nonzero()[1])

In [88]:
# print nonzero_elems

for elem in nonzero_elems:
    if elem[0] != elem[1]:
        print '**'

In [89]:




get_sim(joke_index_to_title[208], joke_index_to_title[611], sims)

0.0

In [90]:
print tfidf.nonzero()

(array([   11,    13,    13, ..., 24981, 24981, 24993], dtype=int32), array([4071, 3401, 4649, ..., 3214,  921, 3971], dtype=int32))


In [91]:
print tfidf

  (11, 4071)	1.0
  (13, 3401)	0.534522483825
  (13, 4649)	0.801783725737
  (13, 2165)	0.267261241912
  (18, 3229)	0.707106781187
  (18, 4708)	0.707106781187
  (35, 4730)	1.0
  (40, 3418)	0.99503719021
  (40, 4258)	0.099503719021
  (43, 4639)	1.0
  (55, 837)	1.0
  (56, 1021)	1.0
  (58, 3681)	1.0
  (60, 3072)	0.707106781187
  (60, 1162)	0.707106781187
  (87, 1359)	1.0
  (91, 2610)	1.0
  (92, 1111)	1.0
  (93, 1504)	0.707106781187
  (93, 1552)	0.707106781187
  (95, 3967)	1.0
  (105, 703)	1.0
  (108, 816)	1.0
  (178, 3230)	0.707106781187
  (178, 1129)	0.707106781187
  :	:
  (24866, 3156)	1.0
  (24873, 2363)	1.0
  (24874, 252)	1.0
  (24876, 3212)	1.0
  (24880, 2341)	1.0
  (24882, 2790)	1.0
  (24883, 1596)	1.0
  (24891, 3429)	1.0
  (24896, 1995)	1.0
  (24900, 1417)	1.0
  (24909, 2832)	1.0
  (24917, 688)	1.0
  (24932, 1756)	1.0
  (24933, 1646)	1.0
  (24952, 4591)	1.0
  (24956, 1367)	1.0
  (24965, 1652)	1.0
  (24967, 941)	1.0
  (24971, 2420)	1.0
  (24980, 337)	0.894427191
  (24980, 1415)	0.4472

In [92]:
joke_index_to_title[40]

u'Julia was organizing a cat show'

In [93]:
joke_index_to_title[3418]

u"What's the difference between toilet paper and finding someone attractive on Tinder?"

In [94]:
print type(tfidf)

<class 'scipy.sparse.csr.csr_matrix'>


In [95]:
def inv_idx_cats(cats, list_jokes):
    cat_idx = defaultdict(list)
    for cat in cats:
        if cat == 'nsfw':
            cat_idx[cat] = [joke['id'] for joke in list_jokes if (joke['over_18'] or joke['domain'].lower() == 'self.meanjokes' or joke['domain'].lower() == 'self.dirtyjokes')]
    return cat_idx

In [96]:
inv_idx_cats(['nsfw'], [j[1]])

defaultdict(list, {'nsfw': []})

In [97]:
j[1]

{u'created_utc': 1488987334,
 u'domain': u'self.Jokes',
 u'id': u'5y8jyn',
 u'over_18': False,
 u'selftext': u'I told her that she smells different than she used to.',
 u'title': u'My friend got plastic surgery on her nose...'}

In [98]:
# Artifically make true for testing purposes:
j[19]['over_18'] = True
print j[19]

{u'domain': u'self.Jokes', u'title': u"What's the fastest thing in the universe?", u'created_utc': 1488985374, u'selftext': u'A woman running away from a mouse', u'over_18': True, u'id': u'5y8d7p'}


In [99]:
inv_idx_cats(['nsfw'], [j[1], j[19]])

defaultdict(list, {'nsfw': [u'5y8d7p']})

In [100]:
inv_idx_cats(['nsfw'], j)

defaultdict(list,
            {'nsfw': [u'5y8d7p',
              u'5y7bx1',
              u'5y79z5',
              u'5y5p1z',
              u'5y48hp',
              u'5y3aem',
              u'5y03w4',
              u'5xzaua',
              u'5xx6q4',
              u'5xwv6h',
              u'5xwjl7',
              u'5xtv66',
              u'5xsg1b',
              u'5xqzl2',
              u'5xqp3v',
              u'5xpuek',
              u'5xpc3v',
              u'5xp43n',
              u'5xnvxl',
              u'5xn9ty',
              u'5xl2zr',
              u'5xhyhw',
              u'5xhdyq',
              u'5xepdi',
              u'5xbpz3',
              u'5xbfzx',
              u'5xb0xq',
              u'5xahaz',
              u'5x969i',
              u'5x8ymr',
              u'5x8pfn',
              u'5x578m',
              u'5x3z0n',
              u'5x3whw',
              u'5x3flt',
              u'5x3ezb',
              u'5x2wwn',
              u'5x2wbh',
              u'5x2uuk',

In [101]:
print 'nsfw joke count: '
print len(inv_idx_cats(['nsfw'], j)['nsfw'])
print 'total joke count: '
print len(j)

nsfw joke count: 
221
total joke count: 
25000


In [102]:
joke_id_to_title['5u9lex']

u'NSFW Found an old picture of me and my beautiful girlfriend doing it'

In [104]:
joke_id_to_joke['5u9lex']

{u'created_utc': 1487186346,
 u'domain': u'self.Jokes',
 u'id': u'5u9lex',
 u'over_18': True,
 u'selftext': u'It was a comely photo',
 u'title': u'NSFW Found an old picture of me and my beautiful girlfriend doing it'}

In [105]:
joke_id_to_index['5u9lex']

14437

In [106]:
j[14437]

{u'created_utc': 1487186346,
 u'domain': u'self.Jokes',
 u'id': u'5u9lex',
 u'over_18': True,
 u'selftext': u'It was a comely photo',
 u'title': u'NSFW Found an old picture of me and my beautiful girlfriend doing it'}

In [114]:
j[135]

{u'created_utc': 1488966579,
 u'domain': u'self.Jokes',
 u'id': u'5y72ed',
 u'over_18': False,
 u'selftext': u"More like National Dishwasher's Day",
 u'title': u"National Women's Day......"}

In [115]:
def get(json_input):
    query = json.load(json_input)
    # IR stuff    
    
    json_output = [None for i in xrange(len(ranked_list))]
    i = 0
    # assume entries in ranked_list are just joke ids.
    
    for jid in ranked_list:
        r_joke = joke_id_to_joke[jid]
        tmp_dict = {'title':r_joke['title'], 'selftext': r_joke['selftext']}
        json_output[i] = json.dumps(tmp_dict)
        i += 1
    return json_output

In [116]:
get(None)

['{"selftext": "It was a comely photo", "title": "NSFW Found an old picture of me and my beautiful girlfriend doing it"}',
 '{"selftext": "More like National Dishwasher\'s Day", "title": "National Women\'s Day......"}']