## Recommender System - Marketplace Matching

In this notebook, we will: 
- Clean textual data from user-input verbatim posts
- Use a Word2Vec model to calculate document similarities 
- Sort the most similar user input to our training data in order to recommend similar products 
- Save this model in a format that allows us to refresh the testing data

The goal of this project is to create a recommender system to help Pangeans find the right "project" for them, given their profile information. Here, we are using legacy data that is from Pangea V2, when Pangeans were allowed to post services and requests, as well as purchase items on the platform. We are using the User-Inputted Titles to suggest similar services, or in V3, similar "projects". 

In [1]:
'''Importing Libraries'''
import numpy as np
import pandas as pd

import sys
from pandas import DataFrame

import json
from pandas.io.json import json_normalize


import matplotlib.pyplot as plt
%matplotlib inline 
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from gensim import corpora
from collections import defaultdict
from pprint import pprint

from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

import re
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords') ###
from nltk.tokenize import RegexpTokenizer

import gensim 
from gensim.models import KeyedVectors
import gensim.downloader as api

from operator import itemgetter, attrgetter

from gensim.models.doc2vec import Doc2Vec

import os, sys

from operator import add

from sklearn.preprocessing import normalize

import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from adjustText import adjust_text

from sklearn.manifold import TSNE

import pickle

import json
from pprint import pprint
from pandas.io.json import json_normalize

import operator

import csv

import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angelateng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
'''Loading the model'''
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

In [4]:
'''Vectorize and store existing titles in legacy Pangea database'''
def vectorize_and_store_existing_titles(): 
    raw = pd.read_csv("allPostData.csv", header=0);
    titles = raw['title'];
    post_titles = [title for title in titles];
    post_titles = set(post_titles); 
    tokens = [[word for word in title.lower().split()] for title in post_titles];
    clean_words = [[word.translate(str.maketrans('', '', string.punctuation)) for word in title] for title in tokens]
    #print(clean_words)
    #clean_words = [[word for word in title if word.isalpha()] for title in tokens];
    #tokenizer = RegexpTokenizer(r'\w+')
    
    stoplist = set(stopwords.words('english'));
    titles_nostopwords = [[word for word in title if word not in stoplist] for title in clean_words];   
    filtered_word_list = [[word for word in title if word in model.vocab] for title in titles_nostopwords];
    dictionary = dict(zip(post_titles, filtered_word_list))
    vectorized_titles = pd.DataFrame(columns=["Titles", "Vectors"])
    for title in post_titles: 
        word_vecs = [model[word] for word in dictionary[title]]
        if len(word_vecs) == 0:
            title_vec = [np.zeros(300)]
        else: 
            title_vec = normalize(sum(word_vecs).reshape(1, -1))
        vectorized_titles = vectorized_titles.append({'Titles': title, 'Vectors': title_vec}, ignore_index=True)
    vectorized_titles.to_pickle("/Users/angelateng/Google_Drive/SharpestMinds_dropbox/SharpestMinds/vectorized_titles.pkl")
    return(vectorized_titles)


In [5]:
'''Print a dataframe to show how these titles and their vectors will be saved'''
'''Also note that we are saving the df with the original raw (not cleaned) titles'''
vectorize_and_store_existing_titles()

Unnamed: 0,Titles,Vectors
0,Spiritual Talks,"[[0.10162811, 0.051215217, 0.074616425, -0.027..."
1,Design The Coolest Debit Card For Kids/Teens,"[[-0.06146582, 0.02307342, 0.050308917, 0.1127..."
2,App Development Lessons,"[[0.024453294, -0.02666793, -0.011280811, -0.0..."
3,Campus Ambassador,"[[0.009185634, 0.018454658, 0.1293173, 0.00410..."
4,Remove Car Stereo,"[[0.006371396, 0.02385082, -0.0422105, -0.0206..."
5,Matcha buddy,"[[0.005861206, -0.031119319, 0.058747493, 0.13..."
6,Fujifilm Instax Mini Camera 25,"[[-0.023258874, 0.034682952, 0.015668174, 0.02..."
7,Melee Tutoring,"[[0.11666875, 0.05078485, 0.022796279, 0.00778..."
8,Overwatch coach,"[[-0.027847748, -0.020765299, -0.0050430014, -..."
9,Housekeeping Wanted,"[[0.0006252811, 0.08929014, 0.002251012, -0.06..."


In [6]:
'''Read a JSON file when a new user pings with a request'''
with open('firstPost.json') as fresh_data:
    user_post = json.load(fresh_data)

'''Vectorize each new title as a user/student/company creates a new post'''
def vectorize_new_title(user_post):
    #ranked_titles_load = pd.read_csv("./ranked_titles.csv")
    json_df = pd.DataFrame.from_dict(json_normalize(user_post), orient='columns')
    title = json_df["title"][0] #--> added [0] because json_df["title"] was still in pd.Series form
    json_tokens = [word for word in title.lower().split()]
    #json_clean_words = [[word for word in title.lower().split()] for title in post_titles];
    json_clean_words = [word.translate(str.maketrans('', '', string.punctuation)) for word in json_tokens]
    #print(json_clean_words)
    
    stoplist = set(stopwords.words('english'));
    json_titles_nostopwords = [word for word in json_clean_words if word not in stoplist]   
    json_preprocessed = [word for word in json_titles_nostopwords if word in model.vocab]   
    json_title_vectors = {}
    json_vectorized_title_df = pd.DataFrame(columns=["Titles", "Vectors"])
    json_word_vecs = [model[word] for word in json_preprocessed]
    if len(json_preprocessed) == 0:
            json_title_vec = [np.zeros(300)]
    else: 
        json_title_vec = normalize(sum(json_word_vecs).reshape(1, -1))
    json_vectorized_title_df = json_vectorized_title_df.append({'Titles': title, 'Vectors': json_title_vec}, ignore_index = True)
    #export_csv = json_vectorized_title_df.to_csv (r'/Users/angelateng/Dropbox/AMBER/SharpestMinds/ranked_titles.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path
    if not os.path.isfile('/Users/angelateng/Google_Drive/SharpestMinds_dropbox/SharpestMinds/ranked_titles.csv'):
        json_vectorized_title_df.to_csv (r'/Users/angelateng/Google_Drive/SharpestMinds_dropbox/SharpestMinds/ranked_titles.csv', index = None, header=True)
    else:
        json_vectorized_title_df.to_csv (r'/Users/angelateng/Google_Drive/SharpestMinds_dropbox/SharpestMinds/ranked_titles.csv', mode='a', index = None, header=False)
    
    return(json_vectorized_title_df)

In [10]:
'''Append new user-post title to csv'''
vectorized_title = vectorize_new_title(user_post)
print(vectorized_title)

                  Titles                                            Vectors
0  Teach Me How To Cook!  [[-0.022974137, 0.010501054, 0.05975248, 0.041...


In [11]:
print(user_post)

{'address': {'components': {'city': 'Providence', 'country': 'US', 'county': 'Providence County', 'number': '158', 'state': 'RI', 'street': 'University Ave', 'zip_code': '02906'}, 'formatted': '158 University Ave, Providence, RI 02906, USA'}, 'archived_status': 'live', 'category': 'Cooking', 'content_type': 'service', 'created_at': {'$reql_type$': 'TIME', 'epoch_time': 1505340466, 'timezone': '+00:00'}, 'description': "I'm trying to broaden the repertoire of culinary dishes I can make. Please help! I'm surviving on chicken and pasta rn... I'll buy the ingredients.", 'id': '-Kty6gXaY-_wgIheV3LS', 'last_modified': '2018-09-07T23:08:39.964Z', 'location': {'$reql_type$': 'GEOMETRY', 'coordinates': [-71.3877064, 41.8342705], 'type': 'Point'}, 'owner_id': 'RHCWoC5WnSTXi0JWJadv18ejPwy1', 'payment_type': 'fixed', 'photos': [], 'post_type': 'request', 'price': 25, 'price_type': 'flexible', 'status': 'legacy', 'title': 'Teach Me How To Cook!', 'visibility': 'public'}


In [63]:
'''Load the current titles in the Pangea database, and then rank them by similarity to the latest user query'''
def rank_existing_titles(vectorized_title):
    ranked_titles = {}
    other_titles = pd.read_pickle("./vectorized_titles.pkl")
    #print(other_titles)
    for index,row in other_titles.iterrows():
        ranked_titles[row['Titles']] = sum(row['Vectors'][0]*vectorized_title['Vectors'][0][0]) # --> did the dot product using sum() and * because np.dot was behaving weirdly for some reason. Now it seems to work! 
    sorted_title_vecs = sorted(ranked_titles.items(), key=operator.itemgetter(1), reverse=True)
    return(sorted_title_vecs)

#should this also be named ranked titles? or should we rename this line? since we're 
#already loading them in the cell above ??


In [64]:
rank_existing_titles(vectorized_title)

[('Teach Me How To Cook!', 1.0000000256776254),
 ('Teach me how to cook quick meals!', 0.8501101096892398),
 ('Teach how to prepare steaks ', 0.7433686636395578),
 ('Private cook', 0.7177446654708319),
 ('Private Cook ', 0.7177446654708319),
 ('Cooking lessons', 0.6707953416986925),
 ('Cooking Lessons', 0.6707953416986925),
 ('Cooking Lessons ', 0.6707953416986925),
 ('Cooking', 0.6634962084033305),
 ('Teach me how to play piano', 0.6475645409999515),
 ('Teach how to make tea', 0.6299678694610975),
 ('Culinary tutor', 0.5939296423903215),
 ('Baking Lessons ', 0.5822712795805955),
 ('Mediterranean Cooking Class', 0.5702011714132595),
 ('Teach you how to shuffle', 0.5699134945516562),
 ('Teach Me How To Solve Rubiks Cube', 0.55600204987811),
 ('Brazilian Cooking Class', 0.5491360732714838),
 ('Learn To Juggle!', 0.5355930420098503),
 ('Chef Prepared Meal Prep', 0.5327085255917154),
 ('Help me learn Pop/Stars dance', 0.5109324808180418),
 ('Teach me meme culture', 0.5040128903044376),
 ('

In [70]:
'''Load the current titles in the Pangea database, and then rank them by similarity to the latest user query'''
def rank_existing_titles(vectorized_title):
    ranked_titles = {}
    other_titles = pd.read_pickle("./vectorized_titles.pkl")
    #print(other_titles)
    for index,row in other_titles.iterrows():
        ranked_titles[row['Titles']] = sum((row['Vectors'][0])*(vectorized_title.iloc[1, 0:].values)) 
    #sorted_title_vecs = sorted(ranked_titles.items(), key=operator.itemgetter(1), reverse=True)
    #sorted_title_vecs = sorted_title_vecs.all()
    #return(sorted_title_vecs.all())
    return(other_titles)

#should this also be named ranked titles? or should we rename this line? since we're 
#already loading them in the cell above ??
rank_existing_titles(vectorized_title)

IndexError: single positional indexer is out-of-bounds

In [43]:
#vectorized_title.loc[0]
vectorized_title.loc[0][1]
#vectorized_title.loc['Vector'][0][0]
#other_titles = pd.read_pickle("./vectorized_titles.pkl")
#type(other_titles['Vectors'][0])


array([[-2.29741372e-02,  1.05010541e-02,  5.97524792e-02,
         4.16098125e-02,  2.97776368e-02,  1.00179069e-01,
         6.07384928e-02, -4.60468754e-02, -8.77552852e-02,
        -3.14045623e-02,  5.07797450e-02, -1.04517536e-02,
         9.86014493e-05,  4.02170643e-02, -4.64412831e-02,
         1.07278377e-01, -2.32699420e-02,  2.42559556e-02,
        -2.31713406e-03, -5.24559692e-02,  9.70238224e-02,
         5.40335923e-02,  1.34492382e-01,  4.77724001e-02,
        -1.77482609e-02, -7.41482899e-02, -8.57370382e-04,
        -3.94405797e-02, -7.93741643e-02, -2.45517604e-02,
         5.91608696e-04,  3.44119072e-02, -7.96699673e-02,
        -3.74685489e-02, -2.57842783e-02, -1.47778923e-02,
         8.80264416e-02, -2.21113749e-02, -8.28252174e-03,
         1.87342754e-03,  9.33016185e-03, -3.51021141e-02,
         8.22336078e-02, -6.16259035e-03, -3.43626030e-02,
        -3.66797373e-02, -1.09250404e-01,  1.01806000e-02,
        -2.78425831e-02,  4.56524715e-02, -8.31210241e-0

In [11]:
'''Final function call API that puts together the prior 3 functions in a neat mega-function'''
def generate_recommendations(user_post):
    title = user_post["title"]
    vectorized_title = vectorize_new_title(user_post)
    #ranked_titles = rank_existing_titles(vectorized_title) 
    #pprint(ranked_titles)
    ranked_titles = pd.DataFrame(rank_existing_titles(vectorized_title), columns = ["Title", "Similarity Score"])
    with open("./ranked_titles.csv", "w", newline='') as myfile:
        wr = csv.writer(myfile,  delimiter=',', quoting=csv.QUOTE_ALL)
        for title in ranked_titles:
            #wr.writerow([ranked_titles, title])
            wr.writerow([ranked_titles])
    return(ranked_titles)

#should this still be ranked titles.csv???


In [12]:
#sanity check for final mega function
generate_recommendations(user_post)

Unnamed: 0,Title,Similarity Score
0,Teach Me How To Cook!,1.000000
1,Teach me how to cook quick meals!,0.850110
2,Teach how to prepare steaks,0.743369
3,Private Cook,0.717745
4,Private cook,0.717745
5,Cooking Lessons,0.670795
6,Cooking lessons,0.670795
7,Cooking Lessons,0.670795
8,Cooking,0.663496
9,Teach me how to play piano,0.647565


In [13]:
#sanity check
df = pd.DataFrame(generate_recommendations(user_post))
df.columns = ["Title", "Similarity Score"]
print(df)

                                 Title  Similarity Score
0                Teach Me How To Cook!          1.000000
1    Teach me how to cook quick meals!          0.850110
2         Teach how to prepare steaks           0.743369
3                        Private Cook           0.717745
4                         Private cook          0.717745
5                      Cooking Lessons          0.670795
6                      Cooking lessons          0.670795
7                     Cooking Lessons           0.670795
8                              Cooking          0.663496
9           Teach me how to play piano          0.647565
10               Teach how to make tea          0.629968
11                      Culinary tutor          0.593930
12                     Baking Lessons           0.582271
13         Mediterranean Cooking Class          0.570201
14            Teach you how to shuffle          0.569913
15   Teach Me How To Solve Rubiks Cube          0.556002
16             Brazilian Cookin

In [5]:
other_titles = pd.read_pickle("./vectorized_titles.pkl")

In [10]:
other_titles

Unnamed: 0,Titles,Vectors
0,Waterpolo Tutoring,"[[0.004673245, 0.06561235, 0.028039468, 0.0306..."
1,Eliminate Russell Carey & Replace Him With a R...,"[[-0.04523085, 0.047022626, 0.030563282, 0.070..."
2,Bosnian/Croatian Language Tutor,"[[0.01473858, -0.013819061, 0.02217355, 0.0680..."
3,Portrait Paintings,"[[0.078219034, 0.06146943, -0.020530464, 0.031..."
4,GRADUATION PHOTOS!,"[[-0.020564673, 0.12495787, -0.04615278, 0.043..."
5,Need Help In History,"[[0.034072954, 0.067304604, 0.025347782, 0.039..."
6,tennis stringing,"[[-0.07873873, 0.025344653, -0.0038067068, 0.0..."
7,Personal organizer,"[[0.09288177, -0.03567334, 0.038201373, -0.036..."
8,Cs tutor,"[[-0.026808849, 0.018503362, 0.04615327, 0.039..."
9,Rides to the Station,"[[-0.010272413, 0.016525187, 0.010942353, 0.02..."
