## Recommender System - Marketplace Matching

In this notebook, we will: 
- Clean textual data from user-input verbatim posts
- Use a Word2Vec model to calculate document similarities 
- Sort the most similar user input to our training data in order to recommend similar products 
- Save this model in a format that allows us to refresh the testing data

The goal of this project is to create a recommender system to help Pangeans find the right "project" for them, given their profile information. Here, we are using legacy data that is from Pangea V2, when Pangeans were allowed to post services and requests, as well as purchase items on the platform. We are using the User-Inputted Titles to suggest similar services, or in V3, similar "projects". 

In [1]:
#Importing Libraries
import numpy as np
import pandas as pd

import sys
from pandas import DataFrame

import json
from pandas.io.json import json_normalize
import csv

import matplotlib.pyplot as plt
%matplotlib inline 
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from gensim import corpora
from collections import defaultdict
from pprint import pprint

from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

import re
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords') ###

import gensim 
from gensim.models import KeyedVectors
import gensim.downloader as api

from operator import itemgetter, attrgetter

from gensim.models.doc2vec import Doc2Vec

import os, sys

from operator import add

from sklearn.preprocessing import normalize

import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

import pickle

import json
from pprint import pprint
from pandas.io.json import json_normalize

import operator

import csv

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremieharris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

In [3]:
def vectorize_and_store_existing_titles():
    
    raw = pd.read_csv("allPostData.csv", header=0);
    
    #we can replace this with a filepath in the future
    titles = raw['title'];
    post_titles = [title for title in titles];
    post_titles = set(post_titles);
    
    tokens = [[word for word in title.lower().split()] for title in post_titles];
    
    clean_words = [[word for word in title if word.isalpha()] for title in tokens];
    stoplist = set(stopwords.words('english'));
    
    titles_nostopwords = [[word for word in title if word not in stoplist] for title in clean_words];   
    #print(len(titles_nostopwords))
    
    filtered_word_list = [[word for word in title if word in model.vocab] for title in titles_nostopwords];
    #print(len(filtered_word_list))
    
    dictionary = dict(zip(post_titles, filtered_word_list))
    
    #title_vectors = {}
    
    #print(len(dictionary.keys()))
    
    #print(len(set(titles)))
    #print(len(set(post_titles)))
    #dupe reqs 
    
    vectorized_titles = pd.DataFrame(columns=["Titles", "Vectors"])
    #print(vectorized_titles)
    
    
    
    for title in post_titles: 
        word_vecs = [model[word] for word in dictionary[title]]
        #print(len(word_vecs))
        if len(word_vecs) == 0:
            title_vec = [np.zeros(300)]
        else: 
            title_vec = normalize(sum(word_vecs).reshape(1, -1))
            #print(title_vec)
       
        vectorized_titles = vectorized_titles.append({'Titles': title, 'Vectors': title_vec}, ignore_index=True)
       
    
    
    #dictionary = dict(zip(post_titles, filtered_word_list))

    #can also replace filepath in the future
    #titles.to_pickle("/Users/angelateng/Dropbox/AMBER/SharpestMinds/raw_titles.pkl")
    vectorized_titles.to_pickle("./vectorized_titles.pkl")
    #print(vectorized_titles)
    #print(title_vec)
    #print(word_vecs)
    return(vectorized_titles)

#vectorize_and_store_existing_titles()


In [4]:
#pd.read_pickle("/Users/angelateng/Dropbox/AMBER/SharpestMinds/vectorized_titles.pkl")
#sanity check
vectorize_and_store_existing_titles()

Unnamed: 0,Titles,Vectors
0,Removal of Waterbed,"[[-0.050746784, 0.11636762, 0.0019412832, 0.04..."
1,Need Fabrics,"[[0.036967564, 0.024140665, -0.027652793, 0.04..."
2,Logo design,"[[-0.022615831, 0.023761567, -0.00044833144, 0..."
3,Tomato Soup Recipe,"[[-0.08257644, 0.059555326, 0.034466546, 0.126..."
4,Fantasy Baseball Advising,"[[0.031859405, 0.038249034, 0.069930956, -0.02..."
5,Social Media Marketing Consultation,"[[0.010197233, -0.01704501, -0.08429465, -0.03..."
6,SAS Advice!,"[[-0.05032597, -0.07867554, -0.038458712, 0.01..."
7,Smash 64 Practice,"[[0.062727526, 0.018316587, -0.04743604, -0.00..."
8,SAS / SQL Tutor,"[[0.025888918, -0.05742222, -0.053885072, 0.06..."
9,Breakdance sessions,"[[-0.004240162, 0.031446796, -0.0024745017, 0...."


In [5]:
#can change this directory later


#test post 
with open('firstPost.json') as fresh_data:
    user_post = json.load(fresh_data)

#user_post = json.loads('firstPost.json')
#print(user_post)

#should only take in a single post at a time 

def vectorize_new_title(user_post):
    json_df = pd.DataFrame.from_dict(json_normalize(user_post), orient='columns')
    #print(json_df)
    
    title = json_df["title"][0] #--> added [0] because json_df["title"] was still in pd.Series form
    
    #json_post_titles = [title in title];
    #json_post_titles = [title for title in title]; --> removed because we only have one title
    json_tokens = [word for word in title.lower().split()]# for title in json_post_titles]; --> removed because we only have one title
    #json_post_titles = set(json_post_titles)
    #print(json_post_titles)
    
    
    
    #json_clean_words = [[word for word in title if word.isalpha()] for title in json_post_titles];
    json_clean_words = [word for word in json_tokens if word.isalpha()]# for title in json_tokens]; --> removed because we only have one title
    #print(json_clean_words)
    
    stoplist = set(stopwords.words('english'));
    #json_titles_nostopwords = [[word for word in title if word not in stoplist] for title in json_clean_words] --> removed and updated below (because only have one title)
    json_titles_nostopwords = [word for word in json_clean_words if word not in stoplist]   

    
    #json_filtered_word_list = [[word for word in title if word in model.vocab] for title in json_titles_nostopwords]; --> same as above
    json_preprocessed = [word for word in json_titles_nostopwords if word in model.vocab]   

    #print(json_filtered_word_list)
        
    json_title_vectors = {}
    
    json_vectorized_title_df = pd.DataFrame(columns=["Titles", "Vectors"]) #--> updated because only one title
    #print(json_vectorized_titles)
    
    # --> we don't need a loop over all titles now, since there's just one:
    json_word_vecs = [model[word] for word in json_preprocessed]
    if len(json_preprocessed) == 0:
            json_title_vec = [np.zeros(300)]
    else: 
        json_title_vec = normalize(sum(json_word_vecs).reshape(1, -1))


    json_vectorized_title_df = json_vectorized_title_df.append({'Titles': title, 'Vectors': json_title_vec}, ignore_index=True)
    return(json_vectorized_title_df)

#vectorize_new_title(json_df)
#why is it a dictionary in a dictionary?? double brackets

In [6]:
vectorized_title = vectorize_new_title(user_post)
#print(vectorized_title)

def rank_existing_titles(vectorized_title):
    #loop over all keys in dict 
    ranked_titles = {}
    #other_titles = pd.read_pickle("/Users/angelateng/Dropbox/SharpestMinds/raw_titles.pkl")
    other_titles = pd.read_pickle("./vectorized_titles.pkl")

    #can also use title_vectors.keys() 
    for index,row in other_titles.iterrows():
        #print('vectorized_title', vectorized_title['Vectors'][0])
        #print('other_titles', row['Vectors'])


        ranked_titles[row['Titles']] = sum(row['Vectors'][0]*vectorized_title['Vectors'][0][0]) # --> did the dot product using sum() and * because np.dot was behaving weirdly for some reason. Now it seems to work!
        #print(ranked_titles[row['Titles']])
        
    sorted_title_vecs = sorted(ranked_titles.items(), key=operator.itemgetter(1), reverse=True)
    return(sorted_title_vecs)

#rank_existing_titles(vectorized_title[:10])
#word querying

#should be cleaned titles x cleaned til

In [7]:
rank_existing_titles(vectorized_title)

[('Teach me how to cook quick meals!', 0.9070961982678227),
 ('Teach Me How To Cook!', 0.7957720306183091),
 ('Teach how to prepare steaks ', 0.7433686636395578),
 ('Private Cook ', 0.7177446654708319),
 ('Private cook', 0.7177446654708319),
 ('Cooking Lessons ', 0.6707953416986925),
 ('Cooking Lessons', 0.6707953416986925),
 ('Cooking lessons', 0.6707953416986925),
 ('Cooking', 0.6634962084033305),
 ('Teach me how to play piano', 0.6475645409999515),
 ('Teach how to make tea', 0.6299678694610975),
 ('Culinary tutor', 0.5939296423903215),
 ('Baking Lessons ', 0.5822712795805955),
 ('Help me learn Pop/Stars dance', 0.5706981944497329),
 ('Mediterranean Cooking Class', 0.5702011714132595),
 ('Teach you how to shuffle', 0.5699134945516562),
 ('Learn To Juggle!', 0.5561230392829657),
 ('Teach Me How To Solve Rubiks Cube', 0.55600204987811),
 ('Brazilian Cooking Class', 0.5491360732714838),
 ('Chef Prepared Meal Prep', 0.5327085255917154),
 ('Teach me meme culture', 0.5040128903044376),
 ('

In [8]:
def generate_recommendations(user_post):
    title = user_post["title"]
    #print(title)
    vectorized_title = vectorize_new_title(user_post)
    #print(vectorized_title)
    ranked_titles = rank_existing_titles(vectorized_title)
    #ranked_titles = rank_existing_titles(title, vectorized_title)
    #print(ranked_titles)
    
    
    with open("./ranked_titles.csv", "w", newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for title in ranked_titles:
            wr.writerow([ranked_titles, title])
    #csv_out = ranked_titles.to_csv('ranked_titles', encoding='utf-8', index=False)
    #writer = csv.writer(ranked_titles, delimiter='', quotechar='"', quoting=csv.QUOTE_ALL)
 
    #for row in reader:
        #writer.writerow(row)

       # with open('csv_out', 'wb') as csv_out:
         #   csv_writer = csv.writer(csv_out, quoting=csv.QUOTE_ALL)
         #   csv_writer.writerow('csv_out')
    return(ranked_titles)
    
    

#vectorize_and_store_existing_titles(); 
#vectorize_new_title(json_df);
#rank_existing_titles(vectorize_new_title);

In [9]:
#generate_recommendations(json_df)
pprint(generate_recommendations(user_post))

[('Teach me how to cook quick meals!', 0.9070961982678227),
 ('Teach Me How To Cook!', 0.7957720306183091),
 ('Teach how to prepare steaks ', 0.7433686636395578),
 ('Private Cook ', 0.7177446654708319),
 ('Private cook', 0.7177446654708319),
 ('Cooking Lessons ', 0.6707953416986925),
 ('Cooking Lessons', 0.6707953416986925),
 ('Cooking lessons', 0.6707953416986925),
 ('Cooking', 0.6634962084033305),
 ('Teach me how to play piano', 0.6475645409999515),
 ('Teach how to make tea', 0.6299678694610975),
 ('Culinary tutor', 0.5939296423903215),
 ('Baking Lessons ', 0.5822712795805955),
 ('Help me learn Pop/Stars dance', 0.5706981944497329),
 ('Mediterranean Cooking Class', 0.5702011714132595),
 ('Teach you how to shuffle', 0.5699134945516562),
 ('Learn To Juggle!', 0.5561230392829657),
 ('Teach Me How To Solve Rubiks Cube', 0.55600204987811),
 ('Brazilian Cooking Class', 0.5491360732714838),
 ('Chef Prepared Meal Prep', 0.5327085255917154),
 ('Teach me meme culture', 0.5040128903044376),
 ('