In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
#include libraries
import csv # to open/close/append CSV
import os # to check if file exists
import nltk #natural language toolkit
from datetime import datetime
import re
from collections import Counter, defaultdict 
from glob import glob
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')

In [14]:
user_rating_df = pd.read_csv('/Users/boris/Downloads/user_rating_dice.csv', sep=',', header='infer')
user_rating_df.head(5)

Unnamed: 0,uniq_id,eventType,user_id
0,fffe1767dbc1713944851a0a4f02ec5b,LIKE,101131
1,fffe1767dbc1713944851a0a4f02ec5b,LIKE,100110
2,fffe1767dbc1713944851a0a4f02ec5b,VIEW,100110
3,fffd6e0361c1aecb4d099f6465392a77,FOLLOW,101320
4,fffd6e0361c1aecb4d099f6465392a77,FOLLOW,100320


In [3]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT': 5.0,  
}

In [15]:
user_rating_df['eventStrength'] = user_rating_df['eventType'].apply(lambda x: event_type_strength[x])

In [16]:
user_rating_df.head(5)

Unnamed: 0,uniq_id,eventType,user_id,eventStrength
0,fffe1767dbc1713944851a0a4f02ec5b,LIKE,101131,2.0
1,fffe1767dbc1713944851a0a4f02ec5b,LIKE,100110,2.0
2,fffe1767dbc1713944851a0a4f02ec5b,VIEW,100110,1.0
3,fffd6e0361c1aecb4d099f6465392a77,FOLLOW,101320,3.0
4,fffd6e0361c1aecb4d099f6465392a77,FOLLOW,100320,3.0


In [17]:
user_rating_count_df = user_rating_df.groupby(['user_id', 'uniq_id']).size().groupby('user_id').size()
print('# users: %d' % len(user_rating_count_df))
user_with_enough_ratings_df = user_rating_count_df[user_rating_count_df >= 25].reset_index()[['user_id']]
print('# users with at least 25 ratings: %d' % len(user_with_enough_ratings_df))

# users: 1501
# users with at least 25 ratings: 1209


In [8]:
CSV_file_df = pd.read_csv('/Users/boris/Downloads/dice_com-job_us_sample.csv', sep=',', header='infer')

In [9]:
CSV_file2_df = CSV_file_df.apply(lambda x: x.astype(str).str.lower())

In [10]:
CSV_file2_df['jobid'] = CSV_file2_df['jobid'].map(lambda x: x.lstrip('dice id :'))

In [11]:
from nltk.corpus import stopwords

In [12]:
stop_words = set(stopwords.words('english'))
CSV_file2_df['Job_Description_Without_Stopwords'] = CSV_file2_df['jobdescription'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
CSV_file2_df['Job_Skills_Without_Stopwords'] = CSV_file2_df['skills'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


#regular expresion in wich we swap all special charachters with a blank and all numbers with a blank also
def clean_string(strings):
    result = []
    for value in strings:
        value = value.strip()
        value = re.sub('([!?\',*+.$-/])',' ', value)
        value = re.sub("\d+", "", value)
        result.append(value)
    return result

# creates list where each document is an element
CSV_file2_df['Job_Description_Without_Stopwords'] = clean_string(CSV_file2_df['Job_Description_Without_Stopwords'])
CSV_file2_df['Job_Skills_Without_Stopwords'] = clean_string(CSV_file2_df['Job_Skills_Without_Stopwords'])

In [None]:
#ovo gore sam mogao i posle

In [18]:
print('# of interactions: %d' % len(user_rating_df))
ratings_from_selected_users_df = user_rating.merge(user_with_enough_ratings_df, 
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')
print('# of interactions from users with at least 25 interactions: %d' % len(ratings_from_selected_users_df))

# of interactions: 66000
# of interactions from users with at least 25 interactions: 56472


In [20]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
rating_full_df = ratings_from_selected_users_df \
                    .groupby(['user_id', 'uniq_id'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item ratings: %d' % len(rating_full_df))
rating_full_df.head(10)

# of unique user/item ratings: 37631


Unnamed: 0,user_id,uniq_id,eventStrength
0,100000,0f350d3a20b61289bd882547210090b4,1.807355
1,100000,29517932e0cc69b7f93196b101ac55fb,3.169925
2,100000,2960929545141233ae4185317727842e,1.807355
3,100000,2b574988cc462c5f5f1d0cfe81db909d,2.321928
4,100000,361514035c6ea8b06d07a285548b4d7a,1.0
5,100000,397a3f1d0d9366ee899b523b36b2b800,1.807355
6,100000,3d4f800b32b1a0a5388bac60bbf27b8c,2.807355
7,100000,3d8b1489c2b2fbacc2b56f899f529185,1.807355
8,100000,3e4e15ab2e9f0543916bc84a82a66166,2.0
9,100000,3f50d0d0c793a0ab1470438ae89cb66a,2.807355


In [21]:
rating_train_df, rating_test_df = train_test_split(rating_full_df,
                                   stratify=rating_full_df['user_id'], 
                                   test_size=0.20,
                                   random_state=42)

print('# ratings on Train set: %d' % len(rating_train_df))
print('# ratings on Test set: %d' % len(rating_test_df))

# ratings on Train set: 30104
# ratings on Test set: 7527


In [22]:
#Indexing by user_id to speed up the searches during evaluation
rating_full_indexed_df = rating_full_df.set_index('user_id')
rating_train_indexed_df = rating_train_df.set_index('user_id')
rating_test_indexed_df = rating_test_df.set_index('user_id')

In [25]:
def get_items_rating(person_id, user_rating_df):
    # Get the user's data and merge in the movie information.
    rating_items = user_rating_df.loc[person_id]['uniq_id']
    return set(rating_items if type(rating_items) == pd.Series else [rating_items])

In [105]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_rating(person_id, rating_full_indexed_df)
        all_items = set(CSV_file2_df['uniq_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = rating_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['uniq_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['uniq_id'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['uniq_id'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_rating(person_id, 
                                                                                    rating_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=100000%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['uniq_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['uniq_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print("Running evaluation for users")
        people_metrics = []
        for idx, person_id in enumerate(list(rating_test_indexed_df.index.unique().values)):

            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [33]:
CSV_file2_df

Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id,Job_Description_Without_Stopwords,Job_Skills_Without_Stopwords
0,https://www.dice.com/jobs/detail/automation-te...,"digital intelligence systems, llc","c2h corp-to-corp, c2h independent, c2h w2, 3 m...",looking for selenium engineers...must have sol...,10110693,"atlanta, ga",automation test engineer,1 hour ago,telecommuting not available|travel not required,,see below,418ff92580b270ef4e7c14f0ddfc36b4,looking selenium engineers must solid java c...,see
1,https://www.dice.com/jobs/detail/information-s...,university of chicago/it services,full time,the university of chicago has a rapidly growin...,10114469,"chicago, il",information security engineer,1 week ago,telecommuting not available|travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9,university chicago rapidly growing security pr...,linux unix network monitoring incident respo...
2,https://www.dice.com/jobs/detail/business-solu...,"galaxy systems, inc.",full time,"galaxe.solutionsevery day, our solutions affec...",xgalxys,"schaumburg, il",business solutions architect,2 weeks ago,telecommuting not available|travel not required,,"enterprise solutions architecture, business in...",46baa1f69ac07779274bcd90b85d9a72,galaxe solutionsevery day solutions affect pe...,enterprise solutions architecture business in...
3,https://www.dice.com/jobs/detail/java-develope...,transtech llc,full time,java developerfull-time/direct-hirebolingbrook...,10113627,"bolingbrook, il","java developer (mid level)- ft- great culture,...",2 weeks ago,telecommuting not available|travel not required,,please see job description,3941b2f206ae0f900c4fba4ac0b18719,java developerfull time direct hirebolingbrook...,please see job description
4,https://www.dice.com/jobs/detail/devops-engine...,matrix resources,full time,midtown based high tech firm has an immediate ...,matrixga,"atlanta, ga",devops engineer,48 minutes ago,telecommuting not available|travel not required,,"configuration management, developer, linux, ma...",45efa1f6bc65acc32bbbb953a1ed13b7,midtown based high tech firm immediate need in...,configuration management developer linux ma...
5,https://www.dice.com/jobs/detail/sap-fico-arch...,yash technologies,"full time, permanant",we are looking for a senior sap fico architect...,10111847,"chicago, il",sap fico architect,2 weeks ago,telecommuting not available|travel required to...,,"fico, ar, ap, asset management, haha",e0ac9d926dda5e95162ef05adea7318c,looking senior sap fico architect join us full...,fico ar ap asset management haha
6,https://www.dice.com/jobs/detail/network-engin...,noble1,"full time, direct hire",network engineer job description a network eng...,90884761,"atlanta, ga",network engineer,1 hour ago,telecommuting not available|travel not required,,"cisco, dns, http, networking, network engineer...",e7e326053c586bd94e59f1fd74de4a1b,network engineer job description network engin...,cisco dns http networking network engineer...
7,https://www.dice.com/jobs/detail/sr.-web-appli...,"bluebeam software, inc.","full time, full time",bluebeam is looking for talented sr. web devel...,10110132,"chicago, il",sr. web application developer (cloud team) - c...,2 weeks ago,telecommuting not available|travel not required,,".net , c#, mvc, restful web services, http, aw...",b0dadecf4c3c2beecb9c773ca11ecda4,bluebeam looking talented sr web developers p...,net c# mvc restful web services http aw...
8,https://www.dice.com/jobs/detail/front-end-dev...,genesis10,"full time, direct placement",this is a fulltime position for a javascript d...,gentx001,"new york, ny",front end developer,7 hours ago,telecommuting not available|travel not required,,"c++, developer, development, javascript, user ...",28f5e0c1cc3314813e674f0c32b04d1b,fulltime position javascript developer financi...,c developer development javascript user ...
9,https://www.dice.com/jobs/detail/application-s...,"vanderhouwen & associates, inc.","c2h w2, contract to hire",summaryour client is the leading provider of o...,vhassoc,"seattle, wa",application support engineer,7 hours ago,telecommuting not available|travel not required,,(see job description),95c9127e2770172f454f3b83981eaa88,summaryour client leading provider online vide...,see job description


In [34]:
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(2, 3),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000
                     )

item_ids = CSV_file2_df['uniq_id'].tolist()
tfidf_matrix = vectorizer.fit_transform(CSV_file2_df['Job_Skills_Without_Stopwords']+ "" + CSV_file2_df['Job_Description_Without_Stopwords'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<22000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1273507 stored elements in Compressed Sparse Row format>

In [90]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(user_id, rating_indexed_df):
    rating_person_df = rating_indexed_df.loc[user_id]
    user_item_profiles = get_item_profiles(rating_person_df['uniq_id'])
    
    user_item_strengths = np.array(rating_person_df['eventStrength']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    rating_indexed_df = rating_full_df[rating_full_df['uniq_id'] \
                                                   .isin(CSV_file2_df['uniq_id'])].set_index('user_id')
    user_profiles = {}
    for user_id in rating_indexed_df.index.unique():
        user_profiles[user_id] = build_users_profile(user_id, rating_indexed_df)
    return user_profiles

In [91]:
user_profiles = build_users_profiles()
len(user_profiles)

1209

In [92]:
myprofile = user_profiles[100118]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[100118].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 5000)


Unnamed: 0,token,relevance
0,experience using,0.16834
1,business objects,0.121086
2,years experience,0.120097
3,account manager,0.110196
4,development experience,0.109406
5,ideal candidate,0.107328
6,years demonstrated,0.104136
7,programming experience,0.104047
8,rest soap,0.09923
9,big data,0.093637


In [60]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=10):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['uniq_id', 'eventStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'uniq_id', 
                                                          right_on = 'uniq_id')['eventStrength', 'uniq_id']


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(CSV_file2)

NameError: name 'CSV_file2' is not defined

In [106]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['uniq_id', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('items_df is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'uniq_id', 
                                                          right_on = 'uniq_id')[['eventStrength', 'uniq_id']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(CSV_file2_df)

In [107]:

cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)

cb_detailed_results_df.head(10)

1208 users processed


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
56,100003,8,7,10,0.8,0.7
1037,100138,7,7,10,0.7,0.7
351,101190,8,8,10,0.8,0.8
14,100772,9,6,10,0.9,0.6
107,100559,7,7,10,0.7,0.7
329,100642,7,7,9,0.777778,0.777778
933,100824,7,7,9,0.777778,0.777778
124,100672,8,8,9,0.888889,0.888889
310,100411,5,5,9,0.555556,0.555556
311,100190,8,8,9,0.888889,0.888889


In [None]:
non_interacted_items_sample