Collaborative filters are based mainly in 2 different datasets: content and rating. in this case, the content matrix is filled with the job candidates data and the rating the one done by the different companies.

In [1]:
import pandas as pd
import numpy as np
from time import sleep

COMPANIES = range(100)
CANDIDATES = 5000
MAX_NUM_EMPLOYEES = 500
MAX_SCORE = 10



In [15]:
ratings = []

for company in COMPANIES:
    size = np.random.randint(MAX_NUM_EMPLOYEES)
    candidates_to_rank = np.random.randint(CANDIDATES, size=size)
    
    for candidate in candidates_to_rank:
        score = np.random.randint(MAX_SCORE+1)
        ratings.append([company, candidate, score])
        
ratings = pd.DataFrame(ratings, columns=['companyId', 'candidateId', 'score'])

In [16]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25672 entries, 0 to 25671
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   companyId    25672 non-null  int64
 1   candidateId  25672 non-null  int64
 2   score        25672 non-null  int64
dtypes: int64(3)
memory usage: 601.8 KB


In [17]:
ratings.head()

Unnamed: 0,companyId,candidateId,score
0,0,2182,10
1,0,1578,6
2,0,3676,9
3,0,4855,6
4,0,2868,7


In [19]:
input_candidates = []

size = np.random.randint(MAX_NUM_EMPLOYEES)
candidates_to_rank = np.random.randint(CANDIDATES, size=size)
    
for candidate in candidates_to_rank:
    score = np.random.randint(MAX_SCORE+1)
    input_candidates.append([candidate, score])
    
input_candidates = pd.DataFrame(input_candidates, columns=['candidateId', 'score'])
                                
input_candidates.head()

Unnamed: 0,candidateId,score
0,4824,2
1,422,6
2,2541,5
3,2919,9
4,716,6


In [22]:
companies_subset = ratings[ratings['candidateId'].isin(input_candidates['candidateId'].tolist())]
companies_subset.head()

Unnamed: 0,companyId,candidateId,score
61,0,1406,2
104,0,2919,7
151,0,1524,8
289,1,2752,7
291,1,1387,5


In [23]:
companies_subset_group = companies_subset.groupby(['companyId'])
companies_subset_group.head()

Unnamed: 0,companyId,candidateId,score
61,0,1406,2
104,0,2919,7
151,0,1524,8
289,1,2752,7
291,1,1387,5
...,...,...,...
25061,98,3727,6
25121,98,4985,7
25383,99,3460,8
25437,99,2533,0


In [24]:
companies_subset_group.get_group(77)

Unnamed: 0,companyId,candidateId,score
19495,77,1977,4
19649,77,1292,8
19687,77,1292,9
19704,77,4721,6
19744,77,946,9
19832,77,2033,9
19833,77,3089,6
19852,77,392,5
19876,77,1406,9


In [26]:
#Sort by companies which shares the most candidates in common
companies_subset_group = sorted(companies_subset_group,  key=lambda x: len(x[1]), reverse=True)
companies_subset_group[0:3]

[(27,
        companyId  candidateId  score
  6656         27         2387      0
  6694         27         1682      2
  6699         27         2402      0
  6702         27         2533      2
  6817         27         4447      2
  6819         27          876      6
  6846         27           36      6
  6861         27         2875      7
  6955         27         1292      6
  6977         27         4677      7
  6985         27          504      2
  6994         27         2272      4
  7031         27         1071      5),
 (43,
         companyId  candidateId  score
  11277         43          788      8
  11318         43         3460      9
  11328         43         2731      4
  11377         43         3089      5
  11421         43         4447      7
  11435         43         4447      1
  11447         43          504      1
  11449         43         2415      0
  11530         43         1071      3
  11562         43         1524     10
  11583         43       

In [30]:
#TODO: fix hardcoding
if len(companies_subset_group) > 100:
    companies_subset_group = companies_subset_group[0:100]

89

In [34]:
companies_subset_group[0][1]

Unnamed: 0,companyId,candidateId,score
6656,27,2387,0
6694,27,1682,2
6699,27,2402,0
6702,27,2533,2
6817,27,4447,2
6819,27,876,6
6846,27,36,6
6861,27,2875,7
6955,27,1292,6
6977,27,4677,7


In [52]:
def make_pearson_correlation(companies_subset_group, input_candidates):
    pearsonCorrelationDict = {}

    #For every copmany group in our subset
    for company, group in companies_subset_group:
        #Let's start by sorting the input and current copmany group so the values aren't mixed up later on
        group = group.sort_values(by='candidateId')
        input_candidates = input_candidates.sort_values(by='candidateId')

        #Get the N for the formula
        nRatings = len(group)
        #Get the review scores for the movies that they both have in common
        temp_df = input_candidates[input_candidates['candidateId'].isin(group['candidateId'].tolist())]
        #And then store them in a temporary buffer variable in a list format to facilitate future calculations
        tempRatingList = temp_df['score'].tolist()
        #Let's also put the current user group reviews in a list format
        tempGroupList = group['score'].tolist()
        #Now let's calculate the pearson correlation between two users, so called, x and y
        Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
        Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
        Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

        #If the denominator is different than zero, then divide, else, 0 correlation.
        if Sxx != 0 and Syy != 0:
            pearsonCorrelationDict[company] = Sxy/sqrt(Sxx*Syy)
        else:
            pearsonCorrelationDict[company] = 0
    
    return pearsonCorrelationDict

In [53]:
pearsonCorrelationDict = make_pearson_correlation(companies_subset_group, input_candidates)
pearsonCorrelationDict.items()

dict_items([(27, -0.5734424570726478), (43, 0.37076553088999664), (19, -0.23312794006336665), (22, -0.3396801531215123), (36, 0.293987366103667), (48, 0.17673105783742887), (57, -0.3184477711353141), (72, -0.7520956410372819), (77, 0.5200931989445984), (97, -0.22831482556870486), (11, -0.2396406105226645), (17, -0.114798533159634), (24, -0.8043644593020609), (31, 0.1562957964935827), (25, -0.39142658862397905), (34, 0.3773758785284235), (83, 0.5291612750491624), (94, -0.2802789026481287), (9, -0.25252539998521123), (35, -0.5052911526399114), (41, 0.3594955927075126), (58, -0.38077233682843376), (62, 0.07906450543478173), (66, -0.7145854896251936), (71, -0.874173383533045), (2, -0.7996002997502185), (6, 0.4486917274531548), (29, 0.035986126423960874), (32, -0.7518621795170318), (33, -0.3254416407508151), (38, -0.5282385868580226), (45, -0.20049160254417145), (47, 0.16222142113076254), (49, -0.011496531444875993), (80, 0.24112141108520602), (89, -0.536461273228943), (95, -0.4146896872849

In [41]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['companyId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.sort_values('similarityIndex', ascending=False).head(20)

Unnamed: 0,similarityIndex,companyId
74,1.0,96
65,1.0,30
68,1.0,60
71,1.0,78
73,1.0,84
42,0.938343,37
44,0.903767,50
53,0.751729,13
37,0.622821,98
50,0.592078,90


In [42]:
top_companies = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
top_companies.head()

Unnamed: 0,similarityIndex,companyId
74,1.0,96
65,1.0,30
68,1.0,60
71,1.0,78
73,1.0,84


In [43]:
top_companies_rating = top_companies.merge(ratings, left_on='companyId', right_on='companyId', how='inner')
top_companies_rating.head()

Unnamed: 0,similarityIndex,companyId,candidateId,score
0,1.0,96,2350,8
1,1.0,96,1861,0
2,1.0,96,4870,4
3,1.0,96,2062,8
4,1.0,96,60,3


In [44]:
top_companies_rating['weightedScore'] = top_companies_rating['similarityIndex']*top_companies_rating['score']
top_companies_rating.head()

Unnamed: 0,similarityIndex,companyId,candidateId,score,weightedScore
0,1.0,96,2350,8,8.0
1,1.0,96,1861,0,0.0
2,1.0,96,4870,4,4.0
3,1.0,96,2062,8,8.0
4,1.0,96,60,3,3.0


In [46]:
temp_top_companies_rating = top_companies_rating.groupby('candidateId').sum()[['similarityIndex','weightedScore']]
temp_top_companies_rating.columns = ['sum_similarityIndex','sum_weightedScore']
temp_top_companies_rating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedScore
candidateId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.785043,6.228211
1,1.0,3.0
2,1.047119,6.654693
3,-0.011497,-0.103469
4,2.214899,9.736625


In [47]:
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = temp_top_companies_rating['sum_weightedScore']/temp_top_companies_rating['sum_similarityIndex']
recommendation_df['candidateId'] = temp_top_companies_rating.index
recommendation_df.head()

Unnamed: 0,weighted average recommendation score,candidateId
0,7.933597,0
1,3.0,1
2,6.35524,2
3,9.0,3
4,4.395967,4


In [48]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0,weighted average recommendation score,candidateId
1310,271.206563,1310
4542,206.654922,4542
4707,166.374903,4707
3105,148.888803,3105
1249,131.402703,1249
4901,102.06362,4901
2135,81.719684,2135
1579,67.357568,1579
3073,63.229377,3073
3369,55.03181,3369


In [57]:
# Check if there are recommended candidates in the input dataframe
input_candidates.loc[input_candidates.candidateId == 4707]

Unnamed: 0,candidateId,score
