In [1]:
# frequently used libraries
import numpy as np
import pandas as pd

In [2]:
# Load the interest data entered by users (one row per user/interest pair).
# The first CSV column is used as the row index.
user_interest_df = pd.read_csv(
    'user_interest_df.csv',
    index_col=0,
)
user_interest_df.head()

Unnamed: 0,user_id,interest_id,rating,experience
0,0,4,1,1
1,0,2,4,2
2,1,1,1,2
3,1,4,2,1
4,1,3,2,3


In [3]:
# Load the interest data entered for projects (one row per project/interest pair).
# The first CSV column is used as the row index.
project_interest_df = pd.read_csv(
    'project_interest_df.csv',
    index_col=0,
)
project_interest_df.head()

Unnamed: 0,project_id,interest_id,rating,experience
0,0,2,3,2
1,0,1,3,1
2,1,2,1,2
3,1,4,1,2
4,1,3,2,1


In [4]:
# set of interests for each user
user_interest_series = (
    user_interest_df
    .groupby('user_id')['interest_id']
    .agg(set)
)
user_interest_series

user_id
0           {2, 4}
1     {1, 2, 3, 4}
2     {1, 2, 3, 4}
3           {2, 4}
4           {1, 2}
5     {1, 2, 3, 4}
6              {2}
7        {1, 2, 4}
8     {1, 2, 3, 4}
9        {1, 3, 4}
10    {1, 2, 3, 4}
11    {1, 2, 3, 4}
12    {1, 2, 3, 4}
13          {1, 2}
14             {2}
15             {3}
16             {1}
17             {2}
18    {1, 2, 3, 4}
19       {1, 3, 4}
Name: interest_id, dtype: object

In [5]:
# Reshape to one row per user and one column per interest, holding the rating.
# Interests a user did not rate come out of the pivot as NaN and are set to 0.0
# so every row is a complete numeric vector for the similarity analysis.
user_rating_df = (
    user_interest_df
    .pivot(index='user_id', columns='interest_id', values='rating')
    .fillna(0.0)
)
user_rating_df

interest_id,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,4.0,0.0,1.0
1,1.0,3.0,2.0,2.0
2,3.0,3.0,1.0,4.0
3,0.0,2.0,0.0,2.0
4,1.0,1.0,0.0,0.0
5,3.0,4.0,2.0,3.0
6,0.0,4.0,0.0,0.0
7,1.0,4.0,0.0,2.0
8,3.0,2.0,4.0,3.0
9,1.0,0.0,4.0,4.0


In [6]:
# Reshape to one row per project and one column per interest, holding the rating.
# Interests a project does not list come out of the pivot as NaN and are set to 0.0.
project_rating_df = (
    project_interest_df
    .pivot(index='project_id', columns='interest_id', values='rating')
    .fillna(0.0)
)
project_rating_df

interest_id,1,2,3,4
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.0,3.0,0.0,0.0
1,4.0,1.0,2.0,1.0
2,4.0,0.0,0.0,2.0
3,2.0,2.0,2.0,2.0
4,3.0,0.0,0.0,0.0
5,3.0,2.0,1.0,4.0
6,4.0,0.0,1.0,3.0
7,0.0,0.0,3.0,0.0
8,2.0,0.0,0.0,1.0
9,4.0,1.0,1.0,1.0


In [7]:
# collect, for every project, the set of interest ids it lists
project_interest_series = (
    project_interest_df
    .groupby('project_id')['interest_id']
    .agg(set)
)
project_interest_series

project_id
0          {1, 2}
1    {1, 2, 3, 4}
2          {1, 4}
3    {1, 2, 3, 4}
4             {1}
5    {1, 2, 3, 4}
6       {1, 3, 4}
7             {3}
8          {1, 4}
9    {1, 2, 3, 4}
Name: interest_id, dtype: object

In [8]:
# user to recommend projects for; matched against the user_id column when checking skills
user = 5

In [9]:
# libraries for sparse matrix and cosine similarity
# (csr_matrix is not needed by the vectorised computation below; the import is
# kept in case later cells use it)
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Score every project against the selected user with the cosine similarity of
# their interest-rating vectors, then rank projects from most to least similar.

# Put both rating tables on the same interest columns so that column i means the
# same interest in both; an interest missing from one table is rated 0.0 there.
interest_ids = user_rating_df.columns.union(project_rating_df.columns)

# Rows are selected by label (.loc): `user` and the project ids are ids, not row
# positions, so positional .iloc lookups are only right while ids are 0..n-1.
user_vector = user_rating_df.loc[[user]].reindex(columns=interest_ids, fill_value=0.0)
project_matrix = project_rating_df.reindex(columns=interest_ids, fill_value=0.0)

# One call scores all projects at once; row 0 of the result is the single user.
similarity_to_user = cosine_similarity(user_vector.values, project_matrix.values)[0]

# dataframe of similar projects and similarity value for sorting
similar_projects = pd.DataFrame({
    'project': project_matrix.index.values,
    'similarity': similarity_to_user,
})

# sort projects based on similarity values; mergesort is stable, so projects
# with equal similarity keep their project-index order
recommended_projects = (
    similar_projects
    .sort_values(by='similarity', ascending=False, kind='mergesort')['project']
    .tolist()
)
recommended_projects

[3, 5, 0, 1, 9, 6, 2, 8, 4, 7]

In [10]:
# subset of similar projects compatible with user skill level
# A project is dropped if, for any interest it lists, the experience it requires
# exceeds the user's experience in that interest. Ranking order is preserved.

# Experience of the selected user per interest (highest value if listed twice).
user_experience = (
    user_interest_df.loc[user_interest_df['user_id'] == user]
    .groupby('interest_id')['experience']
    .max()
)

# Build a new list instead of calling .remove() on the list being iterated:
# removing during iteration shifts the remaining items, so the project right
# after each removed one was never checked.
qualified_projects = []
for project in recommended_projects:
    # Required experience per interest for this project, looked up by project id.
    required_experience = (
        project_interest_df.loc[project_interest_df['project_id'] == project]
        .groupby('interest_id')['experience']
        .max()
    )
    # Interests the user never listed count as experience 0, i.e. below any
    # positive requirement.
    # NOTE(review): assumes experience levels are > 0 — confirm.
    available_experience = user_experience.reindex(required_experience.index, fill_value=0)
    if not (required_experience > available_experience).any():
        qualified_projects.append(project)

# Rebinding (rather than mutating in place) keeps this cell safe to re-run.
recommended_projects = qualified_projects
recommended_projects

[5, 1, 9, 2, 8, 4, 7]