In [1]:
%load_ext autoreload
%autoreload 2
# import sys, os
# sys.path.append('../')

from app import db
from app.models import User, Project, ProjMember, JoinRequest, ScrumTask, Tag, Position, proj_categories, \
                            Learning #Project subclasses
from app import create_app

# Create the application object and push an application context for the rest of
# the notebook. Later cells call `Model.query`, which presumably relies on this
# active context (Flask-SQLAlchemy convention -- confirm against `app/__init__.py`).
app = create_app()
app.app_context().push()

# Matching users with projects, and vice versa

Condensing projects and users into machine-interpretable matrices.

## Text data
* Projects
 - Description (string)
 - Tags (array of strings)
    
* Users
 - Interests (array)
 - About me/goals (string)
 - Should also incorporate liked/followed projects and other users
 
Without machine learning, we can use content-based filtering: compare a user's tags with each project's tags and recommend the projects whose tags overlap the most. Two approaches:

- Jaccard Index
- Cosine Similarity (content embeddings) 
    - this is probably what will be used for the description strings as well
    
Users should have a relation table recording whether or not each project should show up in their recommendations
- The flag is False if the user is already a member, or if they click a 'hide recommendation' button

[Great resource for this](https://github.com/JohnsonKuan/movie-rec-tags/blob/master/Movie-Rec-Movielens-Tags.ipynb)

In [42]:
def prepare_data(model, id_, tags_only):
    """Return the free-text field of one record, for use by a text-based model.

    Args:
        model: Model class to look the record up on (via ``model.query.get``).
        id_: Primary key passed to ``model.query.get``.
        tags_only: Currently unused; accepted but ignored by this function.

    Returns:
        ``descr`` of the record when ``model`` is ``Project``, otherwise its
        ``about_me`` attribute.

    Note:
        Nothing here guards against the lookup returning no record.
    """
    record = model.query.get(id_)
    # Projects keep their text in `descr`; any other model is read via `about_me`.
    # TODO: tags are not appended to the returned text yet (an earlier note
    # sketched a "TAGS: <space-joined tags>" suffix).
    return record.descr if model == Project else record.about_me
    
def jaccard_index(*args):
    """Return the Jaccard index of two collections.

    The index is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    distinct elements of the two inputs, so duplicates and ordering are ignored.

    Args:
        *args: Exactly two iterables of hashable items (e.g. two lists of tag
            names).

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the ratio is
        undefined (0/0); ``0.0`` is returned so that "no tags on either side"
        is never treated as a match.

    Raises:
        ValueError: If the number of positional arguments is not two.
    """
    if len(args) != 2:
        raise ValueError(
            f"jaccard_index expects exactly two collections, got {len(args)}"
        )
    first, second = map(set, args)
    union = first | second
    if not union:
        # Both inputs are empty: avoid ZeroDivisionError on 0/0.
        return 0.0
    return len(first & second) / len(union)

def cbf(user, func, projects=None):
    """Perform content-based filtering of projects for a user.

    Each candidate project is scored by comparing the user's tag names with the
    project's tag names, and the candidates are ranked by that score.

    Args:
        user: Object exposing ``tags``, an iterable of objects with a ``name``.
        func: Similarity function called as ``func(user_tag_names,
            project_tag_names)`` with two lists of names and returning a
            sortable score (either jaccard_index or cosine_similarity).
        projects: Optional iterable of candidate projects, each exposing ``id``
            and ``tags``. Defaults to every project from ``Project.query.all()``;
            pass a pre-filtered list to leave out projects that should not be
            recommended.

    Returns:
        List of project ids ordered from most to least similar. Projects with
        equal scores keep their relative input order.
    """
    if projects is None:
        projects = Project.query.all()

    user_tags = [tag.name for tag in user.tags]
    scores = {
        project.id: func(user_tags, [tag.name for tag in project.tags])
        for project in projects
    }
    # Sort the ids by score directly. Inverting the mapping (score -> id) would
    # silently drop every project that ties with another one.
    return sorted(scores, key=scores.get, reverse=True)
    

In [29]:
# Sanity check: the first pair shares 1 of 7 distinct values (1/7 ~= 0.1429);
# the second pair has identical distinct values, so the index is 1.0.
print(jaccard_index([1,2,4,3,1],[45,34,23,1]), jaccard_index([1,1,2,2],[1,1,2,2]))

0.14285714285714285 1.0
