In [95]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [1]:
import json
import re
from math import *

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Initialize variables
count_vec = CountVectorizer()
cached_stopwords = stopwords.words("english")

input_file = 'db/repos' # The file to load the user's data

In [3]:
# Utils function
def xstr(s):
    if s is None:
        return ''
    return s

def load_data(input_file):
    with open(input_file, 'r') as f:
        return [json.loads(line) for line in f]

def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

def process_keyword(keywords):
    # Remove special characters
    keywords = re.sub('\W+', ' ', keywords)
    
    # Split string at uppercase
    uppercase = re.findall('[A-Z][^A-Z]*', keywords)
    if len(uppercase) > 0:
        keywords = ' '.join(uppercase)
        # If everything is lowercase, it will return empty array

    # Convert to lowercase and split by white spaces
    keywords = keywords.lower().split(' ') 

    # Remove empty string
    keywords = [key for key in keywords if key != '']
    
    # Remove stopwords
    keywords = [key for key in keywords if key not in cached_stopwords]
    return keywords

In [4]:
# Load data
data = load_data(input_file)
data_len = len(data)
print('loaded {} repos'.format(data_len))

loaded 42817 repos


In [5]:

items = {} # Dictionary mapping for all the items (repos)
users = {} # Dictionary mapping for the user

def populate_data(users, items, item):
    login = item['owner']['login']
    
    # Cache the user's data
    if users.get(login) is None:
        users[login] = item['owner']

    # Cache the repo's data
    if items.get(login) is None:
        items[login] = []
    if item['fork']:
        return
    # Get the relevant fields
    name = xstr(item['name'])
    description = xstr(item['description'])
    language = xstr(item['language'])
    repo = 'uniquerepoidentifierx00' # Use this unique keyword as the repo count

    # Get the whole text for matching
    keywords = ' '.join([name, description, language, repo])
    
    keywords = process_keyword(keywords)
    
    # Second tuple
    [items[login].append(key) for key in keywords]

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(populate_data, users, items, item) for item in data]
    for future in as_completed(futures):
        future.result()

# Handle the count vector first
out = count_vec.fit_transform([' '.join(items[item]) 
                               for key, item in enumerate(items)]).toarray()

jaccard_scores = [items[item] for key, item in enumerate(items)]
print('Generated output vector')

Generated output vector


In [6]:

logins = [item for key, item in enumerate(items)]

def compute_scores(user1, items, out):
    scores = {}
    user1_index = logins.index(user1)
    for user2_index, user2 in enumerate(items):
        # Calculate score against other user only
        if user1 != user2:
            # Compute similarity scores
            score1 = cosine_similarity([out[user1_index]], [out[user2_index]]).flatten()[0]
            score2 = jaccard_similarity(jaccard_scores[user1_index], jaccard_scores[user2_index])
            score = score1 + score2
        
        if scores.get(user1) is None:  
            scores[user1] = []
        scores[user1].append((user2, score))
    return scores

def rank_scores(scores):
    ranks = []
    for i, user in enumerate(scores):
        top_n = sorted(scores[user], key=lambda tup: tup[1], reverse=True)[:8]
        non_zero = [(user, score) for (user, score) in top_n if score != 0]
        if (len(non_zero) > 0):
            ranks.append((user, non_zero))
    return ranks


scores = compute_scores('alextanhongpin', items, out)
ranks = rank_scores(scores)
print('ranks:', ranks)
    

ranks: [('alextanhongpin', [('TimurKiyivinski', 1.0252474478872371), ('scr1p7ed', 1.00483163785239), ('erikdubbelboer', 0.99467451581015698), ('FeliciousX', 0.99006737654851718), ('geoyws', 0.98220280375288205), ('Mwarukason', 0.98121602678842335), ('winfredselwyn', 0.97800580466345788), ('rahman541', 0.97473793194589864)])]


In [7]:
# Predict similarity between keywords and the items in the dataset

def search_keyword(keyword, out):
    X = count_vec.transform(process_keyword(keyword))

    scores = [(i, cosine_similarity(X, [value]).flatten()[0])
              for i, value in enumerate(out)]

    top_10_similar = sorted(scores, 
                            key=lambda tup: tup[1], 
                            reverse=True)[:10]
    non_zero_scores = [(logins[index], score) 
                       for (index, score) in top_10_similar 
                       if score != 0]
    return non_zero_scores

keywords = {}
def cached_search_keyword(keyword, keywords, out):
    if keywords.get(keyword) is None:
        keywords[keyword] = search_keyword(keyword, out)
    return keywords[keyword]


In [None]:
# new_set = set([])

# for keyword in jaccard_scores:
#     for key in keyword:
#         new_set.add(key) 

# new_list = list(new_set)

# Train the cache lol
# for key in new_list:
#     print('caching', key)
#     cached_search_keyword(key, keywords, out)

# Cache parallel
# with ThreadPoolExecutor() as executor:
#     futures = [executor.submit(cached_search_keyword, key, keywords, out) for key in new_list]
#     for future in as_completed(futures):
#         print(future.result())

[('rickysoo', 0.57735026918962584), ('jad5494', 0.5), ('GuubsFlow', 0.5), ('maelpengerang', 0.30151134457776363), ('hydertech', 0.23570226039551587), ('ruhaizat', 0.15430334996209191), ('kamudrikah', 0.11396057645963795), ('baimhanifkamil', 0.059028133610095526), ('jk-gan', 0.051969970033474297), ('wilz5363', 0.051164451009665081)]
[('purnima23', 0.028501713717057404)]
[('izambasiron', 0.1690308509457033)]
[('jktan0504', 0.22360679774997896)]
[('shahril96', 0.035944257734479471), ('SalocinDotTEN', 0.035759926992607577)]
[('JoeSee', 0.36514837167011072), ('cafreyma', 0.34299717028501764), ('almez', 0.27617238536949701), ('Zulox', 0.16064386578049977), ('jwchong93', 0.087038827977848926), ('eileenwong9305', 0.080845208345444328), ('nazebzurati', 0.068358592702466331), ('cshong0618', 0.066964953018242512), ('YiiKuoChong', 0.062622429108514954), ('roninprogrammer', 0.059976014390406722)]
[('cincauhangus', 0.2581988897471611), ('HazeWatchApp', 0.19802950859533489), ('sinclair83', 0.18257418

In [None]:
cached_search_keyword('trace', keywords, out)