In [95]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [96]:
import json
import re

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed

In [106]:
# Initialize variables
count_vec = CountVectorizer()
# Keep the English stopwords in a set: the visible code only uses them for
# `in` membership tests, which a set answers without scanning every entry.
cached_stopwords = set(stopwords.words("english"))

input_file = 'db/repos' # Path of the JSON-lines file holding the repo data

In [98]:
# Utility functions
def xstr(s):
    """Return ``s`` unchanged, or an empty string when ``s`` is None."""
    return '' if s is None else s

def load_data(input_file):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        input_file: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no record and would make json.loads fail, so skip them.
        return [json.loads(line) for line in f if line.strip()]

In [107]:

# Load data
data = load_data(input_file)
data_len = len(data)
print('loaded {} repos'.format(data_len))

loaded 42817 repos


In [108]:

items = {} # Dictionary mapping for all the items (repos)
users = {} # Dictionary mapping for the user

def populate_data(users, items, item):
    """Record one repo's owner and keywords in the shared lookup dicts.

    Args:
        users: Dict mapping an owner login to that owner's record; the first
            record seen for a login is kept.
        items: Dict mapping an owner login to the flat list of keywords
            collected from all of that owner's repos. Extended in place.
        item: A repo record providing ``owner`` (with a ``login``), ``name``,
            ``description`` and ``language``; the last three may be None.

    Raises:
        KeyError: If ``item`` lacks one of the fields listed above.
    """
    login = item['owner']['login']

    # setdefault does the lookup and the insert in a single dict call. The
    # previous "get, then assign" sequence let two workers handling the same
    # owner both see a missing entry, so one could replace the list the other
    # had already appended to.
    users.setdefault(login, item['owner'])
    owner_keywords = items.setdefault(login, [])

    # Get the relevant fields
    name = xstr(item['name'])
    description = xstr(item['description'])
    language = xstr(item['language'])
    # Marker token added once per repo, so its count in the owner's document
    # equals the number of repos processed for that owner.
    repo = 'uniquerepoidentifierx00'

    # Get the whole text for matching
    text = ' '.join([name, description, language, repo])

    # Replace every run of non-word characters with a single space
    # (raw string: '\W' is not a valid escape in a plain string literal).
    text = re.sub(r'\W+', ' ', text)

    # Lowercase and split on whitespace; split() without an argument already
    # drops the empty strings produced by leading/trailing separators.
    keywords = [key for key in text.lower().split()
                if key not in cached_stopwords]

    # Append this repo's keywords to the owner's running list
    owner_keywords.extend(keywords)

# Fill `users` and `items` from every loaded repo using a thread pool.
with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(populate_data, users, items, item)
        for item in data
    ]
    # Asking each finished future for its result re-raises any exception
    # that occurred inside the worker.
    for future in as_completed(futures):
        future.result()

# Build one space-joined document per login (in dict order) and turn the
# documents into an array of term counts.
out = count_vec.fit_transform(
    [' '.join(tokens) for tokens in items.values()]
).toarray()

print('Generated output vector')

Generated output vector


In [116]:

# Logins in dict order, i.e. the same order as the rows of `out`.
logins = list(items)

def compute_scores(user1, items, out):
    """Score every other user by cosine similarity to ``user1``.

    Args:
        user1: Login of the reference user; must be a key of ``items``.
        items: Dict keyed by login. Its iteration order is used to find each
            user's row in ``out``.
        out: Indexable collection of per-user vectors, where row ``i`` belongs
            to the ``i``-th key of ``items``.

    Returns:
        A one-entry dict ``{user1: [(other_login, score), ...]}`` listing all
        users except ``user1`` in ``items`` order. The list is empty when
        ``user1`` is the only user.

    Raises:
        ValueError: If ``user1`` is not a key of ``items``.
    """
    # Derive the row order from `items` itself rather than a module-level
    # list, so the function only depends on its arguments.
    ordered_logins = list(items)
    user1_index = ordered_logins.index(user1)
    reference = [out[user1_index]]

    pairs = []
    for user2_index, user2 in enumerate(ordered_logins):
        # Calculate score against other users only. Skipping here, before the
        # append, keeps user1 out of their own list; previously the append ran
        # for user1 too, reusing the previous iteration's score (or failing
        # with an unbound `score` when user1 was the first key).
        if user2 == user1:
            continue
        score = cosine_similarity(reference, [out[user2_index]]).flatten()[0]
        pairs.append((user2, score))
    return {user1: pairs}

def rank_scores(scores, top_n=8):
    """Keep each user's best-scoring matches, highest score first.

    Args:
        scores: Dict mapping a user to a list of ``(other_user, score)``
            tuples.
        top_n: Maximum number of matches to keep per user. Defaults to 8.

    Returns:
        A list of ``(user, matches)`` tuples. ``matches`` holds the ``top_n``
        highest-scoring tuples with zero scores removed. Users left without
        any match are omitted.
    """
    ranks = []
    for user, candidates in scores.items():
        # sorted() is stable, so candidates with equal scores keep their
        # original relative order.
        best = sorted(candidates, key=lambda tup: tup[1], reverse=True)[:top_n]
        # Zero scores are dropped after truncation, so a user may end up with
        # fewer than top_n matches.
        non_zero = [(other, score) for (other, score) in best if score != 0]
        if non_zero:
            ranks.append((user, non_zero))
    return ranks


# Rank the users most similar to one example account.
target_login = 'piyushchauhan2011'
scores = compute_scores(target_login, items, out)
ranks = rank_scores(scores)
print('ranks:', ranks)
    

ranks: [('piyushchauhan2011', [('satnami', 0.90832321636825097), ('geoyws', 0.89828810631359146), ('kanasite', 0.89718458519748157), ('cchongXD', 0.89596526897386686), ('lbthomsen', 0.89540610021274669), ('dhilip89', 0.89514675690099588), ('ethanliew', 0.89277611401200141), ('OskarAhl', 0.89262847200753215)])]
