In [12]:
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

# English stopword list from NLTK, fetched once here so later cells can reuse it
# when filtering repo keyword tokens.
cached_stopwords = stopwords.words("english")

In [33]:
# Bag-of-words vectorizer with default settings; it is fitted further down in this
# cell on one space-joined keyword string per repo owner.
count_vec = CountVectorizer()

def xstr(s):
    """Return ``s`` unchanged, or an empty string when ``s`` is ``None``.

    Lets optional fields be concatenated without a ``None`` check at each
    call site. Only ``None`` is replaced; other falsy values pass through.
    """
    return '' if s is None else s

# Build one keyword "document" per repo owner and vectorize it.
#
# Results left in the notebook namespace:
#   data  - list of parsed repo records (one JSON object per input line)
#   users - login -> `owner` object of the first repo seen for that login
#   items - login -> flat list of keyword tokens from all of that owner's repos
#   out   - dense count matrix; row i corresponds to the i-th key of `items`
items = {}
users = {}
input_file = 'db/repos'

# Set copy of the stopword list so the per-token membership test is O(1).
stopword_set = set(cached_stopwords)

# The input is JSON Lines. JSON text is UTF-8, so do not depend on the platform
# default encoding; blank lines are skipped instead of crashing json.loads.
with open(input_file, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
print('loaded {} repos'.format(len(data)))

for item in data:
    login = item['owner']['login']
    # Keep the owner record from the first repo encountered for this login.
    users.setdefault(login, item['owner'])

    name = xstr(item['name'])
    description = xstr(item['description'])
    language = xstr(item['language'])
    repo = 'uniquerepoidentifierx00'  # Use this unique keyword as the repo count

    keywords = name + ' ' + description + ' ' + language + ' ' + repo

    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    X_text = re.sub(r'\W+', ' ', keywords).lower().split(' ')
    # Drop empty fragments left by the split, and English stopwords.
    X_text = [text for text in X_text if text and text not in stopword_set]
    # Accumulate this repo's tokens onto the owner's running token list.
    items.setdefault(login, []).extend(X_text)

# Handle the count vector first: one space-joined document per owner, in the
# insertion order of `items`, so row indices line up with enumerate(items).
# The matrix is densified because the following cell indexes individual rows.
out = count_vec.fit_transform(
    [' '.join(tokens) for tokens in items.values()]
).toarray()

loaded 42817 repos


In [32]:
# Rank every other repo owner by cosine similarity to a single target user,
# using the per-user count vectors in `out` (row order == iteration order of
# `items`), then print the closest non-zero matches.
TARGET_USER = 'alextanhongpin'  # login whose nearest neighbours are reported
TOP_N = 8                       # how many of the most similar users to keep

scores = {}  # login -> list of (other_login, similarity) tuples
for i, user1 in enumerate(items):
    if user1 != TARGET_USER:
        continue
    target_vec = [out[i]]  # built once; reused for every comparison below
    for j, user2 in enumerate(items):
        # Skip the self-comparison by row index rather than by string
        # identity (`is not`), which only works by accident of object reuse.
        if j == i:
            continue
        # Alternative metric previously tried here:
        #   1 / (1 + euclidean_distances(target_vec, [out[j]])[0][0])
        score = cosine_similarity(target_vec, [out[j]]).flatten()[0]
        scores.setdefault(user1, []).append((user2, score))

for user, user_scores in scores.items():
    top_n = sorted(user_scores, key=lambda tup: tup[1], reverse=True)[:TOP_N]
    # Zero similarity means no shared vocabulary; such matches are not shown.
    non_zero = [(other, score) for (other, score) in top_n if score != 0]
    if non_zero:
        print(user, non_zero)

alextanhongpin [('erikdubbelboer', 0.88967775278982941), ('roylee0704', 0.88443938989526938), ('cmeon', 0.86537505937492742), ('satnami', 0.85712107806647686), ('scr1p7ed', 0.85589057248007638), ('piyushchauhan2011', 0.85433961766282274), ('dhilip89', 0.8533431314867167), ('cchongXD', 0.8513853588530097)]


In [34]:
# Show the stored owner record (captured while loading the repos) for this login.
print(users['alextanhongpin'])

{'login': 'alextanhongpin', 'id': 6033638, 'avatar_url': 'https://avatars3.githubusercontent.com/u/6033638?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/alextanhongpin', 'html_url': 'https://github.com/alextanhongpin', 'followers_url': 'https://api.github.com/users/alextanhongpin/followers', 'following_url': 'https://api.github.com/users/alextanhongpin/following{/other_user}', 'gists_url': 'https://api.github.com/users/alextanhongpin/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/alextanhongpin/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/alextanhongpin/subscriptions', 'organizations_url': 'https://api.github.com/users/alextanhongpin/orgs', 'repos_url': 'https://api.github.com/users/alextanhongpin/repos', 'events_url': 'https://api.github.com/users/alextanhongpin/events{/privacy}', 'received_events_url': 'https://api.github.com/users/alextanhongpin/received_events', 'type': 'User', 'site_admin': False}
