In [16]:
import random
from bson.objectid import ObjectId
import json

#set the random seed for reproducibility
random.seed(42)

def generate_random_object_id():
    """Generate a random ObjectId."""
    return str(ObjectId())

In [14]:
from datetime import datetime
import bcrypt

def generate_random_version():
    """Generate a random version string."""
    major = random.randint(0, 9)
    minor = random.randint(0, 9)
    patch = random.randint(0, 9)
    return f"{major}.{minor}.{patch}"

def generate_random_description (file_content):
    # search for README.md in file_content
    if "README.md" in file_content:
        return file_content["README.md"][:100]  # Return first 100 characters of README.md content

    if "readme.md" in file_content:
        return file_content["readme.md"][:100]

    for key in file_content:
        if key.lower().endswith('readme.md'):
            return file_content[key][:100]
    
    for key in file_content:
        if key.lower().endswith('.md'):
            return file_content[key][:100]

    return "This is a randomly generated description."

def convert_date(date_str):
    # convert from git date() Tue Jun 27 13:10:39 2023 +0800) to mongodb date format
    
    # Convert to MongoDB Extended JSON date format
    dt = datetime.strptime(date_str, "%a %b %d %H:%M:%S %Y %z")
    return dt.isoformat()

def convert_date_issue(date_str):
    # convert from date (2025-05-26T11:15:46Z) to mongodb date format
    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    return dt.isoformat()

def get_stats(stats):

    file_changed = len(stats.split("\n")) - 1
    stats = stats.split("\n")[-1].strip()
    try:
        insertions = int(stats.split(' insertion')[0].split(' ')[-1])
    except Exception as e:
        insertions = 0
    try:
        deletions = int(stats.split(' deletion')[0].split(' ')[-1])
    except Exception as e:
        deletions = 0
    return file_changed, insertions, deletions

def get_size_kb(obj):
    return len(json.dumps(obj)) / 1024

def return_bycript_password(password):
    """Return a bcrypt hashed password."""
    salt = bcrypt.gensalt()
    hashed = bcrypt.hashpw(password.encode('utf-8'), salt)
    return hashed.decode('utf-8')

In [50]:
get_stats(".../src/wrapper-validation/wrapper-checksums.json  | 28 ++++++++++++++++++++++\n 1 file changed, 28 insertions(+)")

(1, 28, 0)

In [17]:
import json
import os
import time
import sys


'''
private String username; (_id)
private String password;
private String email;
private String name;
private String surname;
private String nationality;
private Integer followerNumber;
private Integer followingNumber;
private List<String> comments;
private List<String> projects;
private Boolean isAdmin;
private Instant registrationDate;
'''
users = set()

'''
private String id;
private String name;
private String description;
private String owner;
private String version;
private Instant creationDate;
private List<String> administrators;
private Set<String> fileIds;
'''
project = []

'''
private String id;

// redundant fields to support index, not join operations
private String owner;
private String projectName;

private String path;
private String type;
private Integer size;
private Integer lines;
private Instant lastModified;
private String lastModifiedBy;
private String content;
'''
file = []

'''
private String id;

    @Indexed(name = "idx_commits_author")
    private String author;

    private Integer linesAdded;
    private Integer linesDeleted;
    private Integer filesModified;
    private Instant timestamp;
'''
commit = []

'''
    @JsonProperty("_id")
    private String id;

    private String username;
    private Integer stars;
    private String text;
    private Instant timestamp;
'''
comment = []

# print("Loading repository details from all_repo_details.json...")
# with open('all_repo_details.json', 'r') as f:
#    all_repo_details = json.load(f)

# print("Repository details loaded successfully.")

json_files = [f for f in os.listdir('downloaded_repos') if f.endswith('.json')]

all_repo_details = []
for file_name in json_files:
    all_repo_details.append(file_name[:-5].replace('__', '/'))

j = 0
user_ids_linking = {}
for repo_full_name in all_repo_details:

    print(f"Processing repository {j+1}/{len(all_repo_details)}: {repo_full_name}")
    
    if j % 100 == 0 and j > 0:
        print(f"files: {get_size_kb(file):.1f} KB, commits: {get_size_kb(commit):.1f} KB, comments: {get_size_kb(comment):.1f} KB, projects: {get_size_kb(project):.1f} KB")

    with open(f'downloaded_repos/{json_files[j]}', 'r') as f:
        repo_details = json.load(f)

    j += 1

    if len(repo_details['file_contents']) == 0 or len(repo_details['commits']) == 0 or len(repo_details['contributors']) == 0:
        print(f"Skipping repository {repo_full_name} due to no file contents.")
        continue


    contributors = [c['login'] for c in repo_details['contributors']]
    for c in contributors:
        users.add(c)

    file_ids = []

    repo_id = generate_random_object_id()
    repo_owner = repo_full_name.split('/')[0]
    project_name = repo_full_name.split('/')[-1]

    #analyze commits
    commit_ids = []
    for i in range(len(repo_details['commits'])):
        if len(contributors) == 0:
            repo_details['commits'][i]['author'] = repo_owner
        else:
            repo_details['commits'][i]['author'] = random.choice(contributors)
        repo_details['commits'][i]['id'] = generate_random_object_id()
        file_modified, lines_added, lines_deleted = get_stats(repo_details['commits'][i]['stats'])
        repo_details['commits'][i]['linesAdded'] = lines_added
        repo_details['commits'][i]['linesDeleted'] = lines_deleted
        repo_details['commits'][i]['filesModified'] = file_modified
        repo_details['commits'][i]['timestamp'] = convert_date(repo_details['commits'][i]['date'])
        
        commit.append({
            '_id': repo_details['commits'][i]['id'],
            'author': repo_details['commits'][i]['author'],
            'linesAdded': repo_details['commits'][i]['linesAdded'],
            'linesDeleted': repo_details['commits'][i]['linesDeleted'],
            'filesModified': repo_details['commits'][i]['filesModified'],
            'timestamp': repo_details['commits'][i]['timestamp']
        })

        if repo_details['commits'][i]['author'] not in user_ids_linking:
            user_ids_linking[repo_details['commits'][i]['author']] = {'commitIds': [], 'projectIds': [], 'commentIds': []}
        user_ids_linking[repo_details['commits'][i]['author']]['commitIds'].append(repo_details['commits'][i]['id'])


        commit_ids.append(repo_details['commits'][i]['id'])


    repo_size_attuale = 0
    already_added_files = set()
    for file_name in repo_details['file_contents']:
        # search if it is pom or build.gradle
        if file_name.endswith('pom.xml') or file_name.endswith('build.gradle'):
            # add the file to the project
            file_id = generate_random_object_id()
            file_ids.append(file_id)
            extension = file_name.split('.')[-1]
            content = repo_details['file_contents'][file_name]
            
            
            repo_size_attuale += len(content)

            file.append({
                '_id': file_id,
                'owner': repo_owner,
                'projectName': project_name,
                'path': file_name,
                'type': extension,
                'size': len(content),
                'lines': len(content.split('\n')),
                'lastModified': convert_date(repo_details['commits'][0]['date']),
                'lastModifiedBy': repo_details['commits'][0]['author'], # todo
                'content': content
            })

            already_added_files.add(file_name)

    n_files = 0
    for file_name in repo_details['file_contents']:
        if file_name.startswith('.'):
            continue

        if not '.' in file_name:
            continue

        for subpath in file_name.split('/'):
            if subpath.startswith('.'):
                continue

        if not file_name.split('.')[-1] in ['java', 'xml', 'gradle', 'yml']:
            continue

        if file_name in already_added_files:
            continue

        file_id = generate_random_object_id()
        file_ids.append(file_id)
        extension = file_name.split('.')[-1]
        content = repo_details['file_contents'][file_name]
        
        if len(content) > 10000:
            #print(f"Skipping file {file_name} due to excessive size.")
            continue
    
        repo_size_attuale += len(content)

        file.append({
            '_id': file_id,
            'owner': repo_owner,
            'projectName': project_name,
            'path': file_name,
            'type': extension,
            'size': len(content),
            'lines': len(content.split('\n')),
            'lastModified': convert_date(repo_details['commits'][0]['date']),
            'lastModifiedBy': repo_details['commits'][0]['author'], # todo
            'content': content
        })
        n_files += 1

        if repo_size_attuale > 50000 or n_files > 500:  # 20 KB threshold
            print(f"Skipping repository {repo_full_name} due to excessive size: {repo_size_attuale} bytes, {n_files} files / {len(repo_details['file_contents'])} total files.")
            break


    

    comments = []
    issues_size = 0
    for issue in repo_details['issues'][:100]:
        if not issue['body']:
            continue

        users.add(issue['user']['login'])
        comment = {
            '_id': generate_random_object_id(),
            'username': issue['user']['login'],
            'text': issue['body'][:1000],  # Limit to 1000 characters
            'timestamp': convert_date_issue(issue['created_at']),
            'stars': random.randint(1, 10)  # Randomly generated stars for the comment
        }
        comments.append(comment)
        issues_size += len(issue['body'][:1000])

        if issue['user']['login'] not in user_ids_linking:
            user_ids_linking[issue['user']['login']] = {'commitIds': [], 'projectIds': [], 'commentIds': []}
        user_ids_linking[issue['user']['login']]['commentIds'].append(comment['_id'])


        if issues_size > 20000:
    #        print(f"Skipping repository {repo_full_name} due to excessive issues size: {issues_size} bytes.")
            break

    project.append({
        '_id': repo_id,
        'name': project_name,
        'description': generate_random_description(repo_details['file_contents']),
        'owner': repo_owner,
        'version': generate_random_version(),
        'creationDate': convert_date(repo_details['commits'][-1]['date']),
        'administrators': [u for u in contributors],
        'fileIds': file_ids,
        'comments': comments,
        'commitIds': commit_ids
    })


    if repo_owner not in user_ids_linking:
        user_ids_linking[repo_owner] = {'commitIds': [], 'projectIds': [], 'commentIds': []}
    user_ids_linking[repo_owner]['commentIds'].append(repo_id)


    #time.sleep(0.1)  # To avoid overwhelming the system with too many requests at once


Processing repository 1/1911: jwtk/jjwt
Skipping repository jwtk/jjwt due to excessive size: 56379 bytes, 1 files / 770 total files.
Processing repository 2/1911: spring-attic/hystrix-dashboard
Processing repository 3/1911: IBM/spring-cloud-kubernetes-with-istio
Processing repository 4/1911: tozny/java-aes-crypto
Processing repository 5/1911: pucamafra/RxNotification
Processing repository 6/1911: spartajet/javafx-boot-demo
Skipping repository spartajet/javafx-boot-demo due to no file contents.
Processing repository 7/1911: meyskens/NokiaRemote
Processing repository 8/1911: ViRb3/TrustMeAlready
Processing repository 9/1911: PetarMarijanovic/RxActivityResult
Processing repository 10/1911: soranico/tomcat-learn
Skipping repository soranico/tomcat-learn due to excessive size: 50609 bytes, 6 files / 4019 total files.
Processing repository 11/1911: su18/JDBC-Attack
Processing repository 12/1911: commonsguy/cwac-merge
Processing repository 13/1911: ddalcu/spring-starter
Processing repository 

In [None]:

# save documents in json files
with open('mongo_projects.json', 'w') as f:
    json.dump(project, f, indent=2)
with open('mongo_files.json', 'w') as f:
    json.dump(file, f, indent=2)
with open('mongo_commits.json', 'w') as f:
    json.dump(commit, f, indent=2)


with open('temp_users.json', 'w') as f:
    json.dump(list(users), f, indent=2)
with open('temp_user_ids_lining.json', 'w') as f:
    json.dump(user_ids_linking, f, indent=2)

#with open('mongo_comments.json', 'w') as f:
#    json.dump(comment, f, indent=2)


In [11]:
import requests
import time
from datetime import datetime
import os

from dotenv import load_dotenv

load_dotenv()  # Loads .env file into environment
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# GitHub API configuration
GITHUB_API_URL = "https://api.github.com/users/"

headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

def fetch_user_info(username):
    """Fetch user information from GitHub API."""
    try:
        response = requests.get(f"{GITHUB_API_URL}{username}", headers=headers)
        
        if response.status_code == 200:
            user_data = response.json()
            print(user_data)
            # {'login': 'wec246810',
            #  'id': 20639489, 
            #  'node_id': 'MDQ6VXNlcjIwNjM5NDg5', 
            #  'avatar_url': 'https://avatars.githubusercontent.com/u/20639489?v=4', 
            #  'gravatar_id': '', 
            #  'url': 'https://api.github.com/users/wec246810', 
            #  'html_url': 'https://github.com/wec246810', 
            #  'followers_url': 'https://api.github.com/users/wec246810/followers', 
            #  'following_url': 'https://api.github.com/users/wec246810/following{/other_user}', 
            #  'gists_url': 'https://api.github.com/users/wec246810/gists{/gist_id}', 
            #  'starred_url': 'https://api.github.com/users/wec246810/starred{/owner}{/repo}', 
            #  'subscriptions_url': 'https://api.github.com/users/wec246810/subscriptions', 
            #  'organizations_url': 'https://api.github.com/users/wec246810/orgs', 
            #  'repos_url': 'https://api.github.com/users/wec246810/repos', 
            #  'events_url': 'https://api.github.com/users/wec246810/events{/privacy}', 
            #  'received_events_url': 'https://api.github.com/users/wec246810/received_events', 
            #  'type': 'User', 
            #  'user_view_type': 'public', 
            #  'site_admin': False, 
            #  'name': None, 
            #  'company': None, 
            #  'blog': '', 
            #  'location': None, 
            #  'email': None, 
            #  'hireable': None, 
            #  'bio': None, 
            #  'twitter_username': None, 
            #  'public_repos': 19, 
            #  'public_gists': 0, 
            #  'followers': 7, 
            #  'following': 0, 
            #  'created_at': '2016-07-25T09:46:41Z', 
            #  'updated_at': '2025-04-13T05:38:35Z'}

            name_surname = user_data.get("name", "").split() if user_data.get("name") else None
            name = name_surname[0] if name_surname else None
            surname = " ".join(name_surname[1:]) if (name_surname and len(name_surname) > 1) else None

            

            if user_data.get("site_admin", False):
                # Append the raw response to a log file for debugging/auditing
                with open("github_user_api_responses.log", "a", encoding="utf-8") as logf:
                    logf.write(f"site_admin {username}: {response.text}\n")
            
            if user_data.get("type", "User") != "User":
                # Append the raw response to a log file for debugging/auditing
                with open("github_user_api_responses.log", "a", encoding="utf-8") as logf:
                    logf.write(f"{username}: {response.text}\n")

            res = {
                "_id": username,
                "password": return_bycript_password(username),  # Use username as password for testing simplicity
                "email": user_data.get("email", None),
                "name": name,
                "surname": surname,
                "nationality": user_data.get("location", None),
                "followerNumber": user_data.get("followers", 0),
                "followingNumber": user_data.get("following", 0),
                "commentIds": user_ids_linking[username]['commentIds'] if (username in user_ids_linking and len(user_ids_linking[username]['commentIds'])) else None,
                "projectIds": user_ids_linking[username]['projectIds'] if (username in user_ids_linking and len(user_ids_linking[username]['projectIds'])) else None,
                "commitIds": user_ids_linking[username]['commitIds'] if (username in user_ids_linking and len(user_ids_linking[username]['commitIds'])) else None,
                "isAdmin": user_data.get("site_admin", False),
                "registrationDate": user_data.get("created_at", datetime.now().isoformat())
            }

            # return only non None values of res
            return {k: v for k, v in res.items() if v is not None}

        elif response.status_code == 404:
            print(f"User {username} not found")
            return None
        elif response.status_code == 403:
            print(f"Rate limit exceeded for user {username}. Skipping...")
            return None
        else:
            print(f"Error fetching user {username}: {response.status_code}")
            return None
            
    except Exception as e:
        print(f"Exception fetching user {username}: {e}")
        return None


In [None]:

# Convert users set to list for processing
#users_list = list(users) # uncomment first time
print(f"Total users to fetch: {len(users_list)}")

# Fetch all users
#all_users = []  # uncomment first time
#for i, username in enumerate(users_list[5813:]):
for i in range(5813, len(users_list)):
    username = users_list[i]
    print(f"Fetching user {i+1}/{len(users_list)}: {username}")
    
    user_info = fetch_user_info(username)
    if user_info:
        all_users.append(user_info)
    
    # Add small delay between requests to be respectful
    time.sleep(0.8) # api limit is 5k per hour

# Save final users data
with open('mongo_users.json', 'w') as f:
    json.dump(all_users, f, indent=2)

print(f"\nCompleted! Fetched {len(all_users)} users out of {len(users_list)} total users.")
print(f"Data saved to mongo_users.json")

In [81]:
len(all_users)
with open('mongo_users_corretto.json', 'w') as f:
    json.dump(all_users, f, indent=2)

In [78]:
all_users

[{'_id': 'wec246810',
  'password': '$2b$12$us2mBD923jOqc9OIaZp6uetbRDZHSRFP78jF0DexX3aErsO6bjlOO',
  'followerNumber': 7,
  'followingNumber': 0,
  'isAdmin': False,
  'registrationDate': '2016-07-25T09:46:41Z',
  'commentIds': ['68543ace8e6211d047bfaac4']},
 {'_id': 'mohitsatr',
  'password': '$2b$12$O1kgrXjatzuq3/A88iw4Pu2gqvP0yM88GNAbkj6Mdo2Kp5GoPAloC',
  'email': 'atrrmohit@gmail.com',
  'name': 'Mohit-Attry',
  'nationality': 'Punjab, India',
  'followerNumber': 3,
  'followingNumber': 30,
  'isAdmin': False,
  'registrationDate': '2021-08-21T15:32:28Z',
  'commentIds': ['68543adb8e6211d047c2c886',
   '68543adb8e6211d047c2c88c',
   '68543adb8e6211d047c2c88e',
   '68543adb8e6211d047c2c893',
   '68543adb8e6211d047c2c8a4',
   '68543adb8e6211d047c2c8af',
   '68543adb8e6211d047c2c8b1',
   '68543adb8e6211d047c2c8b5']},
 {'_id': 'zhangchaoxu',
  'password': '$2b$12$fIRelQTj5iHWoxwXYl0r8uTBseZcPBow0KpVD7d8Ue5z4Cy9Xk6Re',
  'email': 'zhangchaoxu@gmail.com',
  'name': 'Charles',
  'surname

In [77]:
#TODO rinomicare comments, projects in commitIds, projectIds

for i in range(len(all_users)):
    if 'comments' in all_users[i]:
        all_users[i]['commentIds'] = all_users[i]['comments']
    if 'projects' in all_users[i]:
        all_users[i]['projectIds'] = all_users[i]['projects']
    if 'comments' in all_users[i]:
        del all_users[i]['comments']
    if 'projects' in all_users[i]:
        del all_users[i]['projects']
    

In [22]:
print(len(all_users))
print(len(users))

usernames_in_all_users = set(u['_id'] for u in all_users)
missing_users = users - usernames_in_all_users
print(f"Users in 'users' but not in 'all_users': {len(missing_users)}")
print(list(missing_users)[:10])  # print first 10 as example

# Fetch info for missing users and append to all_users
for i, username in enumerate(missing_users):
    print(f"Fetching missing user {i+1}/{len(missing_users)}: {username}")
    user_info = fetch_user_info(username)
    if user_info:
        all_users.append(user_info)
    time.sleep(0.8)  # Respect API rate limits

print(f"After adding missing users, total users: {len(all_users)}")

25865
10057
Users in 'users' but not in 'all_users': 0
[]
After adding missing users, total users: 25865


In [24]:
# Save final users data
with open('mongo_users.json', 'w') as f:
    json.dump(all_users, f, indent=2)

In [18]:
import json

with open('mongo_users.json', 'r') as f:
    all_users = json.load(f)
print(f"Loaded {len(all_users)} users from mongo_users.json")

with open('mongo_projects.json', 'r') as f:
    all_projects = json.load(f)
print(f"Loaded {len(all_projects)} projects from mongo_projects.json")



Loaded 25476 users from mongo_users.json
Loaded 1830 projects from mongo_projects.json


In [19]:
len(all_users)

25476

In [20]:
# Add 'owner' to 'administrators' for each project
for project in all_projects:
    if 'administrators' in project:
        if project['owner'] not in project['administrators']:
            project['administrators'].append(project['owner'])
    else:
        project['administrators'] = [project['owner']]

# Collect all administrators into a set 'user'
users = set()
for project in all_projects:
    for admin in project['administrators']:
        users.add(admin)

In [23]:
25865-25476

389

In [29]:
import json

for project in all_projects:
    owner = project.get('owner')
    admins = project.get('administrators', [])
    if not owner in admins:
        print(f"Owner '{owner}' is not in administrators for project '{project.get('name')}'")

with open('mongo_projects.json', 'w') as f:
    json.dump(all_projects, f, indent=2)


In [2]:
import json
with open('mongo_projects.json', 'r') as f:
    all_projects = json.load(f)
print(f"Loaded {len(all_projects)} projects from mongo_projects.json")

with open('mongo_files.json', 'r') as f:
    all_files = json.load(f)
print(f"Loaded {len(all_files)} files from mongo_files.json")

# Build a mapping from file ID to project IDs
file_to_projects = {}
for project in all_projects:
    for fid in project.get('fileIds', []):
        if fid in file_to_projects:
            print('Error')
            break
        file_to_projects[fid] = project['_id']

# Add 'projectIds' field to each file
for f in all_files:
    f['projectId'] = file_to_projects.get(f['_id'], [])

Loaded 1830 projects from mongo_projects.json
Loaded 44964 files from mongo_files.json


In [4]:
all_files[0]

# save to neo4j
with open('neo4j_files.json', 'w') as f:
    json.dump(all_files, f, indent=2)