In [1]:
from datetime import datetime
from getpass import getpass

import pandas as pd
import requests

# GitHub personal access token.
# Read with getpass so the secret is not echoed into the cell output (and
# therefore not saved with the notebook); strip stray whitespace from pasting.
TOKEN = getpass("Enter API Key: ").strip()
headers = {"Authorization": f"token {TOKEN}"}

# Function to get Boston users
def get_Boston_users(per_page=100, max_results=1000):
    """Collect GitHub users located in Boston with more than 100 followers.

    Pages through the user-search endpoint until a page comes back empty or
    short, or until ``max_results`` users have been collected.

    Args:
        per_page: Page size requested from the API.
        max_results: Upper bound on the number of users returned. The default
            matches the search API's documented 1000-result cap, so the loop
            never asks for a page beyond it.

    Returns:
        list[dict]: The search result items, in the order the API returned them.

    Raises:
        requests.HTTPError: If any page request returns an error status
            (for example an invalid token or an exhausted rate limit).
    """
    users = []
    page = 1
    while len(users) < max_results:
        response = requests.get(
            "https://api.github.com/search/users",
            headers=headers,
            params={
                "q": "location:Boston followers:>100",
                "per_page": per_page,
                "page": page,
            },
            timeout=30,
        )
        # An error payload has no 'items'; without this check it would be
        # indistinguishable from "no more results" and yield a truncated list.
        response.raise_for_status()

        items = response.json().get('items') or []
        if not items:
            break

        users.extend(items)
        if len(items) < per_page:
            # Short page: this was the last one, no need for another request.
            break
        page += 1
    return users[:max_results]

# Function to get user details
def get_user_details(username):
    """Fetch the profile of a single GitHub user.

    Args:
        username: The user's login, interpolated into the request URL.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status, so that
            an error payload is never handed back as if it were a profile.
    """
    url = f"https://api.github.com/users/{username}"
    # requests has no default timeout; set one so a stalled connection
    # cannot hang the whole collection run.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()

# Function to get repositories
def get_user_repos(username, max_repos=500):
    """Fetch up to ``max_repos`` repositories for one GitHub user.

    Requests pages of 100 repositories until an empty or short page is seen,
    or until ``max_repos`` entries have been collected.

    Args:
        username: The user's login, interpolated into the request URL.
        max_repos: Maximum number of repositories to return (default 500).

    Returns:
        list[dict]: Repository objects in the order the API returned them,
        truncated to ``max_repos``.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    per_page = 100
    repos = []
    page = 1
    while len(repos) < max_repos:
        response = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"page": page, "per_page": per_page},
            timeout=30,
        )
        # An error body is a non-empty dict; extending the list with it would
        # append its keys as strings and keep the loop requesting pages.
        response.raise_for_status()

        batch = response.json()
        if not batch:
            break

        repos.extend(batch)
        if len(batch) < per_page:
            # Short page means there is nothing further to fetch.
            break
        page += 1
    return repos[:max_repos]

# Clean company name
def clean_company(company):
    """Normalise a company name for grouping.

    Trims surrounding whitespace, removes leading '@' characters, trims again
    and upper-cases the result.

    Args:
        company: Raw company string, or a falsy value (None / "") if unset.

    Returns:
        str: The normalised name, or "" when ``company`` is falsy.
    """
    if not company:
        return ""
    # Second strip(): dropping '@' can expose whitespace ("@ Acme" -> " Acme"),
    # which would otherwise make the same company count as a different value.
    return company.strip().lstrip('@').strip().upper()

In [None]:
# Get users
Boston_users = get_Boston_users()

# Rows for the two output tables, accumulated as plain dicts and turned into
# DataFrames once at the end.
users_data = []
repos_data = []

for user in Boston_users:
    # The search result only carries a summary; fetch the full profile.
    details = get_user_details(user['login'])
    login = details['login']

    # A key that is present with a null value comes back from .get() as None
    # no matter what default is passed, so null fields are normalised
    # explicitly. For 'hireable' this avoids writing the literal string "none".
    hireable = details.get('hireable')

    # Add user data
    users_data.append({
        'login': login,
        'name': details.get('name') or '',
        'company': clean_company(details.get('company')),
        'location': details.get('location') or '',
        'email': details.get('email') or '',
        'hireable': '' if hireable is None else str(hireable).lower(),
        'bio': details.get('bio') or '',
        'public_repos': details['public_repos'],
        'followers': details['followers'],
        'following': details['following'],
        'created_at': details['created_at']
    })

    # Get repositories
    for repo in get_user_repos(login):
        # 'license' may be null; fall back to an empty key in that case.
        license_info = repo.get('license') or {}

        repos_data.append({
            'login': login,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo.get('language') or '',
            'has_projects': str(repo['has_projects']).lower(),
            'has_wiki': str(repo['has_wiki']).lower(),
            'license_name': license_info.get('key') or ''
        })

# Save to CSV
pd.DataFrame(users_data).to_csv('users.csv', index=False)
pd.DataFrame(repos_data).to_csv('repositories.csv', index=False)

In [None]:
# import pandas as pd
# import numpy as np
# from datetime import datetime

# # Read the CSV files
# users_df = pd.read_csv('users.csv')
# repos_df = pd.read_csv('repositories.csv')

# # 1. Top 5 users by followers
# def top_followers():
#     return ','.join(users_df.nlargest(5, 'followers')['login'].tolist())

# # 2. 5 earliest registered users
# def earliest_users():
#     users_df['created_at'] = pd.to_datetime(users_df['created_at'])
#     return ','.join(users_df.nsmallest(5, 'created_at')['login'].tolist())

# # 3. Most popular licenses
# def popular_licenses():
#     return ','.join(repos_df['license_name'].value_counts().head(3).index.tolist())

# # 4. Most common company
# def most_common_company():
#     return users_df['company'].mode()[0]

# # 5. Most popular language
# def most_popular_language():
#     return repos_df['language'].mode()[0]

# # 6. Second most popular language for users after 2020
# def second_popular_language_post_2020():
#     users_df['created_at'] = pd.to_datetime(users_df['created_at'])
#     recent_users = users_df[users_df['created_at'].dt.year > 2020]['login']
#     recent_repos = repos_df[repos_df['login'].isin(recent_users)]
#     return recent_repos['language'].value_counts().index[1]

# # 7. Language with highest average stars
# def highest_avg_stars_language():
#     return repos_df.groupby('language')['stargazers_count'].mean().idxmax()

# # 8. Top 5 by leader_strength
# def top_leader_strength():
#     users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
#     return ','.join(users_df.nlargest(5, 'leader_strength')['login'].tolist())

# # 9. Correlation between followers and repos
# def followers_repos_correlation():
#     return round(users_df['followers'].corr(users_df['public_repos']), 3)

# # 10. Regression slope of followers on repos
# from sklearn.linear_model import LinearRegression
# def followers_repos_regression():
#     X = users_df[['public_repos']]
#     y = users_df['followers']
#     reg = LinearRegression().fit(X, y)
#     return round(reg.coef_[0], 3)

# # 11. Correlation between projects and wiki
# def projects_wiki_correlation():
#     repos_df['has_projects'] = repos_df['has_projects'].map({'true': 1, 'false': 0})
#     repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': 1, 'false': 0})
#     return round(repos_df['has_projects'].corr(repos_df['has_wiki']), 3)

# # 12. Hireable users following difference
# def hireable_following_diff():
#     hireable = users_df[users_df['hireable'] == 'true']['following'].mean()
#     not_hireable = users_df[users_df['hireable'] != 'true']['following'].mean()
#     return round(hireable - not_hireable, 3)

# # 13. Bio length correlation with followers
# def bio_followers_correlation():
#     users_df['bio_words'] = users_df['bio'].fillna('').str.split().str.len()
#     users_with_bio = users_df[users_df['bio_words'] > 0]
#     X = users_with_bio[['bio_words']]
#     y = users_with_bio['followers']
#     reg = LinearRegression().fit(X, y)
#     return round(reg.coef_[0], 3)

# # 14. Top weekend repository creators
# def weekend_repo_creators():
#     repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
#     weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek.isin([5, 6])]
#     weekend_counts = weekend_repos['login'].value_counts()
#     return ','.join(weekend_counts.head(5).index.tolist())

# # 15. Hireable email sharing difference
# def hireable_email_diff():
#     hireable_email = users_df[users_df['hireable'] == 'true']['email'].notna().mean()
#     other_email = users_df[users_df['hireable'] != 'true']['email'].notna().mean()
#     return round(hireable_email - other_email, 3)

# # 16. Most common surname
# def most_common_surname():
#     users_df['surname'] = users_df['name'].str.split().str[-1]
#     return users_df['surname'].value_counts().iloc[0]

# # Print all results
# print("1.", top_followers())
# print("2.", earliest_users())
# print("3.", popular_licenses())
# print("4.", most_common_company())
# print("5.", most_popular_language())
# print("6.", second_popular_language_post_2020())
# print("7.", highest_avg_stars_language())
# print("8.", top_leader_strength())
# print("9.", followers_repos_correlation())
# print("10.", followers_repos_regression())
# print("11.", projects_wiki_correlation())
# print("12.", hireable_following_diff())
# print("13.", bio_followers_correlation())
# print("14.", weekend_repo_creators())
# print("15.", hireable_email_diff())
# print("16.", most_common_surname())