In [None]:
#GITHUB_TOKEN="dummy"

In [43]:
import os
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression


Function to Fetch Users in Seattle with Over 200 FollowersFunction to Fetch Users in Seattle with Over 200 Followers

In [44]:
def fetch_users_in_seattle(min_followers=200):
    url = 'https://api.github.com/search/users'
    headers = {
    "Authorization": f"Bearer {"dummy"}"
}
    params = {'q': 'location:Seattle followers:>200', 'per_page': 100, 'page': 1}
    users = []

    while True:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            print("Error:", response.json())
            break

        data = response.json()
        users.extend(data['items'])

        if len(data['items']) < 100:
            break
        params['page'] += 1

    return users


Function to Clean and Format User Data

In [45]:
def clean_user_data(users):
    user_data = []
    for user in users:
        user_detail = requests.get(user['url'],headers = {
    "Authorization": f"Bearer {"dummy"}"
})
        user_info = user_detail.json()

        cleaned_user = {
            'login': user_info.get('login', ''),
            'name': user_info.get('name', ''),
            'company': (user_info.get('company', '').lstrip('@').upper().strip() if user_info.get('company') else ''),
            'location': user_info.get('location', ''),
            'email': user_info.get('email', ''),
            'hireable': user_info.get('hireable', ''),
            'bio': user_info.get('bio', ''),
            'public_repos': user_info.get('public_repos', ''),
            'followers': user_info.get('followers', ''),
            'following': user_info.get('following', ''),
            'created_at': user_info.get('created_at', '')
        }
        user_data.append(cleaned_user)
    return pd.DataFrame(user_data)


Function to Fetch Repositories for Each User

In [46]:
def fetch_user_repositories(username):
    url = f'https://api.github.com/users/{username}/repos'
    headers = {
    "Authorization": f"Bearer {"dummy"}"
}
    params = {'per_page': 500, 'sort': 'pushed'}
    repos = []

    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        repos = response.json()
    else:
        print("Error fetching repos for", username)

    repo_data = []
    for repo in repos:
        repo_data.append({
            'login': username,
            'full_name': repo.get('full_name', ''),
            'created_at': repo.get('created_at', ''),
            'stargazers_count': repo.get('stargazers_count', ''),
            'watchers_count': repo.get('watchers_count', ''),
            'language': repo.get('language', ''),
            'has_projects': repo.get('has_projects', ''),
            'has_wiki': repo.get('has_wiki', ''),
            'license_name': repo.get('license', {}).get('name', '') if repo.get('license') else ''
        })
    return pd.DataFrame(repo_data)


Fetch and Save Data

In [47]:
# Fetch users and save to CSV
users = fetch_users_in_seattle()
users_df = clean_user_data(users)
users_df.to_csv('users.csv', index=False)

In [49]:
# Fetch repositories and save to CSV
all_repos = pd.concat([fetch_user_repositories(user['login']) for user in users_df.to_dict(orient='records')])
all_repos.to_csv('repositories.csv', index=False)

Analysis and Answer Generation

In [62]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the data
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# Convert 'created_at' column to datetime
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Get top users by followers
top_users = users_df.nlargest(5, 'followers')['login'].tolist()
print("Top Users by Followers:", ', '.join(top_users))

# Get earliest users
earliest_users = users_df.nsmallest(5, 'created_at')['login'].tolist()
print("Earliest Users:", ', '.join(earliest_users))

# Get popular licenses
popular_licenses = repos_df['license_name'].value_counts().nlargest(3).index.tolist()
print("Top Licenses:", ', '.join(popular_licenses))

# Get the most common company
common_company = users_df['company'].value_counts().idxmax()
print("Most Common Company:", common_company)

# Get the most popular language
popular_language = repos_df['language'].value_counts().idxmax()
print("Most Popular Language:", popular_language)

# Get recent users and their second most popular language
recent_users = users_df[users_df['created_at'] >= '2020-01-01']
second_popular_language = repos_df[repos_df['login'].isin(recent_users['login'])]['language'].value_counts().index[1]
print("Second Most Popular Language (Post-2020):", second_popular_language)

# Language with highest average stars
language_avg_stars = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Language with Highest Avg Stars:", language_avg_stars)

# Calculate leader strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leader_strength = users_df.nlargest(5, 'leader_strength')['login'].tolist()
print("Top by Leader Strength:", ', '.join(top_leader_strength))

# Correlation between followers and public repos
correlation = users_df['followers'].corr(users_df['public_repos'])
print("Correlation (Followers vs. Public Repos):", round(correlation, 3))

# Linear regression on followers based on public repos
reg = LinearRegression().fit(users_df[['public_repos']], users_df['followers'])
print("Regression Slope (Followers on Repos):", round(reg.coef_[0], 3))

# Correlation between projects and wiki
corr_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Correlation (Projects vs. Wiki):", round(corr_projects_wiki, 3))

# Difference in following for hireable users
diff_following = users_df[users_df['hireable'] == True]['following'].mean() - users_df[users_df['hireable'] != True]['following'].mean()
print("Difference in Following (Hireable vs. Not):", round(diff_following, 3))

# Followers per word in bio
users_df['bio_word_count'] = users_df['bio'].fillna('').apply(lambda x: len(x.split()))
reg_bio = LinearRegression().fit(users_df[['bio_word_count']], users_df['followers'])
print("Followers per Word in Bio:", round(reg_bio.coef_[0], 3))

# Top weekend repo creators
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
weekend_repos = repos_df[repos_df['created_at'].dt.weekday >= 5]
top_weekend_creators = weekend_repos['login'].value_counts().nlargest(5).index.tolist()
print("Top Weekend Repo Creators:", ', '.join(top_weekend_creators))

# Email sharing difference for hireable vs. non-hireable
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().mean()
non_hireable_with_email = users_df[users_df['hireable'] != True]['email'].notna().mean()
email_diff = round(hireable_with_email - non_hireable_with_email, 3)
print("Email Sharing Diff (Hireable vs. Not):", email_diff)


Top Users by Followers: vczh, bradfitz, munificent, tenderlove, ahmetb
Earliest Users: topfunky, nex3, beccasaurus, eric, grantr
Top Licenses: MIT License, Apache License 2.0, Other
Most Common Company: MICROSOFT
Most Popular Language: JavaScript
Second Most Popular Language (Post-2020): Python
Language with Highest Avg Stars: Haml
Top by Leader Strength: awslabs, mission-peace, karan, cmuratori, nex3
Correlation (Followers vs. Public Repos): 0.203
Regression Slope (Followers on Repos): 2.499
Correlation (Projects vs. Wiki): 0.324
Difference in Following (Hireable vs. Not): 12.953
Followers per Word in Bio: 2.806
Top Weekend Repo Creators: homebysix, brandonbloom, anvaka, ryanoasis, nolanlawson
Email Sharing Diff (Hireable vs. Not): 0.082


In [61]:
import pandas as pd

# Extract surname: last word in each name
users_df['surname'] = users_df['name'].apply(lambda x: x.strip().split()[-1] if pd.notna(x) and len(x.strip().split()) > 0 else '')

# Filter out empty surnames
surnames = users_df['surname'].dropna().replace('', pd.NA).dropna()

# Count occurrences of each surname
surname_counts = surnames.value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count and sort them alphabetically
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()
most_common_surnames.sort()

# Join them into a comma-separated string
result = ', '.join(most_common_surnames)

print("Most Common Surname(s):", result)


Most Common Surname(s): Wang
