# In [None]:

import os

import pandas as pd
import requests

# Read the GitHub token from the GITHUB_TOKEN environment variable so the
# credential never has to be stored in the notebook itself.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

# Only send an Authorization header when a token is actually configured;
# without one the requests go out unauthenticated instead of carrying a
# placeholder credential.
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

def fetch_users():
    """Collect the GitHub user-search results for Seattle accounts.

    Pages through ``/search/users`` with the query
    ``location:seattle followers:>200`` and accumulates the ``items`` of
    every page.

    Returns:
        list[dict]: The raw search entries in the order the API returned
        them.  Paging stops at the first empty page, or at the first
        response that is not JSON or carries no ``items`` key; in the latter
        cases a one-line notice is printed so a failed run is not mistaken
        for an empty result.
    """
    search_url = "https://api.github.com/search/users"
    users = []
    page = 1
    while True:
        # Let requests build and encode the query string; asking for the
        # largest page size keeps the number of search requests low.
        params = {
            "q": "location:seattle followers:>200",
            "per_page": 100,
            "page": page,
        }
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(
            search_url, headers=headers, params=params, timeout=30
        )
        try:
            data = response.json()
        except ValueError:
            print(f"User search stopped on page {page}: "
                  f"HTTP {response.status_code}, body is not JSON")
            break

        if not isinstance(data, dict) or 'items' not in data:
            # Error payloads have no 'items' key; report why paging ended.
            reason = data.get('message', '') if isinstance(data, dict) else ''
            print(f"User search stopped on page {page}: "
                  f"HTTP {response.status_code} {reason}")
            break

        items = data['items']
        if not items:
            break
        users.extend(items)
        page += 1

    return users


def get_detailed_user_info(users):
    """Fetch the full profile of every user found by the search.

    Args:
        users: Iterable of dicts that each carry a ``'login'`` key (the
            entries returned by ``fetch_users``).

    Returns:
        list[dict]: One flat record per successfully fetched profile with
        the keys ``login``, ``name``, ``company``, ``location``, ``email``,
        ``hireable``, ``bio``, ``public_repos``, ``followers``,
        ``following`` and ``created_at``.  Profiles whose request did not
        return a user object are skipped with a printed notice instead of
        aborting the whole run.
    """
    user_data = []
    for user in users:
        login = user['login']
        user_response = requests.get(
            f"https://api.github.com/users/{login}", headers=headers, timeout=30
        )
        try:
            user_info = user_response.json()
        except ValueError:
            user_info = None

        # An error payload has no 'login'; indexing it would raise KeyError
        # and throw away every profile collected so far.
        if not isinstance(user_info, dict) or 'login' not in user_info:
            print(f"Skipping {login}: profile request failed "
                  f"(HTTP {user_response.status_code})")
            continue

        # Normalise the company: trim whitespace, drop leading '@', upper-case.
        raw_company = user_info.get('company')
        company = raw_company.strip().lstrip('@').upper() if raw_company else None

        user_data.append({
            'login': user_info['login'],
            'name': user_info.get('name'),
            'company': company,
            'location': user_info.get('location'),
            'email': user_info.get('email'),
            # The False default applies only when the key is absent; a
            # present null value is passed through as None.
            'hireable': user_info.get('hireable', False),
            'bio': user_info.get('bio'),
            'public_repos': user_info.get('public_repos'),
            'followers': user_info.get('followers'),
            'following': user_info.get('following'),
            'created_at': user_info.get('created_at'),
        })
    return user_data


def save_users_to_csv(user_data):
    """Write the user records to ``users.csv`` without the index column."""
    pd.DataFrame(user_data).to_csv('users.csv', index=False)

# Run the user pipeline: search for matching accounts, fetch each profile,
# and write the flattened records to users.csv.
users = fetch_users()  # raw search results: a list of dicts, not a DataFrame
user_data = get_detailed_user_info(users)
save_users_to_csv(user_data)


def fetch_repos_for_user(user_login, max_repos=None):
    """Fetch the public repositories of one user as flat records.

    Args:
        user_login: GitHub login whose repositories are listed.
        max_repos: Optional cap on the number of records returned.  ``None``
            (the default) follows the pagination to the end, so users with
            more than one page of repositories are no longer truncated.

    Returns:
        list[dict]: One record per repository with the keys ``login``,
        ``full_name``, ``created_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki`` and
        ``license_name`` (``None`` when the repository has no license).
        If a page cannot be read, a notice is printed and the records
        gathered so far are returned.
    """
    page_size = 100
    repo_data = []
    url = f"https://api.github.com/users/{user_login}/repos"
    page = 1
    while max_repos is None or len(repo_data) < max_repos:
        response = requests.get(
            url,
            headers=headers,
            params={"per_page": page_size, "page": page},
            timeout=30,
        )
        try:
            repos = response.json()
        except ValueError:
            repos = None

        # A successful call yields a list; an error payload is a dict, and
        # iterating it would hand dictionary keys to the record builder.
        if not isinstance(repos, list):
            print(f"Repository listing for {user_login} stopped on page {page} "
                  f"(HTTP {response.status_code})")
            break

        for repo in repos:
            license_info = repo.get('license') or {}
            repo_data.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info.get('name'),
            })

        # A page that is not full is the last one.
        if len(repos) < page_size:
            break
        page += 1

    if max_repos is not None:
        del repo_data[max_repos:]
    return repo_data



def fetch_all_repositories():
    """Return the repository records of every login listed in ``users.csv``.

    The logins are read from the ``login`` column of the saved CSV and the
    per-user lists from ``fetch_repos_for_user`` are concatenated in file
    order.
    """
    logins = pd.read_csv('users.csv')['login']
    return [
        repo_record
        for user_login in logins
        for repo_record in fetch_repos_for_user(user_login)
    ]

# Download every user's repositories and persist them to repositories.csv.
repos = fetch_all_repositories()
pd.DataFrame(repos).to_csv('repositories.csv', index=False)


# Reload both tables from disk so the analysis works on the saved data.
users_df, repos_df = (
    pd.read_csv(csv_name) for csv_name in ('users.csv', 'repositories.csv')
)


import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression


# Treat a missing hireable flag as False and make the column strictly boolean.
hireable_filled = users_df['hireable'].fillna(False)
users_df['hireable'] = hireable_filled.astype(bool)

#1  Logins of the five users with the most followers.
most_followed = users_df.nlargest(5, 'followers')
top_5_users = most_followed['login'].tolist()
print(','.join(top_5_users))

#2  Logins of the five earliest-registered users.
# `users` is the plain list returned by fetch_users(); the tabular data is in
# users_df, so the column operations have to go through that frame.
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
top_earliest = users_df.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))


#3  Most common repository licenses.
# `repos` is a list of dicts; the DataFrame is repos_df.  Printed explicitly
# because a bare expression shows nothing when the file runs as a script.
print(repos_df['license_name'].value_counts().head(4))


#4  Company with the most users.
# `users` is a list of dicts; the DataFrame is users_df.  Printed explicitly
# because a bare expression shows nothing when the file runs as a script.
print(users_df['company'].value_counts().head(1))



#5  Most common repository language.
# `repos` is a list of dicts; the DataFrame is repos_df.  Printed explicitly
# because a bare expression shows nothing when the file runs as a script.
print(repos_df['language'].value_counts().head(1))

#6  Most common languages among repositories of recently joined users.
# users_after_2020 was never defined, so it is derived here from the account
# creation date.
# NOTE(review): "after 2020" is taken to mean accounts created in 2021 or
# later - confirm the intended cut-off.
joined_at = pd.to_datetime(users_df['created_at'])
users_after_2020 = users_df[joined_at.dt.year > 2020]
repos_2020 = repos_df[repos_df['login'].isin(users_after_2020['login'].tolist())]
print(repos_2020['language'].value_counts().head())

#7  Language with the highest mean star count per repository.
# `repos` is a list of dicts; groupby needs the repos_df DataFrame.
avg_stars = repos_df.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

#8  Top five users by leader strength = followers / (1 + following).
# `users` is a list of dicts; the column arithmetic needs users_df.
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top5_lead = users_df.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

#9  Correlation between follower count and number of public repositories.
# `users` is a list of dicts; Series.corr needs the users_df columns.  The
# value is printed because a bare expression shows nothing in a script.
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f"{correlation:.3f}")


#10  Slope of a straight-line fit of followers against public repositories.
import csv

# Read the saved table once, then pull the two integer columns out of it.
with open('users.csv', 'r', encoding='utf-8') as file:
    user_rows = list(csv.DictReader(file))

followers = [int(row['followers']) for row in user_rows]
public_repos = [int(row['public_repos']) for row in user_rows]

# A line fit needs at least two observations.
if len(followers) <= 1 or len(public_repos) <= 1:
    print("Error")
else:
    slope, intercept = np.polyfit(public_repos, followers, 1)
    print(f"{slope:.3f}")


#11  Correlation between having projects enabled and having the wiki enabled.
# Cast both flag columns to integers before correlating them.
for flag_column in ('has_projects', 'has_wiki'):
    repos_df[flag_column] = repos_df[flag_column].astype(int)

projects_flag = repos_df['has_projects']
wiki_flag = repos_df['has_wiki']
correlation = projects_flag.corr(wiki_flag)
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")



#12  Mean following count: hireable users minus non-hireable users.
users_df = pd.read_csv('users.csv')
# The fresh read brings back missing hireable values.  Normalise them to False
# again; otherwise those rows match neither `== True` nor `== False` and drop
# out of both groups.
users_df['hireable'] = users_df['hireable'].fillna(False).astype(bool)
is_hireable = users_df['hireable']
hireable_avg_following = users_df[is_hireable]['following'].mean()
non_hireable_avg_following = users_df[~is_hireable]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
print(f"Difference in following count: {difference:.3f}")

#13  Correlation between bio length (in characters) and follower count.
# Only users that have a bio take part; users_df is narrowed to those rows.
users_df = users_df.dropna(subset=['bio'])
users_df['bio_length'] = users_df['bio'].map(len)
bio_lengths = users_df['bio_length']
bio_followers_correlation = bio_lengths.corr(users_df['followers'])
print(f"Correlation of bio length with followers: {bio_followers_correlation:.3f}")



#14  Users who created the most repositories on a Saturday or Sunday.
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
# dt.weekday numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
created_on_weekend = repos_df['created_at'].dt.weekday.isin([5, 6])
weekend_repos = repos_df[created_on_weekend]
weekend_repo_counts = weekend_repos['login'].value_counts().head(5)
top_weekend_logins = ", ".join(weekend_repo_counts.index)
print("Top users by weekend-created repos:", top_weekend_logins)



#15  Share of users with a public email: hireable minus non-hireable.
# `users` is a plain list, and users_df has already had its bio-less rows
# dropped by this point, so the full table is reloaded from disk.
email_users = pd.read_csv('users.csv')
# A missing hireable flag counts as not hireable.
hireable_mask = email_users['hireable'].fillna(False).astype(bool)
has_email = email_users['email'].notna()
fraction_hierable = has_email[hireable_mask].mean()
fraction_non_hierable = has_email[~hireable_mask].mean()
diff = fraction_hierable - fraction_non_hierable
print(f"{diff:.3f}")


#16  Most common surname(s), taken as the last whitespace-separated word of
# the name; ties are printed in alphabetical order.
# `users` is a plain list, and users_df has already had its bio-less rows
# dropped by this point, so the full table is reloaded from disk.
all_users = pd.read_csv('users.csv')
new_users = all_users[all_users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))


from google.colab import files

# Offer both generated CSV files for download from the Colab runtime.
for csv_name in ('users.csv', 'repositories.csv'):
    files.download(csv_name)
