In [1]:
import requests
import csv

GITHUB_TOKEN = "token"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_Tokyo():
    users = []
    query = "location:Tokyo+followers:>200"
    page = 1
    per_page = 100
    total_users = 0

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        print(f"Fetching page {page}...")

        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])
        total_users += len(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    detailed_users = []
    for user in users:
        user_info = get_user_details(user['login'])
        detailed_users.append(user_info)

    return detailed_users

def get_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    user_data = requests.get(user_url, headers=HEADERS).json()

    return {
        'login': user_data['login'],
        'name': user_data['name'],
        'company': clean_company_name(user_data['company']),
        'location': user_data['location'],
        'email': user_data['email'],
        'hireable': user_data['hireable'],
        'bio': user_data['bio'],
        'public_repos': user_data['public_repos'],
        'followers': user_data['followers'],
        'following': user_data['following'],
        'created_at': user_data['created_at'],
    }

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith('@'):
            company = company[1:]
    return company

def get_user_repos(username):
    repos_url = f"https://api.github.com/users/{username}/repos?per_page=500"
    response = requests.get(repos_url, headers=HEADERS)
    repos_data = response.json()

    repos = []
    for repo in repos_data:
        repos.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else None,
        })

    return repos

def save_users_to_csv(users):
    with open('users.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])
        writer.writeheader()
        writer.writerows(users)

def save_repos_to_csv(repos):
    with open('repositories.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name'])
        writer.writeheader()
        writer.writerows(repos)

if __name__ == "__main__":
    users = get_users_in_Tokyo()
    save_users_to_csv(users)

    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_repos_to_csv(all_repos)
    print("Done")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Done


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
users = pd.read_csv('users.csv')
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,dennybritz,Denny Britz,,"Tokyo, Japan",,,"High-school dropout. Ex Google Brain, Stanford...",61,9471,6,2010-09-16T18:32:26Z
1,wasabeef,Daichi Furiya,"CYBERAGENT, INC.","Tokyo, Japan",,True,Google Developers Expert for Android,46,9369,60,2012-06-09T13:46:37Z
2,dai-shi,Daishi Kato,FREELANCER,Tokyo,,True,"React library author, maintaining three state ...",128,6945,1,2010-11-21T12:26:31Z
3,rui314,Rui Ueyama,BLUE WHALE SYSTEMS,Tokyo,rui314@gmail.com,,,44,5938,8,2009-01-18T01:29:27Z
4,domenic,Domenic Denicola,GOOGLE,"Tokyo, Japan",d@domenic.me,,,216,5405,79,2011-02-14T15:26:22Z


In [5]:
users['hireable'] = users['hireable'].fillna(False).astype(bool)

  users['hireable'] = users['hireable'].fillna(False).astype(bool)


In [6]:
top5 = users.sort_values(by='followers', ascending=False).head()
print(','.join(top5['login'].tolist()))

dennybritz,wasabeef,dai-shi,rui314,domenic


In [8]:
#q2
users['created_at'] = pd.to_datetime(users['created_at'])
top_earliest = users.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))

kana,kakutani,mootoh,lhl,walf443


In [10]:
#q3
repos = pd.read_csv('repositories.csv')
repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,dennybritz,dennybritz/ablog-content,2020-05-11T13:12:41Z,9,9,Jupyter Notebook,True,True,
1,dennybritz,dennybritz/actioncrawler,2015-12-18T12:08:07Z,1,1,JavaScript,True,True,
2,dennybritz,dennybritz/akka-cluster-deploy,2014-09-12T12:17:07Z,26,26,Scala,True,True,
3,dennybritz,dennybritz/analysis-jobdata,2014-08-21T11:23:39Z,0,0,Python,True,True,
4,dennybritz,dennybritz/arrow-datafusion,2023-02-25T11:11:16Z,0,0,Rust,True,False,apache-2.0


In [11]:
repos['license_name'].value_counts().head(3)

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,11926
apache-2.0,3213
other,3041


In [13]:
#q4
users['company'].value_counts().head(1)

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
GOOGLE,12


In [None]:
#q5


In [14]:
repos['language'].value_counts().head(1)

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,5328


In [15]:
users_after_2020 = users[users['created_at'] > '2020-01-01']
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
15,asahilina,Asahi Lina,ASAHILINUX,Tokyo,lina@asahilina.net,False,"Hello everyone, Asahi Lina here! I'm a Linux d...",17,3016,0,2022-03-21 11:29:54+00:00
16,Sen-Takatsuki,Yoshimura Eto,,"Tokyo, Japan",,False,こんにちは! \r\n(o゜▽゜)o,12,2938,209,2020-07-28 11:07:32+00:00
20,pilcrowonpaper,pilcrow,LUCIA-AUTH,"Tokyo, Japan",pilcrowonpaper@gmail.com,False,I like making stuff,50,2620,0,2021-03-14 10:45:53+00:00
84,SakanaAI,Sakana AI,,Tokyo,info at sakana dot ai,False,On a quest to create a new kind of foundation ...,6,852,0,2023-07-31 00:18:23+00:00
116,codeninja819,Jiro Matsumoto,HASHNINJAS,"Tokyo, Japan",jiromatsumoto98@gmail.com,True,Web3 or Nothing,50,662,5122,2023-02-17 15:07:47+00:00


In [16]:
repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
repos_2020['language'].value_counts().head()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
TypeScript,65
Rust,53
Swift,32
JavaScript,29
HTML,16


In [17]:
avg_stars = repos.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

Assembly 2103.0


In [18]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

blueimp,dai-shi,asahilina,pilcrowonpaper,marcan


In [19]:
correlation = users['followers'].corr(users['public_repos'])
correlation

0.04967727482378189

In [22]:
#q10
import csv
followers = []
public_repos = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        followers_count = int(row['followers'])
        public_repos_count = int(row['public_repos'])
        followers.append(followers_count)
        public_repos.append(public_repos_count)
if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")
else:
    print("Error")

0.272


In [23]:
#q11
if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})

correlation = repos['has_projects'].corr(repos['has_wiki'])

print(round(correlation, 3))

0.38


In [24]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

-78.47418661755677

In [25]:
from sklearn.linear_model import LinearRegression
users_with_bio = users[(users['bio'].notna()) & (users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.len()

X = users_with_bio['bio_len'].values.reshape(-1,1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)
lr2.coef_[0]

2.1901937900610577

In [26]:
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])

            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))

h6ah4i,qnighy,yutkat,syumai,suzuki-shunsuke


In [27]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.1307243707796194

In [28]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Kato,Tanaka


In [29]:
users['hireable'] = users['hireable'].fillna(False).astype(bool)

In [31]:
# prompt: change all empty column(NaN) of hireable to False

users['hireable'] = users['hireable'].fillna(False)

In [33]:
# prompt: download the above updated users.csv which need to change all empty column(NaN) of hireable to False

import requests
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from collections import Counter
from datetime import datetime

GITHUB_TOKEN = "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"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_Tokyo():
    users = []
    query = "location:Tokyo+followers:>200"
    page = 1
    per_page = 100
    total_users = 0

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        print(f"Fetching page {page}...")

        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])
        total_users += len(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    detailed_users = []
    for user in users:
        user_info = get_user_details(user['login'])
        detailed_users.append(user_info)

    return detailed_users

def get_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    user_data = requests.get(user_url, headers=HEADERS).json()

    return {
        'login': user_data['login'],
        'name': user_data['name'],
        'company': clean_company_name(user_data['company']),
        'location': user_data['location'],
        'email': user_data['email'],
        'hireable': user_data['hireable'],
        'bio': user_data['bio'],
        'public_repos': user_data['public_repos'],
        'followers': user_data['followers'],
        'following': user_data['following'],
        'created_at': user_data['created_at'],
    }

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith('@'):
            company = company[1:]
    return company

def get_user_repos(username):
    repos_url = f"https://api.github.com/users/{username}/repos?per_page=500"
    response = requests.get(repos_url, headers=HEADERS)
    repos_data = response.json()

    repos = []
    for repo in repos_data:
        repos.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else None,
        })

    return repos

def save_users_to_csv(users):
    with open('users.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])
        writer.writeheader()
        writer.writerows(users)

def save_repos_to_csv(repos):
    with open('repositories.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name'])
        writer.writeheader()
        writer.writerows(repos)

if __name__ == "__main__":
    users = get_users_in_Tokyo()
    save_users_to_csv(users)

    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_repos_to_csv(all_repos)
    print("Done")
users = pd.read_csv('users.csv')
users.head()
users['hireable'] = users['hireable'].fillna(False).astype(bool)
users.to_csv('users.csv', index=False)
from google.colab import files
files.download('users.csv')

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Done


  users['hireable'] = users['hireable'].fillna(False).astype(bool)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>