# Scraping the Data

In [7]:
import requests
import csv
import re
import time

headers = {
    "Authorization": "Bearer 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",
    "Accept": "application/vnd.github.v3+json"
}

def clean_company_name(company):
    if company:
        company = company.strip()
        if company.startswith("@"):
            company = company[1:]
        return company.upper()
    return None

def fetch_users_in_sydney(min_followers=100):
    users_url = f"https://api.github.com/search/users?q=location:Sydney+followers:>{min_followers}&per_page=100"
    response = requests.get(users_url, headers=headers)
    response.raise_for_status()
    users = response.json().get('items', [])
    return users

def fetch_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    response = requests.get(user_url, headers=headers)
    response.raise_for_status()
    return response.json()

def fetch_user_repositories(username):
    repos = []
    page = 1
    while page <= 5:  
        repos_url = f"https://api.github.com/users/{username}/repos?sort=pushed&per_page=100&page={page}"
        response = requests.get(repos_url, headers=headers)
        response.raise_for_status()
        repos += response.json()
        if len(response.json()) < 100:
            break
        page += 1
        time.sleep(1)  
    return repos

def write_users_to_csv(users):
    with open("users.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["login", "name", "company", "location", "email", "hireable", "bio", "public_repos", "followers", "following", "created_at"])
        
        for user in users:
            user_data = fetch_user_details(user['login'])
            writer.writerow([
                user_data["login"],
                user_data.get("name"),
                clean_company_name(user_data.get("company")),
                user_data.get("location"),
                user_data.get("email"),
                user_data.get("hireable"),
                user_data.get("bio"),
                user_data.get("public_repos"),
                user_data.get("followers"),
                user_data.get("following"),
                user_data.get("created_at")
            ])

def write_repositories_to_csv(users):
    with open("repositories.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["login", "full_name", "created_at", "stargazers_count", "watchers_count", "language", "has_projects", "has_wiki", "license_name"])
        
        for user in users:
            repos = fetch_user_repositories(user['login'])
            for repo in repos:
                writer.writerow([
                    user['login'],
                    repo["full_name"],
                    repo["created_at"],
                    repo["stargazers_count"],
                    repo["watchers_count"],
                    repo["language"],
                    repo["has_projects"],
                    repo["has_wiki"],
                    repo["license"]["key"] if repo["license"] else None
                ])

def main():
    users = fetch_users_in_sydney()

    write_users_to_csv(users)

    write_repositories_to_csv(users)

if __name__ == "__main__":
    main()


# Answers to Each Question

In [20]:
import pandas as pd
user = pd.read_csv('users.csv')

In [21]:
user.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,nicknochnack,Nicholas Renotte,,"Sydney, Australia",,,Data Science guy @IBM,193,11703,5,2013-11-15T15:06:35Z
1,brendangregg,Brendan Gregg,INTEL,"Sydney, Australia",,,Cloud computing performance engineer.,42,8024,0,2011-10-04T08:48:56Z
2,cornflourblue,Jason Watmore,POINT BLANK DEVELOPMENT,Sydney Australia,,,"Web Developer and Blogger, currently focusing ...",187,6701,0,2011-05-13T04:38:16Z
3,0vm,Ak,,"Sydney, Australia",,True,erm hai,10,6569,3,2021-03-02T06:55:10Z
4,davecheney,Dave Cheney,GITHUB,"Sydney, Australia",,,I service society by rocking.,163,6556,42,2008-04-14T04:00:20Z


In [22]:
top_5_users = user.sort_values(by='followers', ascending=False).head(5)
top_5_logins = ", ".join(top_5_users['login'])
top_5_logins

'nicknochnack, brendangregg, cornflourblue, 0vm, davecheney'

In [24]:
earliest_5_users = user.sort_values(by='created_at', ascending=True).head(5)
earliest_5_logins = ", ".join(earliest_5_users['login'])
print("5 Earliest Registered Users:", earliest_5_logins)


5 Earliest Registered Users: dylanegan, cjheath, freshtonic, dhowden, mikel


In [1]:
repo = pd.read_csv('repositories.csv')

NameError: name 'pd' is not defined

In [2]:
repo.head()

NameError: name 'repo' is not defined

In [33]:
from collections import Counter
licenses = repo['license_name'].value_counts()

In [34]:
licenses

mit                   13400
other                  4053
apache-2.0             3944
gpl-3.0                1641
bsd-3-clause           1113
gpl-2.0                 409
bsd-2-clause            352
agpl-3.0                344
unlicense               344
cc0-1.0                 330
isc                     220
cc-by-4.0               185
mpl-2.0                 158
lgpl-3.0                134
lgpl-2.1                111
cc-by-sa-4.0             55
wtfpl                    42
epl-1.0                  40
zlib                     24
mit-0                    21
bsl-1.0                  13
epl-2.0                  11
0bsd                      9
bsd-4-clause              9
bsd-3-clause-clear        8
ms-pl                     7
lppl-1.3c                 3
cecill-2.1                2
afl-3.0                   2
eupl-1.2                  2
artistic-2.0              1
Name: license_name, dtype: int64

In [36]:
company = user['company'].value_counts()
company

ATLASSIAN                                 19
CANVA                                     18
GOOGLE                                     8
MICROSOFT                                  5
UNIVERSITY OF TECHNOLOGY SYDNEY            5
                                          ..
NEUROMASTER-TOKEN                          1
CLOUDWAVE                                  1
BIGFOOTDS                                  1
STORECONNECT, METAPULSE, REINTERACTIVE     1
ALGOLIA (PREV @SAJARI)                     1
Name: company, Length: 148, dtype: int64

In [37]:
lang = repo['language'].value_counts()
lang

JavaScript     6135
Python         4494
TypeScript     1615
Swift          1530
Ruby           1301
               ... 
Rascal            1
Text              1
Vim Snippet       1
GDB               1
MQL4              1
Name: language, Length: 211, dtype: int64

In [39]:
joined_after_2020 = user[user['created_at'] > '2020-01-01']
repos_after_2020 = pd.merge(joined_after_2020[['login']], repo, on='login')
language_counts = repos_after_2020['language'].value_counts()
second_most_popular_language = language_counts.index[1]
print("Second most popular language among users joined after 2020:", second_most_popular_language)

Second most popular language among users joined after 2020: TypeScript


In [40]:
avg_stars = repo.groupby('language')['stargazers_count'].mean()
highest_avg_stars_language = avg_stars.idxmax()
print("Language with the highest average number of stars per repository:", highest_avg_stars_language)

Language with the highest average number of stars per repository: Mermaid


In [41]:
user['leader_strength'] = user['followers'] / (1 + user['following'])
top_5_leaders = user.nlargest(5, 'leader_strength')
top_5_leader_logins = ", ".join(top_5_leaders['login'])
print("Top 5 users by leader strength:", top_5_leader_logins)

Top 5 users by leader strength: brendangregg, cornflourblue, Canva, nicknochnack, 0vm


In [42]:
correlation_followers_repos = user['followers'].corr(user['public_repos'])
print(f"Correlation between followers and public repos: {correlation_followers_repos:.3f}")

Correlation between followers and public repos: 0.035


In [43]:
from scipy.stats import linregress
slope, _, _, _, _ = linregress(user['public_repos'], user['followers'])
print(f"Regression slope of followers on repos: {slope:.3f}")

Regression slope of followers on repos: 0.068


In [7]:
correlation_projects_wiki = repo['has_projects'].corr(repo['has_wiki'])
print(f"Correlation between projects and wiki enabled: {correlation_projects_wiki:.3f}")


Correlation between projects and wiki enabled: 0.220


In [48]:
avg_following_hireable = user[user['hireable'] == True]['following'].mean()
avg_following_non_hireable = user[user['hireable'] == False]['following'].mean()
following_difference = avg_following_hireable - avg_following_non_hireable
print(f"Difference in average following between hireable and non-hireable users: {following_difference:.3f}")


Difference in average following between hireable and non-hireable users: nan


In [49]:
from sklearn.linear_model import LinearRegression

users_with_bio = user[user['bio'].notna()]

users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()

X = users_with_bio[['bio_word_count']]
y = users_with_bio['followers']
model = LinearRegression().fit(X, y)

slope = model.coef_[0]
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: -10.884


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()


In [50]:
repo['created_at'] = pd.to_datetime(repo['created_at'], utc=True)

repos_weekend = repo[repo['created_at'].dt.weekday >= 5]

weekend_repos_counts = repos_weekend['login'].value_counts().head(5)
top_5_weekend_users = ", ".join(weekend_repos_counts.index)
print(f"Top 5 users with the most repositories created on weekends: {top_5_weekend_users}")


Top 5 users with the most repositories created on weekends: johndpope, mvandermeulen, timgates42, mikeyhodl, pinkforest


In [51]:
email_fraction_hireable = user[user['hireable'] == True]['email'].notna().mean()
email_fraction_non_hireable = user[user['hireable'] == False]['email'].notna().mean()
email_difference = email_fraction_hireable - email_fraction_non_hireable
print(f"Difference in email sharing between hireable and non-hireable users: {email_difference:.3f}")

Difference in email sharing between hireable and non-hireable users: nan


In [52]:
user['surname'] = user['name'].dropna().str.strip().str.split().str[-1]
most_common_surnames = user['surname'].value_counts()
max_count = most_common_surnames.iloc[0]
top_surnames = sorted(most_common_surnames[most_common_surnames == max_count].index)
common_surname_str = ", ".join(top_surnames)
print(f"Most common surname(s): {common_surname_str}")

Most common surname(s): Wu, Zhang
