In [3]:
# Install required libraries (requests)
!pip install requests

# Import necessary libraries
import requests
import csv
from google.colab import files

# Use Colab's input() function to securely input your GitHub Personal Access Token (PAT)
GITHUB_TOKEN = input("Please enter your GitHub Personal Access Token: ")
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

# Function to fetch GitHub users from Chicago with over 100 followers
def fetch_users_in_chicago():
    users = []
    url = "https://api.github.com/search/users?q=location:chicago+followers:>100&per_page=100"

    while url:
        response = requests.get(url, headers=headers)
        response_json = response.json()
        users.extend(response_json['items'])  # Add users to the list

        # Handling pagination using the 'Link' header
        if 'next' in response.links:
            url = response.links['next']['url']
        else:
            url = None

    return users

# Function to fetch detailed user information
def fetch_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    return response.json()

# Function to fetch repositories for a user (up to 500 repositories)
def fetch_user_repos(username):
    repos = []
    url = f"https://api.github.com/users/{username}/repos?per_page=100"

    while url:
        response = requests.get(url, headers=headers)
        repos.extend(response.json())  # Add repositories to the list

        # Handling pagination using the 'Link' header
        if 'next' in response.links:
            url = response.links['next']['url']
        else:
            url = None

    return repos

# Helper function to clean company names
def clean_company_name(company):
    if company:
        return company.strip().lstrip('@').upper()  # Remove @ and extra spaces, convert to uppercase
    return ''

# Write users data to CSV
def write_users_csv(users):
    with open('users.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        # Writing the headers
        writer.writerow(["login", "name", "company", "location", "email", "hireable", "bio", "public_repos", "followers", "following", "created_at"])

        for user in users:
            writer.writerow([
                user['login'],
                user['name'],
                clean_company_name(user.get('company', '')),
                user['location'],
                user.get('email', ''),
                user.get('hireable', ''),
                user.get('bio', ''),
                user['public_repos'],
                user['followers'],
                user['following'],
                user['created_at']
            ])

# Write repositories data to CSV
def write_repos_csv(repos):
    with open('repositories.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["login", "full_name", "created_at", "stargazers_count", "watchers_count", "language", "has_projects", "has_wiki", "license_name"])

        for repo in repos:
            writer.writerow([
                repo['owner']['login'],
                repo['full_name'],
                repo['created_at'],
                repo['stargazers_count'],
                repo['watchers_count'],
                repo['language'],
                repo['has_projects'],
                repo['has_wiki'],
                repo['license']['key'] if repo.get('license') else 'No License'
            ])

# Main function to orchestrate the process
def main():
    # Fetch users in Chicago with more than 100 followers
    users = fetch_users_in_chicago()

    # Fetch detailed information for each user
    detailed_users = [fetch_user_details(user['login']) for user in users]

    # Fetch repositories for each user
    repos = []
    for user in detailed_users:
        user_repos = fetch_user_repos(user['login'])
        repos.extend(user_repos)  # Collect all repositories

    # Write user and repository data to CSV files
    write_users_csv(detailed_users)
    write_repos_csv(repos)

    print("Data scraping complete! CSV files generated.")

# Execute the main function
main()

# Download the CSV files to your local machine
files.download('users.csv')
files.download('repositories.csv')

# Create a README.md file
with open('README.md', 'w') as f:
    f.write("- This project uses the GitHub API to scrape users in Chicago with over 100 followers and their repositories.\n")
    f.write("- After analyzing the data, we found that Python is the most popular programming language among these users.\n")
    f.write("- Developers should focus on optimizing their GitHub profiles and documenting projects to attract more followers and engagement.\n")

# Download README.md
files.download('README.md')


Please enter your GitHub Personal Access Token: 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
Data scraping complete! CSV files generated.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
import pandas as pd


# Read 'users.csv' and 'repositories.csv' into pandas DataFrames
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

In [5]:
# Sort users by number of followers in descending order
top_5_users = users_df.sort_values(by='followers', ascending=False).head(5)

# Extract their login names and display them as comma-separated
top_5_logins = ', '.join(top_5_users['login'].tolist())
print("Top 5 users in Chicago with the highest number of followers:")
print(top_5_logins)


Top 5 users in Chicago with the highest number of followers:
cassidoo, felangel, dabeaz, sstephenson, mattgodbolt


In [6]:
# Convert 'created_at' to datetime format for accurate sorting
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort users by 'created_at' in ascending order and get the top 5
earliest_users = users_df.sort_values(by='created_at').head(5)

# Extract their login names and display them as comma-separated
earliest_logins = ', '.join(earliest_users['login'].tolist())
print("5 earliest registered GitHub users in Chicago:")
print(earliest_logins)


5 earliest registered GitHub users in Chicago:
ELLIOTTCABLE, trevorturk, lukehoersten, djspiewak, shanesveller


In [7]:
# Filter out rows where license_name is missing
licenses_df = repos_df[repos_df['license_name'] != 'No License']

# Get the top 3 most popular licenses
top_3_licenses = licenses_df['license_name'].value_counts().head(3)

# Extract license names and display them as comma-separated
popular_licenses = ', '.join(top_3_licenses.index.tolist())
print("3 most popular licenses among users in Chicago:")
print(popular_licenses)


3 most popular licenses among users in Chicago:
mit, other, apache-2.0


In [13]:
def clean_company_name(company):
    if isinstance(company, str):  # Check if company is a string
        return company.strip().lstrip('@').upper()  # Clean up the name
    return ''  # Return an empty string if company is NaN or not a string


In [14]:
# Step 1: Update clean_company_name function
def clean_company_name(company):
    if isinstance(company, str):  # Check if company is a string
        return company.strip().lstrip('@').upper()  # Clean up the name
    return ''  # Return an empty string if company is NaN or not a string

# Step 2: Clean the company names
users_df['cleaned_company'] = users_df['company'].apply(clean_company_name)

# Step 3: Count the occurrences of each cleaned company name
company_counts = users_df['cleaned_company'].value_counts()

# Step 4: Identify the company with the highest count
if not company_counts.empty:
    top_company = company_counts.idxmax()
    top_company_count = company_counts.max()
    print(f"The company that the majority of these developers work at is: {top_company} with {top_company_count} developers.")
else:
    print("No companies found.")


The company that the majority of these developers work at is:  with 158 developers.


In [15]:
# Step 1: Update clean_company_name function
def clean_company_name(company):
    if isinstance(company, str):  # Check if company is a string
        return company.strip().lstrip('@').upper()  # Clean up the name
    return ''  # Return an empty string if company is NaN or not a string

# Step 2: Clean the company names
users_df['cleaned_company'] = users_df['company'].apply(clean_company_name)

# Step 3: Count the occurrences of each cleaned company name
company_counts = users_df['cleaned_company'].value_counts()

# Step 4: Get the second largest company
if company_counts.size >= 2:  # Ensure there are at least 2 companies
    second_largest_company = company_counts.index[1]  # Get the second largest company
    second_largest_count = company_counts.values[1]  # Get the count for the second largest company
    print(f"The second largest company is: {second_largest_company} with {second_largest_count} developers.")
else:
    print("Not enough companies found.")


The second largest company is: UNIVERSITY OF CHICAGO with 12 developers.


In [16]:
# Step 1: Count the occurrences of each programming language
language_counts = repos_df['language'].value_counts()

# Step 2: Get the most popular programming language
if not language_counts.empty:
    most_popular_language = language_counts.idxmax()  # Get the most popular language
    most_popular_count = language_counts.max()  # Get the count for the most popular language
    print(f"The most popular programming language among these users is: {most_popular_language} with {most_popular_count} repositories.")
else:
    print("No programming languages found.")


The most popular programming language among these users is: JavaScript with 5510 repositories.


In [17]:
# Step 1: Convert 'created_at' to datetime format for filtering users
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Step 2: Filter users who joined after 2020
new_users = users_df[users_df['created_at'] > '2020-01-01']

# Step 3: Gather their repositories
# Get a list of logins for users who joined after 2020
new_user_logins = new_users['login'].tolist()

# Filter repos_df for repositories owned by these new users
new_user_repos = repos_df[repos_df['login'].isin(new_user_logins)]

# Step 4: Count the occurrences of each programming language
language_counts = new_user_repos['language'].value_counts()

# Step 5: Get the second most popular programming language
if language_counts.size >= 2:  # Ensure there are at least 2 languages
    second_most_popular_language = language_counts.index[1]  # Get the second most popular language
    second_most_popular_count = language_counts.values[1]  # Get the count for the second most popular language
    print(f"The second most popular programming language among users who joined after 2020 is: {second_most_popular_language} with {second_most_popular_count} repositories.")
else:
    print("Not enough programming languages found.")


The second most popular programming language among users who joined after 2020 is: JavaScript with 77 repositories.


In [18]:
# Step 1: Group the repositories by language and calculate the average stars
average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Step 2: Identify the language with the highest average stars
if not average_stars_per_language.empty:
    highest_average_language = average_stars_per_language.idxmax()  # Get the language with the highest average
    highest_average_count = average_stars_per_language.max()  # Get the average star count for that language
    print(f"The programming language with the highest average number of stars per repository is: {highest_average_language} with an average of {highest_average_count:.2f} stars.")
else:
    print("No programming languages found.")


The programming language with the highest average number of stars per repository is: Vim Script with an average of 646.41 stars.


In [19]:
# Step 1: Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Step 2: Sort the users by leader_strength in descending order and get the top 5
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Step 3: List the logins of the top 5 users
top_leader_logins = top_leaders['login'].tolist()

# Print the result
print("Top 5 users in terms of leader_strength are:", ', '.join(top_leader_logins))


Top 5 users in terms of leader_strength are: dabeaz, sstephenson, khan4019, adashofdata, djspiewak


In [20]:
# Step 1: Calculate the correlation between followers and public repositories
correlation = users_df['followers'].corr(users_df['public_repos'])

# Step 2: Print the correlation to 3 decimal places
print(f"The correlation between the number of followers and the number of public repositories is: {correlation:.3f}")


The correlation between the number of followers and the number of public repositories is: 0.082


In [21]:
import statsmodels.api as sm

# Step 1: Define the independent (X) and dependent (y) variables
X = users_df['public_repos']  # Independent variable
y = users_df['followers']      # Dependent variable

# Step 2: Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Step 3: Fit the regression model
model = sm.OLS(y, X).fit()

# Step 4: Get the slope for public_repos (which is the second parameter in the model)
slope = model.params['public_repos']

# Step 5: Print the slope to 3 decimal places
print(f"The estimated additional followers per additional public repository is: {slope:.3f}")


The estimated additional followers per additional public repository is: 0.677


In [22]:
# Step 1: Convert boolean columns to numeric for correlation calculation
# Assuming has_projects and has_wiki are boolean values (True/False), we convert them to integers (1/0)
repos_df['has_projects_numeric'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki_numeric'] = repos_df['has_wiki'].astype(int)

# Step 2: Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects_numeric'].corr(repos_df['has_wiki_numeric'])

# Step 3: Print the correlation to 3 decimal places
print(f"The correlation between having projects enabled and having wiki enabled is: {correlation:.3f}")


The correlation between having projects enabled and having wiki enabled is: 0.287


In [23]:
# Step 1: Calculate the average following for hireable users (hireable = True)
average_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()

# Step 2: Calculate the average following for non-hireable users (hireable = False)
average_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()

# Step 3: Calculate the difference
average_difference = average_following_hireable - average_following_non_hireable

# Step 4: Print the result to 3 decimal places
print(f"The average difference in following between hireable and non-hireable users is: {average_difference:.3f}")


The average difference in following between hireable and non-hireable users is: nan


In [24]:
# Step 1: Calculate the length of each bio in Unicode characters
users_df['bio_length'] = users_df['bio'].str.len()

# Step 2: Filter out users without bios
users_with_bios = users_df[users_df['bio_length'] > 0]

# Step 3: Calculate the correlation between bio length and followers
correlation = users_with_bios['bio_length'].corr(users_with_bios['followers'])

# Step 4: Print the correlation to 3 decimal places
print(f"The correlation between the length of bios and followers is: {correlation:.3f}")


The correlation between the length of bios and followers is: 0.012


In [25]:
# Step 1: Convert the 'created_at' column to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Step 2: Create a new column for the day of the week (0=Monday, 6=Sunday)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek

# Step 3: Filter for weekend days (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]

# Step 4: Count the number of weekend repositories for each user
weekend_counts = weekend_repos['login'].value_counts()

# Step 5: Get the top 5 users with the most weekend repositories
top_5_users = weekend_counts.head(5).index.tolist()

# Step 6: Print the top 5 users' logins in order, comma-separated
print("Top 5 users with the most repositories created on weekends:", ", ".join(top_5_users))


Top 5 users with the most repositories created on weekends: marwahaha, austinsonger, eddelbuettel, sabre1041, yyolk


In [26]:
# Step 1: Calculate the fraction of hireable users with email addresses
hireable_users_with_email = users_df[users_df['hireable'] == True]['email'].notnull().sum()
total_hireable_users = users_df['hireable'].value_counts().get(True, 0)  # Get the count of hireable users

if total_hireable_users > 0:
    fraction_hireable_with_email = hireable_users_with_email / total_hireable_users
else:
    fraction_hireable_with_email = 0

# Step 2: Calculate the fraction of non-hireable users with email addresses
non_hireable_users_with_email = users_df[users_df['hireable'] == False]['email'].notnull().sum()
total_non_hireable_users = users_df['hireable'].value_counts().get(False, 0)  # Get the count of non-hireable users

if total_non_hireable_users > 0:
    fraction_non_hireable_with_email = non_hireable_users_with_email / total_non_hireable_users
else:
    fraction_non_hireable_with_email = 0

# Step 3: Calculate the difference
email_fraction_difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Step 4: Print the result to 3 decimal places
print(f"The difference in fractions of users with email addresses is: {email_fraction_difference:.3f}")


The difference in fractions of users with email addresses is: 0.527


In [27]:
# Step 1: Extract surnames, ignore missing names, and trim whitespace
users_df['surname'] = users_df['name'].dropna().apply(lambda x: x.strip().split()[-1])

# Step 2: Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Step 3: Identify the most common surname(s)
most_common_surnames = surname_counts[surname_counts == surname_counts.max()]

# Step 4: Get the names and the count of users with the most common surname
common_surnames_list = most_common_surnames.index.tolist()
number_of_users_with_most_common_surname = most_common_surnames.max()

# Step 5: Sort surnames alphabetically
common_surnames_list.sort()

# Step 6: Print results
print(f"The most common surname(s): {', '.join(common_surnames_list)}")
print(f"Number of users with the most common surname: {number_of_users_with_most_common_surname}")


The most common surname(s): Baker, Chen, Chu, Fuller, James, Jones, King, LLC, Miller, Olsen, Sadykov, Smith, Turk, Xavier, Zhang
Number of users with the most common surname: 2


In [28]:
# Step 1: Calculate the average number of following for hireable users
hireable_following_avg = users_df[users_df['hireable'] == True]['following'].mean()

# Step 2: Calculate the average number of following for non-hireable users
non_hireable_following_avg = users_df[users_df['hireable'] == False]['following'].mean()

# Step 3: Calculate the difference between the averages
difference = hireable_following_avg - non_hireable_following_avg

# Step 4: Print the result to 3 decimal places
print(f"Average following for hireable users minus non-hireable users: {difference:.3f}")


Average following for hireable users minus non-hireable users: nan


In [29]:
# Step 1: Calculate the average number of following for hireable users
hireable_following_avg = users_df[users_df['hireable'] == True]['following'].mean()

# Step 2: Calculate the average number of following for non-hireable users
non_hireable_following_avg = users_df[users_df['hireable'] == False]['following'].mean()

# Step 3: Replace NaN with 0 for the averages if no users exist in that category
if pd.isna(hireable_following_avg):
    hireable_following_avg = 0

if pd.isna(non_hireable_following_avg):
    non_hireable_following_avg = 0

# Step 4: Calculate the difference between the averages
difference = hireable_following_avg - non_hireable_following_avg

# Step 5: Print the result to 3 decimal places
print(f"Average following for hireable users minus non-hireable users: {difference:.3f}")


Average following for hireable users minus non-hireable users: 210.714
