In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

In [2]:
tocken="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"

In [5]:
# Step 0: GitHub API token (keep this secure, don't hardcode in production)
GITHUB_TOKEN = tocken
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

# Step 1: Define the base URL for the search query
url = "https://api.github.com/search/users"
params = {
    "q": "location:Dublin followers:>50",
    "per_page": 30,  # Adjust as needed (max 100 per page)
    "page": 1
}

# Step 2: Initialize an empty list to store user data
users_data = []

# Step 3: Paginate through results until no more pages
while True:
    response = requests.get(url, headers=headers, params=params)
    data = response.json()

    # Error handling if the request fails
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {data}")
        break

    # Step 4: Process each user in the current page
    for user in data.get("items", []):
        # Get detailed user info (to access fields like 'name', 'company')
        user_response = requests.get(user['url'], headers=headers)
        user_info = user_response.json()

        # Step 5: Append necessary data to users_data list
        users_data.append({
            "login": user_info.get("login", ""),
            "name": user_info.get("name", ""),
            "company": user_info.get("company", "").strip("@ ").upper() if user_info.get("company") else "",
            "location": user_info.get("location", ""),
            "email": user_info.get("email", ""),
            "hireable": user_info.get("hireable", ""),
            "bio": user_info.get("bio", ""),
            "public_repos": user_info.get("public_repos", 0),
            "followers": user_info.get("followers", 0),
            "following": user_info.get("following", 0),
            "created_at": user_info.get("created_at", "")
        })

    # Pagination: check if there's another page
    if 'next' in response.links:
        params['page'] += 1
    else:
        break

# Step 6: Save data to users.csv using pandas
df = pd.DataFrame(users_data)
df.to_csv("users.csv", index=False)
print("Data saved to users.csv")

Data saved to users.csv


In [6]:
# Load the users data
users_df = pd.read_csv("users.csv")

# Initialize an empty list to store repository data
repositories_data = []

# Loop through each user to fetch their repositories
for index, row in users_df.iterrows():
    user_login = row['login']

    # Fetch repositories for each user
    repo_url = f"https://api.github.com/users/{user_login}/repos"
    params = {
        "per_page": 100,  # Max is 100 per page
        "page": 1
    }

    while True:
        response = requests.get(repo_url, headers=headers, params=params)
        repos_data = response.json()

        if response.status_code != 200:
            print(f"Error fetching repos for {user_login}: {response.status_code}, {repos_data}")
            break

        for repo in repos_data:
            repositories_data.append({
                "login": user_login,  # User login from users.csv
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": repo.get("license", {}).get("name", "") if repo.get("license") else ""
            })

        # Check if there are more pages
        if 'next' in response.links:
            params['page'] += 1  # Increment page number
        else:
            break  # Exit loop if no more pages

# Convert the repository data to a DataFrame
repos_df = pd.DataFrame(repositories_data)

# Save the repositories data to repositories.csv
repos_df.to_csv("repositories.csv", index=False)
print("Data saved to repositories.csv")

Data saved to repositories.csv


In [10]:
# Load the data from CSV files
users_df = pd.read_csv("users.csv")
# Display the first few rows of each DataFrame to confirm they loaded correctly
print(users_df.head())

                  login                    name  \
0                  orta             Orta Therox   
1         jeromeetienne          Jerome Etienne   
2            jonataslaw            Jonny Borges   
3  steventroughtonsmith  Steven Troughton-Smith   
4                  axic        Alex Beregszaszi   

                                   company  \
0                                      NaN   
1                   MAKING WEBAR A REALITY   
2                                     IRIS   
3                    HIGH CAFFEINE CONTENT   
4  ETHEREUM @IPSILON @SPEARBIT @ETHEREUMJS   

                                       location                     email  \
0  Huddersfield / NYC / Dublin / Rio de Janeiro               git@orta.io   
1                               Dublin, Ireland  jerome.etienne@gmail.com   
2                               Dublin, Ireland                       NaN   
3                               Dublin, Ireland                       NaN   
4                              

In [11]:
repos_df = pd.read_csv("repositories.csv")
print(repos_df.head())

  login                     full_name            created_at  stargazers_count  \
0  orta       orta/11ty-twoslash-test  2020-08-23T10:48:14Z                 1   
1  orta                    orta/31465  2020-03-24T21:00:15Z                 2   
2  orta                    orta/31859  2020-07-14T19:02:11Z                 0   
3  orta                        orta/a  2018-01-30T15:41:33Z                 1   
4  orta  orta/ABetterPlaceForTweetbot  2021-12-13T05:37:20Z                 0   

   watchers_count    language  has_projects  has_wiki license_name  
0               1        HTML          True      True          NaN  
1               2  TypeScript          True      True          NaN  
2               0  TypeScript          True      True          NaN  
3               1  TypeScript          True      True  MIT License  
4               0         NaN          True      True  MIT License  


In [13]:
top_users = users_df.nlargest(5, 'followers')[['login', 'followers']]
top_users_list = top_users['login'].tolist()
print("Top 5 users by followers:", ','.join(top_users_list))

Top 5 users by followers: orta,jeromeetienne,jonataslaw,steventroughtonsmith,axic


In [16]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'], errors='coerce')
earliest_users = users_df.nsmallest(5, 'created_at')[['login', 'created_at']]
earliest_users_list = earliest_users['login'].tolist()
print("Earliest registered users:", ','.join(earliest_users_list))

Earliest registered users: paulca,adrian,GavinJoyce,amir,ciaranlee


In [15]:
popular_licenses = repos_df['license_name'].value_counts().head(3)
popular_licenses_list = popular_licenses.index.tolist()
print("Most popular licenses:", ','.join(popular_licenses_list))

Most popular licenses: MIT License,Apache License 2.0,Other


In [17]:
majority_company = users_df['company'].mode()[0]  # Mode will give the most common value
print("Company with the majority of developers:", majority_company)

Company with the majority of developers: AWS


In [18]:
popular_language = repos_df['language'].mode()[0]  # Mode will give the most common language
print("Most popular programming language:", popular_language)

Most popular programming language: JavaScript


In [24]:
recent_users = users_df[users_df['created_at'] > '2020-01-01']
second_popular_language = repos_df[repos_df['login'].isin(recent_users['login'])]['language'].value_counts().nlargest(2).index[-1]
print("Second most popular language after 2020:", second_popular_language)

Second most popular language after 2020: JavaScript


In [25]:
average_stars = repos_df.groupby('language')['stargazers_count'].mean().nlargest(1)
highest_average_stars_language = average_stars.index[0]
print("Language with highest average stars:", highest_average_stars_language)

Language with highest average stars: MDX


In [26]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leaders = users_df.nlargest(5, 'leader_strength')[['login', 'leader_strength']]
top_leaders_list = top_leaders['login'].tolist()
print("Top 5 users by leader strength:", ','.join(top_leaders_list))

Top 5 users by leader strength: flaviohenriquealmeida,zalando,AnikSarker,wix,CardinalHealth


In [27]:
correlation = users_df['followers'].corr(users_df['public_repos'])
print("Correlation between followers and public repositories:", round(correlation, 3))

Correlation between followers and public repositories: 0.555


In [28]:
import statsmodels.api as sm
X = users_df['public_repos']
y = users_df['followers']
X = sm.add_constant(X)  # Add constant for intercept
model = sm.OLS(y, X).fit()
regression_slope = model.params[1]  # Get slope for public_repos
print("Regression slope of followers on repos:", round(regression_slope, 3))

Regression slope of followers on repos: 2.825


  regression_slope = model.params[1]  # Get slope for public_repos


In [29]:
correlation_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Correlation between projects enabled and wiki:", round(correlation_projects_wiki, 3))

Correlation between projects enabled and wiki: 0.315


In [38]:
# Replace NaN values in 'hireable' with False
users_df['hireable'] = users_df['hireable'].fillna(False)
# Check the counts of hireable and non-hireable users
hireable_count = users_df['hireable'].value_counts().get(True, 0)
non_hireable_count = users_df['hireable'].value_counts().get(False, 0)
print("Counts of hireable users:", hireable_count)
print("Counts of non-hireable users:", non_hireable_count)
# Drop rows where 'following' is NaN
users_df_cleaned = users_df.dropna(subset=['following'])
# Calculate the average following for hireable and non-hireable users
hireable_avg_following = users_df_cleaned[users_df_cleaned['hireable'] == True]['following'].mean()
non_hireable_avg_following = users_df_cleaned[users_df_cleaned['hireable'] == False]['following'].mean()
# Print average following values
print(f"Hireable average following: {hireable_avg_following}, Non-hireable average following: {non_hireable_avg_following}")
# Calculate the difference
if non_hireable_count > 0:  # Check if there are non-hireable users
    average_following_difference = round(hireable_avg_following - non_hireable_avg_following, 3)
    print("Average following difference for hireable users:", average_following_difference)
else:
    print("No non-hireable users to compare with.")

Counts of hireable users: 182
Counts of non-hireable users: 295
Hireable average following: 112.81318681318682, Non-hireable average following: 65.47796610169492
Average following difference for hireable users: 47.335


  users_df['hireable'] = users_df['hireable'].fillna(False)


In [35]:
users_df['bio_length'] = users_df['bio'].str.split().str.len().fillna(0)
cleaned_df = users_df.dropna(subset=['followers', 'bio_length'])
regression_model_bio = sm.OLS(cleaned_df['followers'], sm.add_constant(cleaned_df['bio_length'])).fit()
bio_correlation_slope = round(regression_model_bio.params[1], 3)
print("Regression slope of followers on bio word count:", bio_correlation_slope)

Regression slope of followers on bio word count: 6.903


  bio_correlation_slope = round(regression_model_bio.params[1], 3)


In [32]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])  # Convert to datetime
repos_df['weekday'] = repos_df['created_at'].dt.day_name()  # Get day name
weekend_users = repos_df[repos_df['weekday'].isin(['Saturday', 'Sunday'])].groupby('login').size().nlargest(5)
top_weekend_users_list = weekend_users.index.tolist()
print("Top 5 users who created most repos on weekends:", ','.join(top_weekend_users_list))

Top 5 users who created most repos on weekends: orta,joshuacassidy,No9,wafuwafu13,lmammino


In [40]:
# Fill NaN values in the 'hireable' column with False
users_df['hireable'] = users_df['hireable'].fillna(False)
# Calculate the proportion of hireable and non-hireable users who share their email
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notnull().mean()
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notnull().mean()
# Print the proportions
print(f"Proportion of hireable users with email: {hireable_with_email}, "
      f"Proportion of non-hireable users with email: {non_hireable_with_email}")
# Calculate the difference
email_difference = round(hireable_with_email - non_hireable_with_email, 3)
print("Email sharing difference for hireable users:", email_difference)

Proportion of hireable users with email: 0.5604395604395604, Proportion of non-hireable users with email: 0.45084745762711864
Email sharing difference for hireable users: 0.11


In [34]:
users_df['surname'] = users_df['name'].str.split().str[-1]  # Get last word as surname
most_common_surname = users_df['surname'].mode()
print("Most common surname(s):", ','.join(most_common_surname))

Most common surname(s): Chen,Kenny,O'Sullivan,Quinn
