In [None]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict

class GitHubScraper:
    def __init__(self, token: str):
        """
        Initialize the GitHub scraper with your API token.

        Args:
            token (str): GitHub Personal Access Token
        """
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _make_request(self, url: str, params: dict = None) -> Dict:
        """
        Make a request to the GitHub API with rate limit handling.
        """
        while True:
            response = requests.get(url, headers=self.headers, params=params)

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                self.logger.warning(f"Rate limit hit. Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)
            else:
                self.logger.error(f"Error {response.status_code}: {response.text}")
                response.raise_for_status()

    def clean_company_name(self, company: str) -> str:
        """
        Clean up company names according to specifications.
        """
        if not company:
            return ""

        # Strip whitespace and @ symbol
        cleaned = company.strip().lstrip('@')

        # Convert to uppercase
        return cleaned.upper()

    def search_users(self, location: str, min_followers: int) -> List[Dict]:
        """
        Search for GitHub users in a specific location with minimum followers.
        """
        users = []
        page = 1

        while True:
            self.logger.info(f"Fetching users page {page}")

            query = f"location:{location} followers:>={min_followers}"
            params = {
                'q': query,
                'per_page': 100,
                'page': page
            }

            url = f"{self.base_url}/search/users"
            response = self._make_request(url, params)

            if not response['items']:
                break

            for user in response['items']:
                user_data = self._make_request(user['url'])

                # Extract only the required fields with exact matching names
                cleaned_data = {
                    'login': user_data['login'],
                    'name': user_data['name'] if user_data['name'] else "",
                    'company': self.clean_company_name(user_data.get('company')),
                    'location': user_data['location'] if user_data['location'] else "",
                    'email': user_data['email'] if user_data['email'] else "",
                    'hireable': str(user_data['hireable']).lower() if user_data['hireable'] is not None else "false",
                    'bio': user_data['bio'] if user_data['bio'] else "",
                    'public_repos': user_data['public_repos'],
                    'followers': user_data['followers'],
                    'following': user_data['following'],
                    'created_at': user_data['created_at']
                }

                users.append(cleaned_data)

            page += 1

        return users

    def get_user_repositories(self, username: str, max_repos: int = 500) -> List[Dict]:
        """
        Get repositories for a specific user.
        """
        repos = []
        page = 1

        while len(repos) < max_repos:
            self.logger.info(f"Fetching repositories for {username}, page {page}")

            params = {
                'sort': 'pushed',
                'direction': 'desc',
                'per_page': 100,
                'page': page
            }

            url = f"{self.base_url}/users/{username}/repos"
            response = self._make_request(url, params)

            if not response:
                break

            for repo in response:
                # Extract only the required fields with exact matching names
                repo_data = {
                    'login': username,  # Adding owner's login as required
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'] if repo['language'] else "",
                    'has_projects': repo['has_projects'],
                    'has_wiki': repo['has_wiki'],
                    'license_name': repo['license']['key'] if repo.get('license') else ""
                }

                repos.append(repo_data)

            if len(response) < 100:
                break

            page += 1

        return repos[:max_repos]

def main():
    # Get GitHub token
    token = input("Enter your GitHub token: ").strip()
    if not token:
        print("Token is required. Exiting...")
        return

    # Initialize scraper
    scraper = GitHubScraper(token)

    # Search for users in London with >500 followers
    users = scraper.search_users(location='London', min_followers=500)

    # Save users to CSV
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Get repositories for each user
    all_repos = []
    for user in users:
        repos = scraper.get_user_repositories(user['login'])
        all_repos.extend(repos)

    # Save repositories to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)

    print(f"Scraped {len(users)} users and {len(all_repos)} repositories")

    # Create README.md
    with open('README.md', 'w') as f:
        f.write(f"""# GitHub Users in London

This repository contains data about GitHub users in London with over 500 followers and their repositories.

## Files

1. `users.csv`: Contains information about {len(users)} GitHub users in London with over 500 followers
2. `repositories.csv`: Contains information about {len(all_repos)} public repositories from these users
3. `gitscrap.py`: Python script used to collect this data

## Data Collection

- Data collected using GitHub API
- Date of collection: {time.strftime('%Y-%m-%d')}
- Only included users with 500+ followers
- Up to 500 most recently pushed repositories per user
""")

if __name__ == "__main__":
    main()

Enter your GitHub token: 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
Scraped 326 users and 39374 repositories


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].astype(int)  # Convert True/False to 1/0
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)          # Convert True/False to 1/0

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  
Correlation between projects enabled and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill blank (NaN) values with 0 in 'has_projects' and 'has_wiki'
repos_df['has_projects'] = repos_df['has_projects'].fillna(0)
repos_df['has_wiki'] = repos_df['has_wiki'].fillna(0)

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].astype(int)  # Convert True/False to 1/0
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)          # Convert True/False to 1/0

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)



  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  
Correlation between projects enabled and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill NaN values in the entire DataFrame with 0
repos_df.fillna(0, inplace=True)

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)

# Create a Pearson correlation table for the entire DataFrame
correlation_matrix = repos_df.corr(method='pearson')

# Display the correlation matrix
print("Pearson Correlation Matrix:\n", correlation_matrix)

# Extract the correlation between 'has_projects' and 'has_wiki'
correlation_projects_wiki = correlation_matrix.loc['has_projects', 'has_wiki']

# Round the correlation to 3 decimal places
correlation_projects_wiki_rounded = round(correlation_projects_wiki, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_projects_wiki_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  


ValueError: could not convert string to float: 'tj'

In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill NaN values in the entire DataFrame with 0
repos_df.fillna(0, inplace=True)

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})

# Ensure all columns are numeric for correlation calculation
# Select only numeric columns
numeric_df = repos_df.select_dtypes(include=['int', 'float'])

# Create a Pearson correlation table for the numeric DataFrame
correlation_matrix = numeric_df.corr(method='pearson')

# Display the correlation matrix
print("Pearson Correlation Matrix:\n", correlation_matrix)

# Extract the correlation between 'has_projects' and 'has_wiki'
correlation_projects_wiki = correlation_matrix.loc['has_projects', 'has_wiki']

# Round the correlation to 3 decimal places
correlation_projects_wiki_rounded = round(correlation_projects_wiki, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_projects_wiki_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  
Pearson Correlation Matrix:
                   stargazers_count  watchers_count  has_projects  has_wiki
stargazers_count   

  repos_df['has_projects'] = repos_df['has_projects'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})
  repos_df['has_wiki'] = repos_df['has_wiki'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill NaN values in 'has_projects' and 'has_wiki' columns with 0
repos_df['has_projects'].fillna(0, inplace=True)
repos_df['has_wiki'].fillna(0, inplace=True)

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_projects'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_wiki'].fillna(0, inplace=True)
  repos_df['has_projects'] = repos_df['has_projects'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})


Correlation between projects enabled and wiki enabled: 0.446


  repos_df['has_wiki'] = repos_df['has_wiki'].replace({True: 1, False: 0, 'True': 1, 'False': 0, '': 0})


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill NaN values in 'has_projects' and 'has_wiki' columns with 0
repos_df['has_projects'] = repos_df['has_projects'].fillna(0)
repos_df['has_wiki'] = repos_df['has_wiki'].fillna(0)

# Convert 'has_projects' and 'has_wiki' to integers (1 for True, 0 for False)
repos_df['has_projects'] = repos_df['has_projects'].replace(
    {True: 1, False: 0, 'True': 1, 'False': 0, '': 0}).astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].replace(
    {True: 1, False: 0, 'True': 1, 'False': 0, '': 0}).astype(int)

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  
Correlation between projects enabled and wiki enabled: 0.446


  repos_df['has_projects'] = repos_df['has_projects'].replace(
  repos_df['has_wiki'] = repos_df['has_wiki'].replace(


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Check the first few rows to ensure it's loaded correctly
print(repos_df.head())

# Fill blank values with 0 for has_projects and has_wiki
repos_df['has_projects'].fillna(0, inplace=True)
repos_df['has_wiki'].fillna(0, inplace=True)

# Convert 'has_projects' and 'has_wiki' to integers (1 for true, 0 for false)
repos_df['has_projects'] = repos_df['has_projects'].replace({true: 1, false: 0, 'true': 1, 'false': 0, '': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({true: 1, false: 0, 'true': 1, 'false': 0, '': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)


  login                 full_name            created_at  stargazers_count  \
0    tj  tj/node-cookie-signature  2012-10-15T15:54:33Z               185   
1    tj           tj/commander.js  2011-08-14T21:33:58Z             26742   
2    tj                      tj/n  2011-01-05T14:53:19Z             18863   
3    tj             tj/git-extras  2010-08-04T16:32:07Z             17327   
4    tj           tj/node-migrate  2011-04-24T21:00:22Z              1545   

   watchers_count    language  has_projects  has_wiki license_name  
0             185  JavaScript          True      True          mit  
1           26742  JavaScript          True      True          mit  
2           18863       Shell          True      True          mit  
3           17327       Shell          True      True          mit  
4            1545  JavaScript          True      True          mit  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_projects'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_wiki'].fillna(0, inplace=True)


NameError: name 'true' is not defined

In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Adjust the path if necessary

# Fill NaN values with empty strings for 'has_projects' and 'has_wiki'
repos_df['has_projects'].fillna('', inplace=True)
repos_df['has_wiki'].fillna('', inplace=True)

# Replace boolean values with 'true', 'false', and empty strings
repos_df['has_projects'] = repos_df['has_projects'].replace({True: 'true', False: 'false', 'True': 'true', 'False': 'false', '': 'false'})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({True: 'true', False: 'false', 'True': 'true', 'False': 'false', '': 'false'})

# Convert to integer (1 for true, 0 for false)
repos_df['has_projects'] = repos_df['has_projects'].map({'true': 1, 'false': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': 1, 'false': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Round the correlation to 3 decimal places
correlation_rounded = round(correlation, 3)

print("Correlation between projects enabled and wiki enabled:", correlation_rounded)


Correlation between projects enabled and wiki enabled: 0.446


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_projects'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['has_wiki'].fillna('', inplace=True)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the users.csv file (make sure the path is correct)
users_df = pd.read_csv('users.csv')  # Update with your actual path

# Filter out users without bios
filtered_df = users_df[users_df['bio'].notna()]

# Calculate the length of each bio in terms of word count
filtered_df['bio_word_count'] = filtered_df['bio'].str.split().str.len()

# Prepare the data for regression (only include relevant columns)
X = filtered_df['bio_word_count']
y = filtered_df['followers']

# Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope (coefficient) for bio word count
slope = model.params['bio_word_count']

# Round the slope to 3 decimal places
slope_rounded = round(slope, 3)

print("Regression slope of followers on bio word count:", slope_rounded)


Regression slope of followers on bio word count: 1.995


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['bio_word_count'] = filtered_df['bio'].str.split().str.len()


In [None]:
import pandas as pd

# Load the repositories.csv file
repos_df = pd.read_csv('repositories.csv')  # Update the path as necessary

# Convert the created_at column to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend entries (Saturday: 5, Sunday: 6)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos_df = repos_df[(repos_df['day_of_week'] == 5) | (repos_df['day_of_week'] == 6)]

# Group by login and count the number of repositories created
user_repo_counts = weekend_repos_df.groupby('login').size().reset_index(name='repo_count')

# Sort by repo_count in descending order and get the top 5
top_users = user_repo_counts.sort_values(by='repo_count', ascending=False).head(5)

# Get the list of top 5 users' logins in order, comma-separated
top_users_logins = ', '.join(top_users['login'])

print("Top 5 users who created the most repositories on weekends (UTC):", top_users_logins)



Top 5 users who created the most repositories on weekends (UTC): praveenscience, mattdesl, passy, CodeMaster7000, UKVeteran


In [None]:
import pandas as pd

# Load the users.csv file (update the path if needed)
users_df = pd.read_csv('users.csv')

# Filter for hireable users and calculate fraction with email
hireable_users = users_df[users_df['hireable'] == True]
fraction_hireable_with_email = hireable_users['email'].notna().mean()

# Filter for non-hireable users and calculate fraction with email
non_hireable_users = users_df[users_df['hireable'] == False]
fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

# Calculate the difference, rounded to 3 decimal places
difference = round(fraction_hireable_with_email - fraction_non_hireable_with_email, 3)

print("Difference in email sharing between hireable and non-hireable users:", difference)


Difference in email sharing between hireable and non-hireable users: 0.061


In [None]:
import pandas as pd

# Load the users.csv file (adjust the path if needed)
users_df = pd.read_csv('users.csv')

# Filter out rows with missing names
users_with_names = users_df[users_df['name'].notna()]

# Extract surnames (last word in the name column after trimming)
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count, sort them alphabetically
most_common_surnames = surname_counts[surname_counts == max_count].index.sort_values()

# Join them into a comma-separated string
most_common_surnames_list = ', '.join(most_common_surnames)

print("Most common surname(s):", most_common_surnames_list)


Most common surname(s): Appleton, Brewery, Fuller, Greenfeld, Jackson, Li, Williams


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]


In [None]:
import pandas as pd

# Load the users.csv file (adjust the path if needed)
users_df = pd.read_csv('users.csv')

# Filter out rows with missing names
users_with_names = users_df[users_df['name'].notna()]

# Extract surnames (last word in the name column after trimming)
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count, sort them alphabetically
most_common_surnames = surname_counts[surname_counts == max_count].index.sort_values()

# Join them into a comma-separated string
most_common_surnames_list = ', '.join(most_common_surnames)

print("Most common surname(s):", most_common_surnames_list)


Most common surname(s): Appleton, Brewery, Fuller, Greenfeld, Jackson, Li, Williams


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]


In [None]:
import pandas as pd

# Load the users.csv file (adjust the path if needed)
users_df = pd.read_csv('users.csv')

# Calculate the average following count for hireable users
hireable_avg_following = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average following count for non-hireable users
non_hireable_avg_following = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference, rounded to 3 decimal places
difference = round(hireable_avg_following - non_hireable_avg_following, 3)

print("Difference in average following between hireable and non-hireable users:", difference)


Difference in average following between hireable and non-hireable users: 1082.745


In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Calculate the correlation between has_projects and has_wiki, rounded to 3 decimal places
correlation = round(repos_df['has_wiki'].astype(int).corr(repos_df['has_projects'].astype(int)), 3)

print("Correlation between projects and wiki enabled:", correlation)



Correlation between projects and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Clean the has_projects and has_wiki columns by removing spaces and converting to booleans
repos_df['has_projects'] = repos_df['has_projects'].str.strip().map(lambda x: x == "True")
repos_df['has_wiki'] = repos_df['has_wiki'].str.strip().map(lambda x: x == "True")

# Calculate the correlation between has_projects and has_wiki, rounded to 3 decimal places
correlation = round(repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int)), 3)

print("Correlation between projects and wiki enabled:", correlation)


AttributeError: Can only use .str accessor with string values!

In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Ensure the columns are strings, then strip spaces and convert to booleans
repos_df['has_projects'] = repos_df['has_projects'].astype(str).str.strip().map(lambda x: x == "True")
repos_df['has_wiki'] = repos_df['has_wiki'].astype(str).str.strip().map(lambda x: x == "True")

# Calculate the correlation between has_projects and has_wiki, rounded to 3 decimal places
correlation = round(repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int)), 3)

print("Correlation between projects and wiki enabled:", correlation)


Correlation between projects and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Define a function to clean and convert the values to booleans, treating blanks as False
def clean_boolean(value):
    # Check for NaN or empty strings and return False for them
    if pd.isna(value) or str(value).strip() == "":
        return False
    # Otherwise, return True if the cleaned value is exactly "True"
    return str(value).strip() == "True"

# Apply the cleaning function to has_projects and has_wiki columns
repos_df['has_projects'] = repos_df['has_projects'].apply(clean_boolean)
repos_df['has_wiki'] = repos_df['has_wiki'].apply(clean_boolean)

# Calculate the correlation between has_projects and has_wiki, rounded to 3 decimal places
correlation = round(repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int)), 3)

print("Correlation between projects and wiki enabled:", correlation)


Correlation between projects and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Define a function to clean and convert the values to booleans, treating blanks as False
def clean_boolean(value):
    # Treat blanks or NaNs as False
    return str(value).strip() == "True" if pd.notna(value) and str(value).strip() != "" else False

# Apply the cleaning function to has_projects and has_wiki columns
repos_df['has_projects'] = repos_df['has_projects'].apply(clean_boolean)
repos_df['has_wiki'] = repos_df['has_wiki'].apply(clean_boolean)

# Calculate the correlation between has_projects and has_wiki, rounded to 3 decimal places
correlation = round(repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int)), 3)

print("Correlation between projects and wiki enabled:", correlation)


Correlation between projects and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the repositories.csv file (adjust the path if necessary)
repos_df = pd.read_csv('repositories.csv')

# Define a function to clean and convert the values to booleans, treating blanks as False
def clean_boolean(value):
    # Treat blanks or NaNs as False
    return str(value).strip() == "True" if pd.notna(value) and str(value).strip() != "" else False

# Apply the cleaning function to has_projects and has_wiki columns
repos_df['has_projects'] = repos_df['has_projects'].apply(clean_boolean)
repos_df['has_wiki'] = repos_df['has_wiki'].apply(clean_boolean)

# Check if there's enough variability in data before calculating correlation
if repos_df['has_projects'].nunique() > 1 and repos_df['has_wiki'].nunique() > 1:
    # Calculate the correlation if there's variability
    correlation = round(repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int)), 3)
else:
    # Set correlation to None or a specific message if no variability
    correlation = "Not enough variability in data to calculate correlation"

print("Correlation between projects and wiki enabled:", correlation)



Correlation between projects and wiki enabled: 0.446


In [None]:
import pandas as pd

# Load the data
url = 'https://raw.githubusercontent.com/azh-py/london-github-users/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Clean the has_projects and has_wiki columns
repos_df['has_projects'] = repos_df['has_projects'].str.strip().map(lambda x: x == "True")  # Convert to boolean
repos_df['has_wiki'] = repos_df['has_wiki'].str.strip().map(lambda x: x == "True")  # Convert to boolean

# Treat NaN as False
repos_df['has_projects'] = repos_df['has_projects'].fillna(False)
repos_df['has_wiki'] = repos_df['has_wiki'].fillna(False)

# Calculate correlation
correlation = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print(f'Correlation between has_projects and has_wiki: {correlation:.3f}')


AttributeError: Can only use .str accessor with string values!

In [None]:
import pandas as pd

# Assuming your DataFrame is already loaded in a variable named repos_df
# If not, load it from the file like this:
# repos_df = pd.read_csv('path_to_your/repositories.csv')

# Fill blank (NaN) values in a specific column with 0
# Replace 'column_name' with the actual name of the column you want to fill
repos_df['column_name'] = repos_df['column_name'].fillna(0)

# Example: Fill NaN in the 'stargazers_count' column with 0
repos_df['stargazers_count'] = repos_df['stargazers_count'].fillna(0)

# Display the updated DataFrame
repos_df.head()  # Show the first few rows to verify changes


KeyError: 'column_name'

In [None]:
import pandas as pd

# Load the users.csv file from the GitHub repository
url = 'https://raw.githubusercontent.com/azh-py/london-github-users/main/users.csv'
users_df = pd.read_csv(url)

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Get the top 5 users based on leader_strength
top_leaders = users_df.nlargest(5, 'leader_strength')['login']

# Convert the top leaders to a comma-separated string
top_leaders_list = ', '.join(top_leaders)
top_leaders_list


'kunal-kushwaha, angelabauer, Elfocrash, LaravelDaily, cloudflare'

In [None]:
import pandas as pd

# Load the users.csv file from the GitHub repository
url = 'https://raw.githubusercontent.com/azh-py/london-github-users/main/users.csv'
users_df = pd.read_csv(url)

# Let's assume the users_df contains columns for 'projects_enabled' and 'wiki_enabled'
# Since we don't have this data directly, we'll create a mock DataFrame with the required structure.

# Sample Data (replace this with actual data if available)
data = {
    'repo_name': ['repo1', 'repo2', 'repo3', 'repo4', 'repo5', 'repo6'],
    'projects_enabled': [1, 1, 0, 1, 0, 0],  # 1 for enabled, 0 for disabled
    'wiki_enabled': [1, 0, 0, 1, 1, 0]  # 1 for enabled, 0 for disabled
}

# Create a DataFrame
repos_df = pd.DataFrame(data)

# Calculate the correlation between projects_enabled and wiki_enabled
correlation = repos_df['projects_enabled'].corr(repos_df['wiki_enabled'])

# Round to 3 decimal places
correlation_rounded = round(correlation, 3)
correlation_rounded



0.333

In [None]:
import requests
import pandas as pd
import time

# GitHub API settings
GITHUB_TOKEN = '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'  # Replace with your actual token
headers = {'Authorization': f'token {GITHUB_TOKEN}'}
base_url = "https://api.github.com"

# Function to clean the company names
def clean_company_name(company):
    if company:
        company = company.strip()  # Trim whitespace
        if company.startswith('@'):
            company = company[1:]  # Strip leading @ symbol
        return company.upper()  # Convert to uppercase
    return company

# Function to fetch users in London with over 500 followers
def get_users(city="London", min_followers=500, max_results=100):
    users_data = []
    page = 1
    while len(users_data) < max_results:
        users_url = f"{base_url}/search/users?q=location:{city}+followers:>{min_followers}&page={page}&per_page=30"
        response = requests.get(users_url, headers=headers)
        response.raise_for_status()
        users = response.json().get('items', [])

        if not users:  # Stop if there are no more users
            break

        for user in users:
            user_detail_url = user['url']
            user_detail_response = requests.get(user_detail_url, headers=headers)
            user_detail_response.raise_for_status()
            user_detail = user_detail_response.json()

            users_data.append({
                "login": user['login'],
                "name": user_detail.get('name', ''),
                "company": clean_company_name(user_detail.get('company')),
                "location": user_detail.get('location', ''),
                "email": user_detail.get('email', ''),
                "hireable": user_detail.get('hireable', ''),
                "bio": user_detail.get('bio', ''),
                "public_repos": user_detail.get('public_repos', 0),
                "followers": user_detail.get('followers', 0),
                "following": user_detail.get('following', 0),
                "created_at": user_detail.get('created_at', ''),
            })

        page += 1
        time.sleep(1)  # Sleep to respect API rate limits

    return users_data

# Function to fetch repositories for a given user
def get_repositories(user_login, max_results=500):
    repos_data = []
    page = 1

    while len(repos_data) < max_results:
        repos_url = f"{base_url}/users/{user_login}/repos?sort=pushed&direction=desc&page={page}&per_page=100"
        response = requests.get(repos_url, headers=headers)
        response.raise_for_status()
        repos = response.json()

        if not repos:  # Stop if there are no more repositories
            break

        for repo in repos:
            repos_data.append({
                "login": user_login,
                "full_name": repo['full_name'],
                "created_at": repo['created_at'],
                "stargazers_count": repo['stargazers_count'],
                "watchers_count": repo['watchers_count'],
                "language": repo['language'],
                "has_projects": repo['has_projects'],
                "has_wiki": repo['has_wiki'],
                "license_name": repo['license']['name'] if repo['license'] else None,
            })

        page += 1
        time.sleep(1)  # Sleep to respect API rate limits

    return repos_data

# Main execution function
def main():
    users = get_users()
    print(f"Found {len(users)} users with over 500 followers in London.")

    # Save user details to a CSV file
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Fetch repositories for each user and save to repositories.csv
    all_repos_data = []
    for user in users:
        print(f"Fetching repositories for {user['login']}...")
        repos = get_repositories(user['login'])
        all_repos_data.extend(repos)

    repos_df = pd.DataFrame(all_repos_data)
    repos_df.to_csv('repositories.csv', index=False)


def print_repositories():
    repos_df = pd.read_csv('repositories.csv')
    print(repos_df)

# Call the function to print repositories after fetching
if __name__ == "__main__":
    main()
    print_repositories()


Found 120 users with over 500 followers in London.
Fetching repositories for tj...
Fetching repositories for kunal-kushwaha...
Fetching repositories for angelabauer...
Fetching repositories for jlord...
Fetching repositories for Elfocrash...
Fetching repositories for alyssaxuu...
Fetching repositories for eddiejaoude...
Fetching repositories for daneden...
Fetching repositories for LaravelDaily...
Fetching repositories for jskeet...
Fetching repositories for mattdesl...
Fetching repositories for cloudflare...
Fetching repositories for nicklockwood...
Fetching repositories for jgthms...
Fetching repositories for bizz84...
Fetching repositories for samuelcolvin...
Fetching repositories for Lissy93...
Fetching repositories for jgilfelt...
Fetching repositories for sonnysangha...
Fetching repositories for nickbutcher...
Fetching repositories for florina-muntenescu...
Fetching repositories for pydanny...
Fetching repositories for SaraVieira...
Fetching repositories for dsyer...
Fetching rep