# Scrape Using GitHub API

In [None]:
import pandas as pd
import numpy as np

import requests

def search_users_in_bangalore():
    """Search GitHub for users located in Bangalore with more than 100 followers.

    Sends a single request to the user-search endpoint asking for up to 100
    results. Only that one page is fetched; further pages are not followed.

    Returns:
        list: the ``items`` array of the search response, one dict per user.

    Raises:
        requests.HTTPError: if GitHub answers with an error status (for
            example when the rate limit is exhausted), instead of failing
            later with an unhelpful ``KeyError`` on ``'items'``.
    """
    url = "https://api.github.com/search/users"
    # The city was previously misspelled ("Banglore"), so the search matched
    # the wrong location string. The web-UI-only parameters (ref, s, type)
    # are dropped: the REST search endpoint documents q/sort/order/per_page/page.
    params = {
        "q": "followers:>100 location:Bangalore",
        "per_page": 100,
    }

    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.json()['items']

# Run the search once; `users` is read as a module-level global by the
# fetch_users_from_github() and fetch_repos_from_github() functions below.
users = search_users_in_bangalore()
print(users)

def fetch_users_from_github():
    """Download the full profile of every user in the global ``users`` list.

    For each search hit, the profile URL stored under its ``'url'`` key is
    requested and a fixed set of profile fields is copied into one row.

    Returns:
        pandas.DataFrame: one row per user with the columns listed in
        ``fields`` below (also present when ``users`` is empty).

    Raises:
        requests.HTTPError: if any profile request returns an error status
            (for example a rate-limit response), rather than a ``KeyError``
            on the first missing field.
    """
    fields = (
        'login', 'id', 'name', 'company', 'blog', 'location', 'email',
        'hireable', 'bio', 'public_repos', 'followers', 'following',
        'created_at',
    )

    users_data = []
    for user_ind in users:
        user_response = requests.get(user_ind['url'], timeout=30)
        # Fail loudly on HTTP errors; an error payload has none of the fields.
        user_response.raise_for_status()
        user = user_response.json()
        users_data.append({field: user[field] for field in fields})

    # Passing the columns keeps the frame's shape stable for an empty result.
    return pd.DataFrame(users_data, columns=list(fields))

# Fetch the detailed profile of every user found by the search and show it.
df = fetch_users_from_github()
print(df)

# Persist the profiles. index=False is not passed, so the DataFrame index is
# written as the first CSV column.
df.to_csv('users.csv')

def fetch_repos_from_github():
    """Download repository metadata for every user in the global ``users`` list.

    For each search hit, the URL stored under its ``'repos_url'`` key is
    requested once and every repository in that response becomes one row.
    Only that single response is read; pagination is not followed.

    Returns:
        pandas.DataFrame: one row per repository with the owner's login and
        the columns listed in ``columns`` below (also present when no
        repositories were found).

    Raises:
        requests.HTTPError: if any repository request returns an error
            status (for example a rate-limit response).
    """
    columns = [
        'login', 'id', 'full_name', 'stargazers_count', 'watchers_count',
        'language', 'has_projects', 'has_wiki', 'license_name', 'created_at',
    ]

    repo_data = []
    for user_ind in users:
        repo_response = requests.get(user_ind['repos_url'], timeout=30)
        # Fail loudly on HTTP errors; an error payload is a dict, not a list
        # of repositories, and would break the loop below in a confusing way.
        repo_response.raise_for_status()

        for repo in repo_response.json():
            # 'license' is either None or an object; store only its name so
            # the column matches its 'license_name' header.
            license_info = repo['license']
            repo_data.append({
                'login': user_ind['login'],
                'id': repo['id'],
                'full_name': repo['full_name'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info['name'] if license_info else None,
                'created_at': repo['created_at'],
            })

    return pd.DataFrame(repo_data, columns=columns)

# Fetch the repositories of every user found by the search and show them.
df_repo = fetch_repos_from_github()
print(df_repo)

# Persist the repositories. index=False is not passed, so the DataFrame index
# is written as the first CSV column.
df_repo.to_csv('repositories.csv')

In [None]:
# Token sent in the Authorization header by check_rate_limit().
# The value here is a placeholder: replace it before running, and do not
# commit a real token.
GITHUB_TOKEN = '<<GITHUB TOKEN>>'

def check_rate_limit():
    """Print the remaining quota and reset value for the core and search limits.

    Calls the GitHub ``/rate_limit`` endpoint with GITHUB_TOKEN in the
    Authorization header. On a 200 response the ``remaining`` and ``reset``
    values of the ``core`` and ``search`` resources are printed; on any other
    status the status code and response text are printed instead.
    """
    reply = requests.get(
        'https://api.github.com/rate_limit',
        headers={'Authorization': f'token {GITHUB_TOKEN}'},
    )

    # Guard clause: report the failure and stop.
    if reply.status_code != 200:
        print(f"Failed to check rate limit: {reply.status_code} - {reply.text}")
        return

    resources = reply.json()['resources']
    # Look both buckets up before printing anything.
    core_bucket = resources['core']
    search_bucket = resources['search']

    sections = (
        ("Core Rate Limit:", core_bucket),
        ("Search Rate Limit:", search_bucket),
    )
    for heading, bucket in sections:
        print(heading)
        print(f"- Remaining: {bucket['remaining']}")
        print(f"- Reset: {bucket['reset']}")

# Query the rate limit only when this code is executed as the main module.
if __name__ == "__main__":
    check_rate_limit()

# Scrape Using PyGithub

In [None]:
!pip install PyGithub

In [None]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from github import Github as PyGithubGithub

# Token used both for the raw search requests and for the PyGithub client.
# The value here is a placeholder: replace it before running, and do not
# commit a real token.
GITHUB_TOKEN = '<<GITHUB TOKEN>>'
# Shared PyGithub client; fetch_users_from_github() uses it from worker threads.
github_instance = PyGithubGithub(GITHUB_TOKEN)

def fetch_users_with_criteria():
    """Collect all search hits for Bangalore users with more than 100 followers.

    Walks the GitHub user-search endpoint page by page (100 results per
    page), authenticating with GITHUB_TOKEN, and stops at the first response
    that has no ``items`` key or whose ``items`` list is empty.

    Returns:
        list: the concatenated ``items`` entries of every page read.
    """
    endpoint = 'https://api.github.com/search/users'
    collected = []
    page_number = 1

    while True:
        reply = requests.get(
            endpoint,
            headers={"Authorization": f"token {GITHUB_TOKEN}"},
            params={
                'q': 'location:Bangalore followers:>100',
                'per_page': 100,
                'page': page_number,
            },
        )
        payload = reply.json()

        # A missing or empty 'items' marks the end of the result set
        # (this also covers error payloads, which carry no 'items').
        has_results = 'items' in payload and payload['items']
        if not has_results:
            return collected

        collected += payload['items']
        page_number += 1

def strip_company_name(companyName):
    """Normalise a company name: trim, drop leading '@' characters, upper-case.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not companyName:
        return companyName

    trimmed = companyName.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

def fetch_users_from_github(user):
    """Fetch one user's profile row and up to 500 repository rows via PyGithub.

    Args:
        user: a search-result dict; only its ``'login'`` key is read.

    Returns:
        tuple: ``(user_row, repo_rows)``. ``user_row`` is a flat list of
        profile values (empty if the lookup failed before it was built) and
        ``repo_rows`` is a list of flat lists, one per repository.

    Any exception is caught and printed next to the login, and whatever was
    collected up to that point is still returned.
    """
    profile_row = []
    repo_rows = []

    try:
        profile = github_instance.get_user(user['login'])
        company = strip_company_name(profile.company)

        profile_row = [
            profile.login,
            profile.name,
            company,
            profile.location,
            profile.email,
            profile.hireable,
            profile.bio,
            profile.public_repos,
            profile.followers,
            profile.following,
            profile.created_at,
        ]

        # Cap at 500 repositories per user; more are not needed.
        for position, repository in enumerate(profile.get_repos(), start=1):
            if position > 500:
                break

            row = [
                profile.login,
                repository.full_name,
                repository.created_at,
                repository.stargazers_count,
                repository.watchers_count,
                repository.language,
                repository.has_projects,
                repository.has_wiki,
            ]
            # Repositories without a license get None in the last column.
            row.append(repository.license.name if repository.license else None)
            repo_rows.append(row)

    except Exception as exp:
        print(f"{user['login']}: {exp}")

    return profile_row, repo_rows

def start():
    """Run the whole scrape and write ``users.csv`` and ``repositories.csv``.

    Searches for the matching users, fetches each user's profile and
    repositories on a two-worker thread pool, and writes both tables as CSV
    files without the DataFrame index.
    """
    user_columns = [
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at']
    repo_columns = [
        'login', 'full_name', 'created_at', 'stargazers_count',
        'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name']

    # Step 1: users in Bangalore with more than 100 followers.
    matched_users = fetch_users_with_criteria()

    # Step 2: per-user profile and repositories. This is a lot of API calls,
    # so they are spread over a small thread pool.
    with ThreadPoolExecutor(max_workers=2) as pool:
        outcomes = list(pool.map(fetch_users_from_github, matched_users))

    # Users whose lookup failed come back as an empty row and are skipped.
    profile_rows = [profile for profile, _ in outcomes if profile]

    repo_rows = []
    for _, rows in outcomes:
        repo_rows += rows

    # Step 3: write both CSV files; the index is omitted because it is not
    # wanted as the first CSV column.
    pd.DataFrame(profile_rows, columns=user_columns).to_csv('users.csv', index=False)
    pd.DataFrame(repo_rows, columns=repo_columns).to_csv('repositories.csv', index=False)

# Kick off the full scrape; both CSV files are written by start().
start()
print("!! All Done !!")