# 1. Install required libraries

In [1]:
pip install python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [31]:
import requests
import json
import time
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
import os

# 2. Initial config

In [32]:
BASE_URL = "https://api.github.com"

# For this step is recommended to have a Github Personal access token (PAT)
# Go to GitHub Settings > Developer settings > Personal access tokens
TOKEN = os.getenv("GITHUB_TOKEN")  # Replace with your (PAT)

HEADERS = {
    "Authorization": f"token {TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

response = requests.get("https://api.github.com/user", headers=HEADERS)
print(response.status_code, response.text)

401 {"message":"Bad credentials","documentation_url":"https://docs.github.com/rest","status":"401"}


#3.1. Rate limit

In [4]:
def handle_rate_limit(response):
    if response.status_code in (403, 429):
        reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
        current_time = int(time.time())
        wait_time = reset_time - current_time + 1
        if wait_time > 0:
            print(f"Rate limit exceeded. Waiting {wait_time} seconds...")
            time.sleep(wait_time)
    return response


#3.2. API Request

In [5]:
def make_api_request(url, params=None):
    try:
        response = requests.get(url, headers=HEADERS, params=params)
        response = handle_rate_limit(response)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error {response.status_code}: {response.text}")
            return None
    except Exception as e:
        print(f"Request failed: {str(e)}")
        return None

#4.1. Search repositories

In [6]:
def search_repositories(query, sort="stars", order="desc", per_page=30):
    url = f"{BASE_URL}/search/repositories"
    params = {
        "q": query,
        "sort": sort,
        "order": order,
        "per_page": per_page
    }
    data = make_api_request(url, params)
    if data:
        return data.get("items", [])
    return []

# 4.2. Get repo function

In [7]:
def get_repository_commits(owner, repo, per_page=30):
    url = f"{BASE_URL}/repos/{owner}/{repo}/commits"
    params = {"per_page": per_page}

    return make_api_request(url, params)



#4.3. Get contents function

In [8]:
def get_repository_contents(owner, repo, path=""):
    url = f"{BASE_URL}/repos/{owner}/{repo}/contents/{path}"

    return make_api_request(url)

#4.1. Paginate repositories

In [9]:
def paginate_repositories(query, max_pages=3, per_page=30):
    all_repos = []
    for page in range(1, max_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        url = f"{BASE_URL}/search/repositories"
        response = make_api_request(url, params)
        if response and "items" in response:
            all_repos.extend(response["items"])
        else:
            break  # No more pages
    return all_repos

#4.2 Data wrangling functions


In [10]:
def clean_repos_data(repos_list):

    df = pd.DataFrame(repos_list)

    df['created_at'] = pd.to_datetime(df['created_at'])
    df['description'] = df['description'].fillna('No description')
    df['owner_name'] = df['owner'].apply(lambda x: x['login'])

    clean_df = df[['name', 'full_name', 'description', 'language',
                   'stargazers_count', 'created_at', 'owner_name']]

    print(f"Cleaned {len(clean_df)} repositories")
    return clean_df

In [11]:
def clean_commits_data(commits_list):

    df = pd.DataFrame(commits_list)

    df['author_name'] = df['commit'].apply(lambda x: x.get('author', {}).get('name', 'Unknown'))
    df['author_email'] = df['commit'].apply(lambda x: x.get('author', {}).get('email', 'Unknown'))
    df['commit_date'] = df['commit'].apply(lambda x: x.get('author', {}).get('date', ''))
    df['message'] = df['commit'].apply(lambda x: x.get('message', ''))

    df['commit_date'] = pd.to_datetime(df['commit_date'])

    df['message_clean'] = df['message'].str.strip().str.replace('\n', ' ')

    clean_df = df[['sha', 'author_name', 'author_email', 'commit_date',
                   'message', 'message_clean']]

    print(f"Cleaned {len(clean_df)} commits")
    return clean_df

#5.1 Examples basic functions

In [12]:
repo_1 = search_repositories("tetris language:assembly", per_page=5, sort = 'name', order="asc")


In [13]:
repo_1

[{'id': 791379254,
  'node_id': 'R_kgDOLyt9Ng',
  'name': '-Nand2Tetris',
  'full_name': 'pliniohavila/-Nand2Tetris',
  'private': False,
  'owner': {'login': 'pliniohavila',
   'id': 76018725,
   'node_id': 'MDQ6VXNlcjc2MDE4NzI1',
   'avatar_url': 'https://avatars.githubusercontent.com/u/76018725?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/pliniohavila',
   'html_url': 'https://github.com/pliniohavila',
   'followers_url': 'https://api.github.com/users/pliniohavila/followers',
   'following_url': 'https://api.github.com/users/pliniohavila/following{/other_user}',
   'gists_url': 'https://api.github.com/users/pliniohavila/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/pliniohavila/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/pliniohavila/subscriptions',
   'organizations_url': 'https://api.github.com/users/pliniohavila/orgs',
   'repos_url': 'https://api.github.com/users/pliniohavila/repos',
   'events_url': '

In [21]:
comm_1 = get_repository_commits("kirjavascript", 'TetrisGYM', per_page=15)


In [22]:
comm_1

[{'sha': 'aad48e307c78d75fc6b4d05888c1529da4deeb9f',
  'node_id': 'C_kwDOEXsaitoAKGFhZDQ4ZTMwN2M3OGQ3NWZjNmI0ZDA1ODg4YzE1MjlkYTRkZWViOWY',
  'commit': {'author': {'name': 'zohassadar',
    'email': 'zohassadar@gmail.com',
    'date': '2025-06-20T21:06:07Z'},
   'committer': {'name': 'GitHub',
    'email': 'noreply@github.com',
    'date': '2025-06-20T21:06:07Z'},
   'message': 'Add marathon modes 3 & 4 (#140)',
   'tree': {'sha': 'fe2d9594b2c18e95273f84ea99fa5a89927f9d29',
    'url': 'https://api.github.com/repos/kirjavascript/TetrisGYM/git/trees/fe2d9594b2c18e95273f84ea99fa5a89927f9d29'},
   'url': 'https://api.github.com/repos/kirjavascript/TetrisGYM/git/commits/aad48e307c78d75fc6b4d05888c1529da4deeb9f',
   'comment_count': 0,
   'verification': {'verified': True,
    'reason': 'valid',
    'signature': '-----BEGIN PGP SIGNATURE-----\n\nwsFcBAABCAAQBQJoVc0/CRC1aQ7uu5UhlAAAlDUQAD0tzkeClTL0+eVd4ggnCJaN\nwyHohqgQ92TRqvABrP7BxkXFT+w2G28MK8M1Tqsw/vvvAhMM/gbQtQa9WR9QeTG8\nnCAXqt6a5PeyqMPWg

In [16]:
repo_content1 = get_repository_contents("kirjavascript", 'TetrisGYM', path="README.md")

In [17]:
repo_content1

{'name': 'README.md',
 'path': 'README.md',
 'sha': '6e189300b931ee7569ec71d64634fe153f1047d5',
 'size': 13358,
 'url': 'https://api.github.com/repos/kirjavascript/TetrisGYM/contents/README.md?ref=master',
 'html_url': 'https://github.com/kirjavascript/TetrisGYM/blob/master/README.md',
 'git_url': 'https://api.github.com/repos/kirjavascript/TetrisGYM/git/blobs/6e189300b931ee7569ec71d64634fe153f1047d5',
 'download_url': 'https://raw.githubusercontent.com/kirjavascript/TetrisGYM/master/README.md',
 'type': 'file',
 'content': 'IyBUZXRyaXNHWU0KCjxkaXYgYWxpZ249ImNlbnRlciI+CiAgICA8aW1nIHNy\nYz0iLi9hc3NldHMvc2NyZWVucy9tZW51Ni5wbmciIGFsdD0iTWVudXNjcmVl\nbiI+CiAgICA8YnI+CjwvZGl2Pgo8YnI+CgoqIFtHZXR0aW5nIFN0YXJ0ZWRd\nKCNnZXR0aW5nLXN0YXJ0ZWQpCiogW1RyYWluZXJzXSgjdHJhaW5lcnMpCiAg\nICAqIFtUZXRyaXNdKCN0ZXRyaXMpCiAgICAqIFtULVNwaW5zXSgjdC1zcGlu\ncykKICAgICogW1NlZWRdKCNzZWVkKQogICAgKiBbU3RhY2tpbmddKCNzdGFj\na2luZykKICAgICogW1BhY2VdKCNwYWNlKQogICAgKiBbU2V0dXBzXSgjc2V0\ndXBzKQogICAgKiBbQi1UeXBlXSgjYi10eXB

#5.1 Examples data wrangling functions

In [18]:
clean_repos_data(repo_1)

Cleaned 5 repositories


Unnamed: 0,name,full_name,description,language,stargazers_count,created_at,owner_name
0,-Nand2Tetris,pliniohavila/-Nand2Tetris,No description,Assembly,0,2024-04-24 16:01:19+00:00,pliniohavila
1,-nand2tetris-mynand-,hyfzero/-nand2tetris-mynand-,整理了我的作业（并制作了pdf版本）和nand2课程所需的所有资料,Assembly,2,2021-01-21 04:33:53+00:00,hyfzero
2,-nand2tetris-Part1,VEERESH069/-nand2tetris-Part1,No description,Assembly,0,2024-08-21 09:07:13+00:00,VEERESH069
3,-nand2tetris-Part1,Saksham091/-nand2tetris-Part1,No description,Assembly,0,2022-11-24 05:42:16+00:00,Saksham091
4,-nand2tetris-Part1-,pawin46/-nand2tetris-Part1-,No description,Assembly,0,2024-08-27 09:16:51+00:00,pawin46


In [23]:
clean_commits_data(comm_1)

Cleaned 15 commits


Unnamed: 0,sha,author_name,author_email,commit_date,message,message_clean
0,aad48e307c78d75fc6b4d05888c1529da4deeb9f,zohassadar,zohassadar@gmail.com,2025-06-20 21:06:07+00:00,Add marathon modes 3 & 4 (#140),Add marathon modes 3 & 4 (#140)
1,d4ffde98d5629ef4790ffa47699f7cb439631e8d,zohassadar,zohassadar@gmail.com,2025-03-26 22:22:44+00:00,Clean unused init bytes (#136)\n\nThese bytes ...,Clean unused init bytes (#136) These bytes ar...
2,df937b12806b35332880d37057188d052d859260,zohassadar,zohassadar@gmail.com,2025-02-01 19:49:33+00:00,Add option to send additional build args to ca...,Add option to send additional build args to ca...
3,d28bd744b5f784a3bacfcb0acf7ac5f4fcc85de2,zohassadar,zohassadar@gmail.com,2025-01-30 22:47:24+00:00,remove unneeded legacy tables (#132)\n\norient...,remove unneeded legacy tables (#132) orientat...
4,961aa7aed01dc5a2d73f33d07b7ba488fa4202a5,zohassadar,zohassadar@gmail.com,2025-01-30 20:05:48+00:00,add harddrop.rs (#131)\n\nPut together to meas...,add harddrop.rs (#131) Put together to measur...
5,a222aa463090986c3ff7f82b9213bec791055d34,Kirjava,kirjavascript@users.noreply.github.com,2025-01-05 19:39:31+00:00,Merge pull request #129 from zohassadar/clear_...,Merge pull request #129 from zohassadar/clear_...
6,d216a5e218f023b9f03d4affb4170f4ce0efefcb,zohassadar,zohassadar@gmail.com,2025-01-05 19:18:26+00:00,Clear crashState at game init,Clear crashState at game init
7,9ad2faacc6cfd919782330659468028aea04533b,Kirjava,kirjavascript@users.noreply.github.com,2025-01-05 02:25:56+00:00,Merge pull request #127 from zohassadar/floor-...,Merge pull request #127 from zohassadar/floor-...
8,0983e24379a6195421f71da7f6a09adb9f325316,zohassadar,zohassadar@gmail.com,2025-01-05 02:12:46+00:00,sec in floor and adjust tspin constant,sec in floor and adjust tspin constant
9,810a07f5cabc7cbbc646d24e104ab762ef7aa133,kirjavascript,snkenjoi@gmail.com,2025-01-05 01:15:54+00:00,revert sec in floor code,revert sec in floor code


#6.1. Test 1: Bad authentication

In [28]:
BASE_URL = "https://api.github.com"

TOKEN = "github_123456789"

HEADERS = {
    "Authorization": f"token {TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

response = requests.get("https://api.github.com/user", headers=HEADERS)
print(response.status_code, response.text)

401 {"message":"Bad credentials","documentation_url":"https://docs.github.com/rest","status":"401"}


#6.2. Test 2: Rate limit checking

In [30]:
print("Remaining:", response.headers.get("X-RateLimit-Remaining"))


Remaining: 4993


#6.3 Test 3: Pagination

In [None]:
repos = paginate_repositories("tetris language:assembly", max_pages=2)
print(f"Total repos fetched: {len(repos)}")
print("Example repo:", repos[0]["full_name"])

Total repos fetched: 60
Example repo: daniel-e/tetros


In [None]:
def paginate_repositories(query, max_pages=3, per_page=30):
    all_repos = []
    for page in range(1, max_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        url = f"{BASE_URL}/search/repositories"
        response = make_api_request(url, params)
        if response and "items" in response:
            all_repos.extend(response["items"])
        else:
            break
    return all_repos


In [None]:
repos = paginate_repositories("tetris language:assembly", max_pages=2)
print(f"Total repos fetched: {len(repos)}")
print("Example repo:", repos[0]["full_name"])


Total repos fetched: 60
Example repo: daniel-e/tetros
