In [1]:
# Print a fixed greeting.
greeting = "Hello World"
print(greeting)

Hello World


# **All the imports that are needed**

In [2]:
import os
from datetime import datetime

import pandas as pd
import requests

# **GitHub Authentication Setup**

In [None]:
# Read the token from the environment so no credential is ever saved in the
# notebook. Set it before starting Jupyter, e.g. `export GITHUB_TOKEN=<token>`.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()

# The Accept header is always sent; Authorization is added only when a token
# is available, so a missing token results in an unauthenticated request
# instead of a request carrying an invalid credential.
HEADERS = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"Bearer {GITHUB_TOKEN}"

# **GitHub API Call with Exception Handling**

In [4]:
# Search endpoint and query: Python repositories, most-starred first,
# 50 results in a single page.
url = "https://api.github.com/search/repositories"
params = {
    "q": "language:python",
    "sort": "stars",
    "order": "desc",
    "per_page": 50,
}

# Any requests failure stops the notebook with a short, readable message.
# Timeout and HTTPError are listed before RequestException because both are
# subclasses of it. `from exc` keeps the original requests exception attached
# as the explicit cause, so the full detail stays in the traceback.
try:
    response = requests.get(url, headers=HEADERS, params=params, timeout=10)
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    data = response.json()
except requests.exceptions.Timeout as exc:
    raise RuntimeError("Request timed out") from exc
except requests.exceptions.HTTPError as exc:
    raise RuntimeError(f"HTTP error occurred: {exc}") from exc
except requests.exceptions.RequestException as exc:
    raise RuntimeError(f"Request failed: {exc}") from exc

# Confirm success only when the decoded payload is non-empty.
if data:
    print("Data fetched successfully")

Data fetched successfully


# **Extracting Repository Items**

In [5]:
# Pull the list of repository records out of the search response,
# falling back to an empty list when the "items" key is absent.
items = data["items"] if "items" in data else []
len(items)

50

# **Normalizing JSON into the DataFrame**

In [6]:
# Flatten the nested repository records into a table (nested keys become
# dotted column names such as "license.name"), then preview the first rows.
df = pd.json_normalize(data=items)
df.head(n=5)

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,license.name,license.spdx_id,license.url,license.node_id,permissions.admin,permissions.maintain,permissions.push,permissions.triage,permissions.pull,license
0,54346799,MDEwOlJlcG9zaXRvcnk1NDM0Njc5OQ==,public-apis,public-apis/public-apis,False,https://github.com/public-apis/public-apis,A collective list of free APIs,False,https://api.github.com/repos/public-apis/publi...,https://api.github.com/repos/public-apis/publi...,...,MIT License,MIT,https://api.github.com/licenses/mit,MDc6TGljZW5zZTEz,False,False,False,False,True,
1,13491895,MDEwOlJlcG9zaXRvcnkxMzQ5MTg5NQ==,free-programming-books,EbookFoundation/free-programming-books,False,https://github.com/EbookFoundation/free-progra...,:books: Freely available programming books,False,https://api.github.com/repos/EbookFoundation/f...,https://api.github.com/repos/EbookFoundation/f...,...,Creative Commons Attribution 4.0 International,CC-BY-4.0,https://api.github.com/licenses/cc-by-4.0,MDc6TGljZW5zZTI1,False,False,False,False,True,
2,83222441,MDEwOlJlcG9zaXRvcnk4MzIyMjQ0MQ==,system-design-primer,donnemartin/system-design-primer,False,https://github.com/donnemartin/system-design-p...,Learn how to design large-scale systems. Prep ...,False,https://api.github.com/repos/donnemartin/syste...,https://api.github.com/repos/donnemartin/syste...,...,Other,NOASSERTION,,MDc6TGljZW5zZTA=,False,False,False,False,True,
3,21289110,MDEwOlJlcG9zaXRvcnkyMTI4OTExMA==,awesome-python,vinta/awesome-python,False,https://github.com/vinta/awesome-python,An opinionated list of awesome Python framewor...,False,https://api.github.com/repos/vinta/awesome-python,https://api.github.com/repos/vinta/awesome-pyt...,...,Other,NOASSERTION,,MDc6TGljZW5zZTA=,False,False,False,False,True,
4,63476337,MDEwOlJlcG9zaXRvcnk2MzQ3NjMzNw==,Python,TheAlgorithms/Python,False,https://github.com/TheAlgorithms/Python,All Algorithms implemented in Python,False,https://api.github.com/repos/TheAlgorithms/Python,https://api.github.com/repos/TheAlgorithms/Pyt...,...,MIT License,MIT,https://api.github.com/licenses/mit,MDc6TGljZW5zZTEz,False,False,False,False,True,


# **Checking for NaN entries**

In [7]:
# Count the missing values in each column.
df.isnull().sum()

id                       0
node_id                  0
name                     0
full_name                0
private                  0
                        ..
permissions.maintain     0
permissions.push         0
permissions.triage       0
permissions.pull         0
license                 50
Length: 108, dtype: int64

# **Checking for Data information**

In [8]:
# Summarise the frame: index range, number of columns, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Columns: 108 entries, id to license
dtypes: bool(19), float64(2), int64(10), object(77)
memory usage: 35.8+ KB


# **Selecting the Relevant Columns only**

In [9]:
# Fields kept for the analysis; every other column of the normalized
# response is dropped.
columns = [
    # identity
    "name", "full_name", "owner.login",
    # activity counters
    "stargazers_count", "forks_count", "open_issues_count",
    # metadata
    "language", "created_at", "updated_at", "license.name", "html_url",
]

# Select all rows and only the listed columns, in the listed order.
df = df.loc[:, columns]
df.head()


Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20T23:49:42Z,2025-12-15T15:36:50Z,MIT License,https://github.com/public-apis/public-apis
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11T06:50:37Z,2025-12-15T15:33:57Z,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26T16:15:28Z,2025-12-15T14:55:13Z,Other,https://github.com/donnemartin/system-design-p...
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27T21:00:06Z,2025-12-15T15:36:58Z,Other,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16T09:44:01Z,2025-12-15T15:28:16Z,MIT License,https://github.com/TheAlgorithms/Python


# Data Cleaning

# **Removing repositories with missing language**

In [10]:
# Drop rows whose "language" is missing.
# The explicit .copy() makes df_clean an independent frame: later cells assign
# new columns on it, and on pandas versions without copy-on-write a filtered
# result can otherwise raise SettingWithCopyWarning on those assignments.
df_clean = df.dropna(subset=["language"]).copy()
df_clean.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20T23:49:42Z,2025-12-15T15:36:50Z,MIT License,https://github.com/public-apis/public-apis
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11T06:50:37Z,2025-12-15T15:33:57Z,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26T16:15:28Z,2025-12-15T14:55:13Z,Other,https://github.com/donnemartin/system-design-p...
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27T21:00:06Z,2025-12-15T15:36:58Z,Other,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16T09:44:01Z,2025-12-15T15:28:16Z,MIT License,https://github.com/TheAlgorithms/Python


# **Filling missing license with "No License"**

In [11]:
# Keep existing license names and put the placeholder "No License"
# wherever the name is missing.
license_names = df_clean["license.name"]
df_clean["license.name"] = license_names.where(license_names.notna(), "No License")
df_clean.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20T23:49:42Z,2025-12-15T15:36:50Z,MIT License,https://github.com/public-apis/public-apis
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11T06:50:37Z,2025-12-15T15:33:57Z,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26T16:15:28Z,2025-12-15T14:55:13Z,Other,https://github.com/donnemartin/system-design-p...
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27T21:00:06Z,2025-12-15T15:36:58Z,Other,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16T09:44:01Z,2025-12-15T15:28:16Z,MIT License,https://github.com/TheAlgorithms/Python


# **Data type conversion (datetime columns only)**

In [12]:
# Parse the timestamp strings of both date columns into pandas datetimes.
for ts_col in ("created_at", "updated_at"):
    df_clean[ts_col] = pd.to_datetime(df_clean[ts_col])
df_clean.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11 06:50:37+00:00,2025-12-15 15:33:57+00:00,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26 16:15:28+00:00,2025-12-15 14:55:13+00:00,Other,https://github.com/donnemartin/system-design-p...
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python


# Feature Engineering

# **Adding Repository age (days) & Stars-to-forks ratio**

In [13]:
# Current time as a timezone-aware UTC timestamp. created_at is tz-aware
# (UTC), so the reference time must be tz-aware too for the subtraction below.
# Timestamp.now(tz="UTC") is used instead of Timestamp.utcnow(), which is
# deprecated in recent pandas releases.
today = pd.Timestamp.now(tz="UTC")

# Whole days elapsed since the repository was created.
df_clean["repo_age_days"] = (today - df_clean["created_at"]).dt.days

# Stars per fork. A fork count of 0 is replaced by 1 so the division never
# hits zero; such a repository gets a ratio equal to its star count.
df_clean["stars_forks_ratio"] = (
    df_clean["stargazers_count"] /
    df_clean["forks_count"].replace(0, 1)
)

df_clean.head()


Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11 06:50:37+00:00,2025-12-15 15:33:57+00:00,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...,4448,5.772467
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26 16:15:28+00:00,2025-12-15 14:55:13+00:00,Other,https://github.com/donnemartin/system-design-p...,3213,6.142026
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python,4188,10.180365
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python,3439,4.330034


# **Adding filters to get good data (stars, repo age, owners)**

In [14]:
# One boolean mask per criterion; a row is kept only when all three hold.
has_enough_stars = df_clean["stargazers_count"].gt(10_000)    # more than 10,000 stars
is_older_than_a_year = df_clean["repo_age_days"].gt(365)      # created over 365 days ago
has_long_owner_name = df_clean["owner.login"].str.len().gt(2) # owner login longer than 2 chars

df_filtered = df_clean[has_enough_stars & is_older_than_a_year & has_long_owner_name]

df_filtered.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11 06:50:37+00:00,2025-12-15 15:33:57+00:00,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...,4448,5.772467
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26 16:15:28+00:00,2025-12-15 14:55:13+00:00,Other,https://github.com/donnemartin/system-design-p...,3213,6.142026
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python,4188,10.180365
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python,3439,4.330034


# **Rechecking for null entries after all processing**

In [15]:
# Print the frame's dimensions, then the per-column count of missing values.
print(df_filtered.shape, df_filtered.isnull().sum(), sep="\n")

(49, 13)
name                 0
full_name            0
owner.login          0
stargazers_count     0
forks_count          0
open_issues_count    0
language             0
created_at           0
updated_at           0
license.name         0
html_url             0
repo_age_days        0
stars_forks_ratio    0
dtype: int64


# **Saving the most updated & cleaned data**

In [16]:
# Write the filtered frame to CSV without the row index, then echo the file name.
output_file = "cleaned_popular_python_repos.csv"
df_filtered.to_csv(path_or_buf=output_file, index=False)
output_file


'cleaned_popular_python_repos.csv'

# Dividing the data into different CSVs

# **Applying filters to Search Repositories by Owner Name**

In [17]:
# Keep repositories whose owner login contains a hyphen (plain substring
# match; a missing login counts as "no match").
# NOTE(review): the result is named org_repos, but a hyphen in the login is
# only a naming heuristic — confirm whether a real organization check is intended.
has_hyphen = df_clean["owner.login"].str.contains("-", na=False, regex=False)
org_repos = df_clean[has_hyphen]

org_repos.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873
5,AutoGPT,Significant-Gravitas/AutoGPT,Significant-Gravitas,180304,46182,317,Python,2023-03-16 09:21:07+00:00,2025-12-15 15:20:27+00:00,Other,https://github.com/Significant-Gravitas/AutoGPT,1005,3.904205
8,langflow,langflow-ai/langflow,langflow-ai,141197,8168,935,Python,2023-02-08 22:28:03+00:00,2025-12-15 15:32:31+00:00,MIT License,https://github.com/langflow-ai/langflow,1040,17.286606
9,youtube-dl,ytdl-org/youtube-dl,ytdl-org,139168,10575,4119,Python,2010-10-31 14:35:07+00:00,2025-12-15 14:18:01+00:00,The Unlicense,https://github.com/ytdl-org/youtube-dl,5524,13.160095
10,yt-dlp,yt-dlp/yt-dlp,yt-dlp,138333,11172,2224,Python,2020-10-26 04:22:55+00:00,2025-12-15 14:55:11+00:00,The Unlicense,https://github.com/yt-dlp/yt-dlp,1876,12.382116


# **Applying filters to Search by License Type**

In [18]:
# Keep only repositories whose license name is one of the selected licenses.
selected_licenses = ["MIT License", "Apache License 2.0"]
has_selected_license = df_clean["license.name"].isin(selected_licenses)
licensed_repos = df_clean[has_selected_license]

licensed_repos.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python,3439,4.330034
7,transformers,huggingface/transformers,huggingface,153892,31428,2160,Python,2018-10-29 13:56:00+00:00,2025-12-15 15:21:49+00:00,Apache License 2.0,https://github.com/huggingface/transformers,2604,4.896653
8,langflow,langflow-ai/langflow,langflow-ai,141197,8168,935,Python,2023-02-08 22:28:03+00:00,2025-12-15 15:32:31+00:00,MIT License,https://github.com/langflow-ai/langflow,1040,17.286606
12,langchain,langchain-ai/langchain,langchain-ai,121963,20123,278,Python,2022-10-17 02:58:36+00:00,2025-12-15 15:34:14+00:00,MIT License,https://github.com/langchain-ai/langchain,1155,6.060876


# **Applying Filter by Popularity Buckets**

In [19]:
def popularity_bucket(stars):
    """Map a star count to a popularity label.

    Thresholds are inclusive lower bounds, checked from highest to lowest:
    50000 -> "Very High", 20000 -> "High", 10000 -> "Medium".
    Anything that meets none of them is "Low".
    """
    tiers = (
        (50000, "Very High"),
        (20000, "High"),
        (10000, "Medium"),
    )
    for floor, label in tiers:
        if stars >= floor:
            return label
    return "Low"

# Label every repository with its popularity tier, then count repositories per tier.
star_counts = df_clean["stargazers_count"]
df_clean["popularity_level"] = star_counts.map(popularity_bucket)

df_clean["popularity_level"].value_counts()


popularity_level
Very High    50
Name: count, dtype: int64

# **Applying Filters by Popularity Level**

In [20]:
# Keep repositories labelled with one of the two highest popularity tiers.
top_tiers = ["High", "Very High"]
is_top_tier = df_clean["popularity_level"].isin(top_tiers)
high_popularity = df_clean[is_top_tier]

high_popularity.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio,popularity_level
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873,Very High
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11 06:50:37+00:00,2025-12-15 15:33:57+00:00,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...,4448,5.772467,Very High
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26 16:15:28+00:00,2025-12-15 14:55:13+00:00,Other,https://github.com/donnemartin/system-design-p...,3213,6.142026,Very High
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python,4188,10.180365,Very High
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python,3439,4.330034,Very High


# **Applying filters to get Recently Updated Repositories**

In [21]:
# Whole days since each repository was last updated, measured against a
# timezone-aware UTC "now" (updated_at is tz-aware). Timestamp.now(tz="UTC")
# is used instead of Timestamp.utcnow(), which is deprecated in recent pandas.
days_since_update = (pd.Timestamp.now(tz="UTC") - df_clean["updated_at"]).dt.days

# Keep repositories updated within the last 90 days.
recent_repos = df_clean[days_since_update <= 90]

recent_repos.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio,popularity_level
0,public-apis,public-apis/public-apis,public-apis,384474,41033,752,Python,2016-03-20 23:49:42+00:00,2025-12-15 15:36:50+00:00,MIT License,https://github.com/public-apis/public-apis,3556,9.369873,Very High
1,free-programming-books,EbookFoundation/free-programming-books,EbookFoundation,378847,65630,204,Python,2013-10-11 06:50:37+00:00,2025-12-15 15:33:57+00:00,Creative Commons Attribution 4.0 International,https://github.com/EbookFoundation/free-progra...,4448,5.772467,Very High
2,system-design-primer,donnemartin/system-design-primer,donnemartin,329403,53631,516,Python,2017-02-26 16:15:28+00:00,2025-12-15 14:55:13+00:00,Other,https://github.com/donnemartin/system-design-p...,3213,6.142026,Very High
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python,4188,10.180365,Very High
4,Python,TheAlgorithms/Python,TheAlgorithms,214826,49613,797,Python,2016-07-16 09:44:01+00:00,2025-12-15 15:28:16+00:00,MIT License,https://github.com/TheAlgorithms/Python,3439,4.330034,Very High


# **Applying Filters by Stars-to-Forks Efficiency**

In [22]:
# Keep repositories with at least 10 stars per fork.
is_efficient = df_clean["stars_forks_ratio"].ge(10)
efficient_repos = df_clean[is_efficient]

efficient_repos.head()

Unnamed: 0,name,full_name,owner.login,stargazers_count,forks_count,open_issues_count,language,created_at,updated_at,license.name,html_url,repo_age_days,stars_forks_ratio,popularity_level
3,awesome-python,vinta/awesome-python,vinta,273862,26901,526,Python,2014-06-27 21:00:06+00:00,2025-12-15 15:36:58+00:00,Other,https://github.com/vinta/awesome-python,4188,10.180365,Very High
8,langflow,langflow-ai/langflow,langflow-ai,141197,8168,935,Python,2023-02-08 22:28:03+00:00,2025-12-15 15:32:31+00:00,MIT License,https://github.com/langflow-ai/langflow,1040,17.286606,Very High
9,youtube-dl,ytdl-org/youtube-dl,ytdl-org,139168,10575,4119,Python,2010-10-31 14:35:07+00:00,2025-12-15 14:18:01+00:00,The Unlicense,https://github.com/ytdl-org/youtube-dl,5524,13.160095,Very High
10,yt-dlp,yt-dlp/yt-dlp,yt-dlp,138333,11172,2224,Python,2020-10-26 04:22:55+00:00,2025-12-15 14:55:11+00:00,The Unlicense,https://github.com/yt-dlp/yt-dlp,1876,12.382116,Very High
11,HelloGitHub,521xueweihan/HelloGitHub,521xueweihan,137308,10992,231,Python,2016-05-04 06:24:11+00:00,2025-12-15 15:36:33+00:00,No License,https://github.com/521xueweihan/HelloGitHub,3512,12.49163,Very High


# **Saving Organization Repositories only**

In [23]:
# Write the hyphenated-owner subset to CSV without the row index.
org_repos.to_csv(path_or_buf="github_org_repos.csv", index=False)

# **Save Licensed Repositories only**

In [24]:
# Write the license-filtered subset to CSV without the row index.
licensed_repos.to_csv(path_or_buf="github_licensed_repos.csv", index=False)

# **Saving High Popularity Repositories only**

In [25]:
# Write the high-popularity subset to CSV without the row index.
high_popularity.to_csv(path_or_buf="github_high_popularity_repos.csv", index=False)

# **Saving Recently Updated Repositories only**

In [26]:
# Write the recently-updated subset to CSV without the row index.
recent_repos.to_csv(path_or_buf="github_recent_repos.csv", index=False)

# **Saving High Engagement Repositories only**

In [27]:
# Write the high stars-to-forks subset to CSV without the row index.
efficient_repos.to_csv(path_or_buf="github_high_engagement_repos.csv", index=False)