In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import requests
from sklearn.model_selection import cross_val_score

# Import the best performing sklearn linear models
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


In [5]:
GITHUB_TOPREPO_URL = lambda page: f"https://api.github.com/search/repositories?q=stars:%3E50&sort=stars&page={page}&per_page=100"
GITHUB_RATELIMIT_URL = 'https://api.github.com/rate_limit'

def github_request(url):
    response = requests.get(
        url=url,
        headers={
            f"Authorization": "Token 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"
        }
    )
    data = response.json()
    return data


In [17]:
# Fetch the data

def parse_repo(repo):
    return {
        "stars": repo['stargazers_count'],
        "forks": repo['forks_count'],
        "open_issues": repo['open_issues_count'],
        "topics_count": len(repo['topics']),
        "topics": repo['topics'],
        "disk_usage": repo["size"],
        "created_at": repo['created_at'],
        "owner_url": repo['owner']['url'],
        "followers": None,
        "following": None,
        "public_repos": None
    }

all_repos = []
# 100 Repos per page
for i in range(1, 11):
    # Fetch the data
    repos = github_request(GITHUB_TOPREPO_URL(i))['items']
    parsed_repos = [parse_repo(repo) for repo in repos]
    all_repos.extend(parsed_repos)

for repo in all_repos:
    data = github_request(repo['owner_url'])
    repo['followers'] = data['followers']
    repo['following'] = data['following']
    repo['public_repos'] = data['public_repos']
    
topics = [repo["topics"] for repo in all_repos]

mlb = MultiLabelBinarizer()
topics_transformed = mlb.fit_transform(topics)
topics_df = pd.DataFrame(topics_transformed, columns=mlb.classes_)

df = pd.DataFrame(all_repos).drop(columns=["topics", "owner_url"])
df = pd.concat([df, topics_df], axis=1)
date = pd.to_datetime(df['created_at'])
df['created_at'] = date.dt.year
df.to_csv('github_repos.csv', index=False)

In [19]:
y = df['stars']
X = df.drop(columns=['stars'])

s = StandardScaler()
X = s.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = [
    XGBRegressor(),
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]

for model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    model.fit(X_train, y_train)

    print(f"Model: {model.__class__.__name__}")
    print(f"CV score: {cv_scores.mean()}")
    print(f"Test score: {model.score(X_test, y_test)}")
    print("\n")

Model: XGBRegressor
CV score: 0.3680079839878432
Test score: 0.5458737405583682


Model: LinearRegression
CV score: -5.807076606688153e+27
Test score: -1.4671262243978212e+28


Model: RandomForestRegressor
CV score: 0.44650500795226067
Test score: 0.5467417462996804


Model: GradientBoostingRegressor
CV score: 0.3929044969469536
Test score: 0.5024659047150856


