In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import json
import requests
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt

# Import the best performing sklearn linear models
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


In [2]:
GITHUB_TOPREPO_URL = lambda page: f"https://api.github.com/search/repositories?q=stars:%3E50&sort=stars&page={page}&per_page=100"
GITHUB_RATELIMIT_URL = 'https://api.github.com/rate_limit'

def github_request(url):
    response = requests.get(
        url=url,
        headers={
            f"Authorization": "Token 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"
        }
    )
    data = response.json()
    return data


In [19]:
# Fetch the data
size = 100
stars = 100
watchers = 100
forks = 100
open_issues = 100
number_of_topics=100
created_at="2019-01-01"

def parse_repo(repo):
    return {
        "stars": repo['stargazers_count'],
        #"watchers": repo['watchers_count'],
        "forks": repo['forks_count'],
        "open_issues": repo['open_issues_count'],
        "topics_count": len(repo['topics']),
        "topics": repo['topics'],
        "created_at": repo['created_at'],
    }

all_repos = []
# 100 Repos per page
for i in range(1, 11):
    # Fetch the data
    repos = github_request(GITHUB_TOPREPO_URL(i))['items']
    parsed_repos = [parse_repo(repo) for repo in repos]
    all_repos.extend(parsed_repos)
    
topics = [repo["topics"] for repo in all_repos]

mlb = MultiLabelBinarizer()
topics_transformed = mlb.fit_transform(topics)
topics_df = pd.DataFrame(topics_transformed, columns=mlb.classes_)

df = pd.DataFrame(all_repos).drop(columns=["topics"])
df = pd.concat([df, topics_df], axis=1)
date = pd.to_datetime(df['created_at'])
df['created_at'] = date.dt.year

In [21]:
y = df['stars']
X = df.drop(columns=['stars'])
s = StandardScaler()
X = s.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = [
    XGBRegressor(),
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]

for model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    model.fit(X_train, y_train)

    print(f"Model: {model.__class__.__name__}")
    print(f"CV score: {cv_scores.mean()}")
    print(f"Test score: {model.score(X_test, y_test)}")
    print("\n")

Model: XGBRegressor
CV score: 0.2895791655562299
Test score: 0.5572827575000474


Model: LinearRegression
CV score: -7.81192440012511e+27
Test score: -1.9377513471898503e+28


Model: RandomForestRegressor
CV score: 0.39732769444641197
Test score: 0.4953215677515145


Model: GradientBoostingRegressor
CV score: 0.3484516411821338
Test score: 0.45055758181876593


