In [None]:
# Train a simple classification model to predict whether a repo is scaffolding

## Logistic regression

In [1]:
# run imports
from sqlalchemy import text
import sqlalchemy
import psycopg2
import os
import pandas as pd
from sqlalchemy import create_engine, text
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

In [2]:
# --- Database Connection and Feature Retrieval ---
# Connection settings come from (lower-case) environment variables.
cloud_sql_user = os.getenv("cloud_sql_user")
cloud_sql_password = os.getenv("cloud_sql_password")
cloud_sql_postgres_host = os.getenv("cloud_sql_postgres_host")
cloud_sql_postgres_db = os.getenv("cloud_sql_postgres_db")

# os.getenv returns None for unset variables, which would otherwise be
# interpolated into the URL as the literal text "None". create_engine is lazy,
# so that mistake would only surface at the first query; fail fast instead.
missing_settings = [
    name
    for name, value in (
        ("cloud_sql_user", cloud_sql_user),
        ("cloud_sql_password", cloud_sql_password),
        ("cloud_sql_postgres_host", cloud_sql_postgres_host),
        ("cloud_sql_postgres_db", cloud_sql_postgres_db),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_settings)
    )

# Construct the connection string.
# NOTE(review): values are inserted verbatim, so credentials containing
# reserved URL characters (e.g. '@', '/', ':') must already be URL-encoded.
conn_str = (
    f"postgresql+psycopg2://"
    f"{cloud_sql_user}:{cloud_sql_password}@"
    f"{cloud_sql_postgres_host}/{cloud_sql_postgres_db}"
)

# Create the SQLAlchemy engine (no connection is opened until first use).
try:
    cloud_sql_engine = create_engine(conn_str)
except Exception as e:
    # Chain explicitly so the original traceback stays attached as the cause.
    raise Exception(f"Error creating database engine: {e}") from e

In [3]:
# Load the full clean.project_repos_features table into a DataFrame.
# Any failure while connecting or querying is re-raised with a descriptive prefix.
try:
    with cloud_sql_engine.connect() as connection:
        query = text(
            """
            SELECT *
            FROM clean.project_repos_features
            """
        )
        repo_features_df = pd.read_sql(sql=query, con=connection)
except Exception as exc:
    raise Exception(f"Error fetching data from the database: {exc}")

In [11]:
# --- Labeled Data Retrieval ---
# The labeled data is read from a Google Sheet published as CSV.
gsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vShUKZQS6QFJ1SM1efqpFv-tXxbX6LFcJsc_L2MG-NtcXC-e9dGKgkbTSW39Zm6gfLIsUzkiWXa-CVE/pub?gid=1690796422&single=true&output=csv'
try:
    scaffold_df = pd.read_csv(filepath_or_buffer=gsheet_url)
except Exception as exc:
    # Surface download/parse problems with a message naming the source.
    raise Exception(f"Error reading data from Google Sheets: {exc}")

In [12]:
# --- Data Preparation ---
# Join features to labels on the shared 'repo' key. This is an inner join
# (the pandas default), so repos present in only one frame are dropped.
merged_df = repo_features_df.merge(scaffold_df, how='inner', on='repo')

In [13]:
# --- Data Preparation ---
# Keep only the rows that actually carry an is_scaffold label.
has_label = merged_df['is_scaffold'].notna()
merged_df = merged_df[has_label]

In [14]:
# Summarise the merged frame: structure, a preview, and the label balance.

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; interpolating it into an f-string printed a stray "None".
print("Merged dataframe info:")
merged_df.info()

# print the first 5 rows of the merged dataframe
print(f"Merged dataframe first 5 rows: {merged_df.head()}")

# print the number of rows where is_scaffold is 1
print(f"Number of rows where is_scaffold is 1: {(merged_df['is_scaffold'] == 1).sum()}")

# print the number of rows where is_scaffold is 0
print(f"Number of rows where is_scaffold is 0: {(merged_df['is_scaffold'] == 0).sum()}")

# print the number of rows where is_scaffold is null
# (acts as a sanity check: unlabeled rows were dropped in the previous cell)
print(f"Number of rows where is_scaffold is null: {merged_df['is_scaffold'].isnull().sum()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 63 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   repo                              458 non-null    object        
 1   data_timestamp                    458 non-null    datetime64[ns]
 2   has_readme                        458 non-null    bool          
 3   name_is_example                   458 non-null    bool          
 4   name_is_hello_world               458 non-null    bool          
 5   name_is_whitepaper                458 non-null    bool          
 6   name_is_tutorial                  458 non-null    bool          
 7   name_is_boilerplate               458 non-null    bool          
 8   name_is_scaffold                  458 non-null    bool          
 9   name_is_template                  458 non-null    bool          
 10  name_is_kit                       458 non-null    

In [15]:
# Separate features (X) and the target variable (y)
# feature_columns lists the merged_df columns used as model inputs; the
# statements that follow select them and cast them to float.
# NOTE(review): the trailing comments describe each flag's expected relation
# to "education", while the target used below is 'is_scaffold' - presumably
# carried over from a related classifier; confirm they still apply here.
feature_columns = [
    'has_readme', # if false, then not education
    'is_collection_of_learnings', # if true, then is education
    'has_app_application', # if true, then not education
    'is_awesome_curated', # if true, then is education
    'has_benchmark', # if true, then not education
    'is_block_explorer', # if true, then not education
    'is_boilerplate_scaffold_template', # if true, then not education
    'is_bootcamp', # if true, then is education
    'is_bot', # if true, then not education
    'has_bounty_program', # if true, then not education
    'has_brand_icon_logo', # if true, then not education
    'is_cli_tool', # if true, then not education
    'is_library', # if true, then not education
    'is_course', # if true, then education
    'is_demo', # if true, then education
    'has_docs', # if true, then education
    'is_education_related', # if true, then education
    'is_eip_erc', # if true, then not education
    'has_examples', # if true, then education
    'is_feature_description', # if true, then education
    'is_starter_project', # if true, then education
    'is_guide', # if true, then education
    'is_hackathon_project', # if true, then education
    'is_hello_world', # if true, then education
    'uses_json_rpc', # if true, then not education
    'is_interview_related', # if true, then education
    'is_learning_material', # if true, then education
    'is_mcp_server', # if true, then not education
    'is_plugin', # if true, then not education
    'is_sample_project', # if true, then education
    'is_sdk', # if true, then not education
    'is_security_related', # if true, then not education
    'has_tests_testing', # if true, then not education
    'has_tips', # if true, then education
    'is_tooling', # if true, then not education
    'is_tutorial', # if true, then education
    'is_whitepaper', # if true, then education
    'is_workshop', # if true, then education
    'is_wrapper', # if true, then not education
    'is_experiment', # if true, then education
    'is_research', # if true, then education
    'name_is_example', # if true, then education
    'name_is_hello_world', # if true, then education
    'name_is_whitepaper', # if true, then education
    'name_is_tutorial', # if true, then education
    'name_is_boilerplate', # if true, then not education
    'name_is_scaffold', # if true, then not education
    'name_is_template', # if true, then not education
    'name_is_kit', # if true, then not education
    'name_is_starter', # if true, then education
    'name_is_getting_started', # if true, then education
    'name_is_quickstart', # if true, then education
    'name_is_guide', # if true, then education
    'name_is_hackathon', # if true, then education
    'name_is_bootcamp', # if true, then education
    'name_is_course', # if true, then education
    'name_is_workshop', # if true, then education
    'name_is_interview' # if true, then education
] 

# Design matrix: the selected feature columns, cast to float so that
# boolean flags become 0.0 / 1.0.
X = merged_df.loc[:, feature_columns].astype(float)

# Target vector: the is_scaffold label for the same rows.
y = merged_df.loc[:, 'is_scaffold']

In [16]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both sets keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# --- Model Training ---
# Logistic regression baseline; class_weight='balanced' reweights the classes
# inversely to their frequency in the training labels.
model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

In [18]:
## ----------------------------------------------------- Model Evaluation ------------------------------------------------- ##

In [19]:
# Make predictions on the test set
# NOTE: this uses whichever estimator `model` is bound to when the cell runs.
# A later cell rebinds `model` to a classifier fitted on scaled features, so
# re-running this cell after that one would score unscaled X_test with it.
y_pred = model.predict(X_test)

In [20]:
# Overall accuracy on the held-out set.
# NOTE(review): the recorded run has 89 negative vs 3 positive test rows, so
# accuracy is dominated by the majority class; read the per-class rows below.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}\n")

# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Model Accuracy: 0.9348

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.96      0.97        89
        True       0.20      0.33      0.25         3

    accuracy                           0.93        92
   macro avg       0.59      0.64      0.61        92
weighted avg       0.95      0.93      0.94        92



In [21]:
# --- Confusion Matrix with Labels ---
# Pin the label order explicitly. Without `labels`, the matrix shape and order
# are derived from whichever classes occur in y_test / y_pred, so the 2x2
# indexing and four-way unpacking below would break if a class were absent.
cm = confusion_matrix(y_test, y_pred, labels=[False, True])

# Rows are actual classes, columns are predicted classes, both ordered
# [False, True]; ravel() therefore yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix:")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(tn, fp))
print("       True     {:<8} {:<8}".format(fn, tp))
print("\n")

# Explanation of the terms
print(f"True Negatives (TN): {tn} - Correctly predicted not scaffold")
print(f"False Positives (FP): {fp} - Incorrectly predicted as scaffold")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not scaffold (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as scaffold")

Confusion Matrix:
                 Predicted
                 False    True
Actual False    85       4       
       True     2        1       


True Negatives (TN): 85 - Correctly predicted not scaffold
False Positives (FP): 4 - Incorrectly predicted as scaffold
False Negatives (FN): 2 - Incorrectly predicted as not scaffold (missed)
True Positives (TP): 1 - Correctly predicted as scaffold


In [22]:
# Locate the false negatives: test rows labeled positive but predicted negative.
actually_positive = y_test == 1
predicted_negative = y_pred == 0
fn_mask = actually_positive & predicted_negative

# Feature rows of the test set that the model missed.
false_negative_rows = X_test.loc[fn_mask]

# Their index labels map back to the full records in merged_df.
original_fn_rows = merged_df.loc[false_negative_rows.index]

# Show every column rather than pandas' truncated default view.
with pd.option_context('display.max_columns', None):
    print(
        "\n--- Rows from the test set that were False Negatives ---",
        original_fn_rows,
        sep="\n",
    )


--- Rows from the test set that were False Negatives ---
                                             repo             data_timestamp  \
221        https://github.com/0xZakk/starter-test 2025-06-09 20:18:11.606852   
192  https://github.com/arseneeth/aragon-tutorial 2025-06-09 20:18:11.606852   

     has_readme  name_is_example  name_is_hello_world  name_is_whitepaper  \
221       False            False                False               False   
192       False            False                False               False   

     name_is_tutorial  name_is_boilerplate  name_is_scaffold  \
221             False                False             False   
192              True                False             False   

     name_is_template  name_is_kit  name_is_starter  name_is_getting_started  \
221             False        False             True                    False   
192             False        False            False                    False   

     name_is_quickstart  name_is_gu

In [23]:
# Inspect feature influence via a logistic regression fitted on standardized
# features, so coefficient magnitudes are comparable across columns.
# NOTE(review): this rebinds `model` to a new classifier (default settings, no
# class_weight, fitted on scaled data), replacing the one evaluated above.
feature_names = X_train.columns

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression().fit(X_train_scaled, y_train)

# Single row of coefficients from the fitted model, one value per feature.
importance = model.coef_[0]

# Label each coefficient with its feature name, then order from the most
# positive to the most negative.
coefficients = pd.Series(data=importance, index=feature_names)
sorted_coefficients = coefficients.sort_values(ascending=False)

print("--- Most Influential Features ---")
print(sorted_coefficients)

--- Most Influential Features ---
is_boilerplate_scaffold_template    1.023628
name_is_boilerplate                 0.895787
name_is_template                    0.653833
has_readme                          0.582602
name_is_scaffold                    0.432372
is_hello_world                      0.428498
is_awesome_curated                  0.416744
is_workshop                         0.416507
is_tutorial                         0.159454
is_wrapper                          0.156352
has_tips                            0.151498
is_tooling                          0.145222
is_guide                            0.046954
is_cli_tool                         0.018579
has_bounty_program                  0.002797
is_learning_material                0.000614
is_education_related                0.000481
name_is_getting_started             0.000000
is_interview_related                0.000000
name_is_guide                       0.000000
name_is_quickstart                  0.000000
name_is_hello_world  