In [None]:
# train a simple classification model to predict if a repo is educational or not

## logistic

In [32]:
# run imports
from sqlalchemy import text
import sqlalchemy
import psycopg2
import os
import pandas as pd
from sqlalchemy import create_engine, text
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

In [2]:
# --- Database Connection and Feature Retrieval ---
cloud_sql_user = os.getenv("cloud_sql_user")
cloud_sql_password = os.getenv("cloud_sql_password")
cloud_sql_postgres_host = os.getenv("cloud_sql_postgres_host")
cloud_sql_postgres_db = os.getenv("cloud_sql_postgres_db")

# Construct the connection string
conn_str = (
    f"postgresql+psycopg2://"
    f"{cloud_sql_user}:{cloud_sql_password}@"
    f"{cloud_sql_postgres_host}/{cloud_sql_postgres_db}"
)

# Create the SQLAlchemy engine
try:
    cloud_sql_engine = create_engine(conn_str)
except Exception as e:
    raise Exception(f"Error creating database engine: {e}")

In [3]:
# Fetch the feature dataset from the database
try:
    with cloud_sql_engine.connect() as conn:
        query = text(
            """
            SELECT 
                repo,
                has_readme,
                is_collection_of_learnings,
                has_app_application,
                is_awesome_curated,
                has_benchmark,
                is_block_explorer,
                is_boilerplate_scaffold_template,
                is_bootcamp,
                is_bot,
                has_bounty_program,
                has_brand_icon_logo,
                is_cli_tool,
                is_library,
                is_course,
                is_demo,
                has_docs,
                is_education_related,
                is_eip_erc,
                has_examples,
                is_feature_description,
                is_starter_project,
                is_guide,
                is_hackathon_project,
                is_hello_world,
                uses_json_rpc,
                is_interview_related,
                is_learning_material,
                is_mcp_server,
                is_plugin,
                is_sample_project,
                is_sdk,
                is_security_related,
                has_tests_testing,
                has_tips,
                is_tooling,
                is_tutorial,
                is_whitepaper,
                is_workshop,
                is_wrapper,
                name_is_example,
                name_is_hello_world,
                name_is_whitepaper,
                name_is_tutorial,
                name_is_boilerplate,
                name_is_scaffold,
                name_is_template,
                name_is_kit,
                name_is_starter,
                name_is_getting_started,
                name_is_quickstart,
                name_is_guide,
                name_is_hackathon,
                name_is_bootcamp,
                name_is_course,
                name_is_workshop,
                name_is_interview

            FROM clean.project_repos_features
            """
        )
        repo_features_df = pd.read_sql(query, conn)
except Exception as e:
    raise Exception(f"Error fetching data from the database: {e}")

In [17]:
# --- Labeled Data Retrieval ---
gsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTbKlg5CJYNO3d2lrRbWWUy-71sr-oBApAJxCx2xmV931Y8CDrJ46SwVCUEoOT90LZsPpALVS_QixkE/pub?gid=1690796422&single=true&output=csv'
try:
    educational_df = pd.read_csv(gsheet_url)
except Exception as e:
    raise Exception(f"Error reading data from Google Sheets: {e}")

In [18]:
# --- Data Preparation ---
# Merge the feature data with the labeled data
merged_df = pd.merge(repo_features_df, educational_df, on='repo')

In [19]:
# --- Data Preparation ---
# Merge the feature data with the labeled data
merged_df = merged_df.dropna(subset=['is_educational'])

In [20]:
# print info about the merged dataframe
print(f"Merged dataframe info: {merged_df.info()}")

# print the first 5 rows of the merged dataframe
print(f"Merged dataframe first 5 rows: {merged_df.head()}")

# print the number of rows where is_dev_tooling is 1
print(f"Number of rows where is_educational is 1: {merged_df[merged_df['is_educational'] == 1].shape[0]}")

# print the number of rows where is_dev_tooling is 0
print(f"Number of rows where is_educational is 0: {merged_df[merged_df['is_educational'] == 0].shape[0]}")

# print the number of rows where is_dev_tooling is null
print(f"Number of rows where is_educational is null: {merged_df[merged_df['is_educational'].isnull()].shape[0]}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626 entries, 0 to 625
Data columns (total 59 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   repo                              626 non-null    object
 1   has_readme                        626 non-null    bool  
 2   is_collection_of_learnings        626 non-null    bool  
 3   has_app_application               626 non-null    bool  
 4   is_awesome_curated                626 non-null    bool  
 5   has_benchmark                     626 non-null    bool  
 6   is_block_explorer                 626 non-null    bool  
 7   is_boilerplate_scaffold_template  626 non-null    bool  
 8   is_bootcamp                       626 non-null    bool  
 9   is_bot                            626 non-null    bool  
 10  has_bounty_program                626 non-null    bool  
 11  has_brand_icon_logo               626 non-null    bool  
 12  is_cli_tool           

In [21]:
# Separate features (X) and the target variable (y)
feature_columns = [
    'has_readme', # if false, then not dev tooling
    'is_collection_of_learnings', # if true, then not dev tooling
    'has_app_application', # if true, then not dev tooling
    'is_awesome_curated', # if true, then not dev tooling
    'has_benchmark', # if true, then not dev tooling
    'is_block_explorer', # if true, then not dev tooling
    'is_boilerplate_scaffold_template', # if true, then not dev tooling
    'is_bootcamp', # if true, then not dev tooling
    'is_bot', # if true, then not dev tooling
    'has_bounty_program', # if true, then not dev tooling
    'has_brand_icon_logo', # if true, then not dev tooling
    'is_cli_tool', # if true, then dev tooling
    'is_library', # if true, then dev tooling
    'is_course', # if true, then not dev tooling
    'is_demo', # if true, then not dev tooling
    'has_docs', # if true, then not dev tooling
    'is_education_related', # if true, then not dev tooling
    'is_eip_erc', # if true, then not dev tooling
    'has_examples', # if true, then not dev tooling
    'is_feature_description', # if true, then not dev tooling
    'is_starter_project', # if true, then not dev tooling
    'is_guide', # if true, then not dev tooling
    'is_hackathon_project', # if true, then not dev tooling
    'is_hello_world', # if true, then not dev tooling
    'uses_json_rpc', # if true, then dev tooling
    'is_interview_related', # if true, then not dev tooling
    'is_learning_material', # if true, then not dev tooling
    'is_mcp_server', # if true, then not dev tooling
    'is_plugin', # if true, then dev tooling
    'is_sample_project', # if true, then not dev tooling
    'is_sdk', # if true, then dev tooling
    'is_security_related', # if true, then not dev tooling
    'has_tests_testing', # if true, then not dev tooling
    'has_tips', # if true, then not dev tooling
    'is_tooling', # if true, then dev tooling
    'is_tutorial', # if true, then not dev tooling
    'is_whitepaper', # if true, then not dev tooling
    'is_workshop', # if true, then not dev tooling
    'is_wrapper', # if true, then dev tooling
    'name_is_example', # if true, then not dev tooling
    'name_is_hello_world', # if true, then not dev tooling
    'name_is_whitepaper', # if true, then not dev tooling
    'name_is_tutorial', # if true, then not dev tooling
    'name_is_boilerplate', # if true, then not dev tooling
    'name_is_scaffold', # if true, then not dev tooling
    'name_is_template', # if true, then not dev tooling
    'name_is_kit', # if true, then dev tooling
    'name_is_starter', # if true, then not dev tooling
    'name_is_getting_started', # if true, then not dev tooling
    'name_is_quickstart', # if true, then not dev tooling
    'name_is_guide', # if true, then not dev tooling
    'name_is_hackathon', # if true, then not dev tooling
    'name_is_bootcamp', # if true, then not dev tooling
    'name_is_course', # if true, then not dev tooling
    'name_is_workshop', # if true, then not dev tooling
    'name_is_interview' # if true, then not dev tooling
] 

X = merged_df[feature_columns]

# Ensure all feature data is numeric (booleans will be treated as 0s and 1s)
X = X.astype(float)

y = merged_df['is_educational']

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
# --- Model Training ---
# Initialize and train a simple classification model (Logistic Regression)
model = LogisticRegression(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [24]:
## ----------------------------------------------------- Model Evaluation ------------------------------------------------- ##

In [25]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [26]:
# Calculate and print the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.8095

Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.92      0.89       103
        True       0.47      0.30      0.37        23

    accuracy                           0.81       126
   macro avg       0.66      0.61      0.63       126
weighted avg       0.78      0.81      0.79       126



In [28]:
# --- Confusion Matrix with Labels ---
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not educational")
print(f"False Positives (FP): {fp} - Incorrectly predicted as educational")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not educational (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as educational")

Confusion Matrix:
                 Predicted
                 False    True
Actual False    95       8       
       True     16       7       


True Negatives (TN): 95 - Correctly predicted not educational
False Positives (FP): 8 - Incorrectly predicted as educational
False Negatives (FN): 16 - Incorrectly predicted as not educational (missed)
True Positives (TP): 7 - Correctly predicted as educational


In [36]:
# Find the actual rows that were false negatives
# Create a boolean mask where the actual value was 1 but the prediction was 0
fn_mask = (y_test == 1) & (y_pred == 0)

# Use this mask to filter the original X_test DataFrame
false_negative_rows = X_test[fn_mask]

# use the index from X_test fn rows to lookup in the original df
original_fn_rows = merged_df.loc[false_negative_rows.index]

with pd.option_context('display.max_columns', None):
    print("\n--- Rows from the test set that were False Negatives ---")
    print(original_fn_rows)


--- Rows from the test set that were False Negatives ---
                                                  repo  has_readme  \
363  https://github.com/alchemyplatform/aa-sdk-userops       False   
2    https://github.com/Ackee-Blockchain/school-of-...       False   
246              https://github.com/bitpay/bitpay-lang        True   
22      https://github.com/Consensys/infura-onboarding       False   
549             https://github.com/alchemyplatform/lab        True   
212          https://github.com/MeteoraAg/cpi-examples        True   
269                      https://github.com/079035/PWN        True   
229  https://github.com/ava-labs/avalanche-starter-kit        True   
600  https://github.com/marcocastignoli/metacoin-so...        True   
496   https://github.com/Ankr-network/ankr-docs-nextra        True   
41   https://github.com/alchemyplatform/get-transac...       False   
40      https://github.com/maweiche/scoreboard_program        True   
402      https://github.com/Cons

In [43]:
# X_train is our feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Get the importance (coefficients)
importance = model.coef_[0]

# Get the feature names from original DataFrame
feature_names = X_train.columns

# Create a pandas Series to pair feature names with their coefficients
coefficients = pd.Series(importance, index=feature_names)

# Sort the coefficients to see the most influential features
sorted_coefficients = coefficients.sort_values(ascending=False)

print("--- Most Influential Features ---")
print(sorted_coefficients)

--- Most Influential Features ---
is_tutorial                         0.702626
is_awesome_curated                  0.639909
name_is_example                     0.555791
has_examples                        0.533405
is_sdk                              0.436443
is_hello_world                      0.409113
is_starter_project                  0.379107
is_demo                             0.372132
is_sample_project                   0.362215
name_is_quickstart                  0.358789
is_feature_description              0.354419
name_is_hackathon                   0.323185
name_is_whitepaper                  0.314625
is_learning_material                0.312939
name_is_interview                   0.272130
is_course                           0.240639
is_education_related                0.189767
has_docs                            0.175075
is_bootcamp                         0.174206
is_hackathon_project                0.128153
name_is_hello_world                 0.127749
name_is_tutorial     