In [None]:
# train a simple classification model to predict if a repo is educational or not

## random forest

In [48]:
# run imports
from sqlalchemy import text
import sqlalchemy
import psycopg2
import os
import pandas as pd
from sqlalchemy import create_engine, text
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline # Use a pipeline to prevent data leakage
import os

In [49]:
# --- Database Connection and Feature Retrieval ---
# Connection settings come from the environment so that no credentials are
# hard-coded in the notebook.
cloud_sql_user = os.getenv("cloud_sql_user")
cloud_sql_password = os.getenv("cloud_sql_password")
cloud_sql_postgres_host = os.getenv("cloud_sql_postgres_host")
cloud_sql_postgres_db = os.getenv("cloud_sql_postgres_db")

# Fail fast on unset variables: os.getenv returns None for a missing name, and
# None would otherwise be formatted into the URL as the literal text "None",
# only surfacing later as a confusing connection/authentication error.
required_settings = {
    "cloud_sql_user": cloud_sql_user,
    "cloud_sql_password": cloud_sql_password,
    "cloud_sql_postgres_host": cloud_sql_postgres_host,
    "cloud_sql_postgres_db": cloud_sql_postgres_db,
}
missing_settings = [name for name, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_settings)
    )

# Construct the connection string
# NOTE(review): the values are interpolated verbatim; a user name or password
# containing characters such as '@', ':' or '/' would need URL-escaping first.
conn_str = (
    f"postgresql+psycopg2://"
    f"{cloud_sql_user}:{cloud_sql_password}@"
    f"{cloud_sql_postgres_host}/{cloud_sql_postgres_db}"
)

# Create the SQLAlchemy engine
# (the first real connection is opened by the cell that calls .connect())
try:
    cloud_sql_engine = create_engine(conn_str)
except Exception as e:
    # Chain the original exception so its traceback is kept as the explicit cause
    raise Exception(f"Error creating database engine: {e}") from e

In [50]:
# Fetch the feature dataset from the database
# Result: one `repo` key column plus the feature flags listed below, loaded
# into repo_features_df. The connection is closed when the `with` block exits.
try:
    with cloud_sql_engine.connect() as conn:
        query = text(
            """
            SELECT 
                repo,
                has_readme,
                is_collection_of_learnings,
                has_app_application,
                is_awesome_curated,
                has_benchmark,
                is_block_explorer,
                is_boilerplate_scaffold_template,
                is_bootcamp,
                is_bot,
                has_bounty_program,
                has_brand_icon_logo,
                is_cli_tool,
                is_library,
                is_course,
                is_demo,
                has_docs,
                is_education_related,
                is_eip_erc,
                has_examples,
                is_feature_description,
                is_starter_project,
                is_guide,
                is_hackathon_project,
                is_hello_world,
                uses_json_rpc,
                is_interview_related,
                is_learning_material,
                is_mcp_server,
                is_plugin,
                is_sample_project,
                is_sdk,
                is_security_related,
                has_tests_testing,
                has_tips,
                is_tooling,
                is_tutorial,
                is_whitepaper,
                is_workshop,
                is_wrapper,
                is_experiment,
                is_research,
                name_is_example,
                name_is_hello_world,
                name_is_whitepaper,
                name_is_tutorial,
                name_is_boilerplate,
                name_is_scaffold,
                name_is_template,
                name_is_kit,
                name_is_starter,
                name_is_getting_started,
                name_is_quickstart,
                name_is_guide,
                name_is_hackathon,
                name_is_bootcamp,
                name_is_course,
                name_is_workshop,
                name_is_interview

            FROM clean.project_repos_features
            """
        )
        repo_features_df = pd.read_sql(query, conn)
except Exception as e:
    # Chain the original exception so the driver/SQL error stays attached as the cause
    raise Exception(f"Error fetching data from the database: {e}") from e

In [51]:
# --- Labeled Data Retrieval ---
# Labels are read from a Google Sheet exposed as CSV. The merge and target
# cells further down rely on it providing `repo` and `is_educational` columns.
gsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTbKlg5CJYNO3d2lrRbWWUy-71sr-oBApAJxCx2xmV931Y8CDrJ46SwVCUEoOT90LZsPpALVS_QixkE/pub?gid=1690796422&single=true&output=csv'
try:
    educational_df = pd.read_csv(gsheet_url)
except Exception as e:
    # Chain the original exception so the network/parse error stays attached as the cause
    raise Exception(f"Error reading data from Google Sheets: {e}") from e

In [52]:
# --- Data Preparation ---
# Join features to labels on the shared `repo` key. The join is an inner join
# (the pandas default), so only repos present in both frames are kept.
merged_df = repo_features_df.merge(educational_df, how='inner', on='repo')

In [53]:
# --- Data Preparation ---
# Drop rows that have no label: a repo without an is_educational value cannot
# be used for supervised training or evaluation
merged_df = merged_df.dropna(subset=['is_educational'])

In [54]:
# print info about the merged dataframe
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own line; interpolating it into an f-string would print the
# report followed by a misleading "Merged dataframe info: None".
print("Merged dataframe info:")
merged_df.info()

# print the first 5 rows of the merged dataframe
print(f"Merged dataframe first 5 rows: {merged_df.head()}")

# print the number of rows where is_educational is 1
print(f"Number of rows where is_educational is 1: {merged_df[merged_df['is_educational'] == 1].shape[0]}")

# print the number of rows where is_educational is 0
print(f"Number of rows where is_educational is 0: {merged_df[merged_df['is_educational'] == 0].shape[0]}")

# print the number of rows where is_educational is null
# (should be 0 once the unlabeled rows have been dropped)
print(f"Number of rows where is_educational is null: {merged_df[merged_df['is_educational'].isnull()].shape[0]}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 61 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   repo                              640 non-null    object
 1   has_readme                        640 non-null    bool  
 2   is_collection_of_learnings        640 non-null    bool  
 3   has_app_application               640 non-null    bool  
 4   is_awesome_curated                640 non-null    bool  
 5   has_benchmark                     640 non-null    bool  
 6   is_block_explorer                 640 non-null    bool  
 7   is_boilerplate_scaffold_template  640 non-null    bool  
 8   is_bootcamp                       640 non-null    bool  
 9   is_bot                            640 non-null    bool  
 10  has_bounty_program                640 non-null    bool  
 11  has_brand_icon_logo               640 non-null    bool  
 12  is_cli_tool           

In [55]:
# Separate features (X) and the target variable (y)
#
# The trailing comment on each column records the labelling heuristic the
# feature was designed around (which class a value is expected to point to).
# These are notes only; nothing in the code enforces them. The column order is
# kept fixed because it determines the column order of the training matrix.
feature_columns = [
    "has_readme",                        # False -> not educational
    "is_collection_of_learnings",        # True -> educational
    "has_app_application",               # True -> not educational
    "is_awesome_curated",                # True -> educational
    "has_benchmark",                     # True -> not educational
    "is_block_explorer",                 # True -> not educational
    "is_boilerplate_scaffold_template",  # True -> not educational
    "is_bootcamp",                       # True -> educational
    "is_bot",                            # True -> not educational
    "has_bounty_program",                # True -> not educational
    "has_brand_icon_logo",               # True -> not educational
    "is_cli_tool",                       # True -> not educational
    "is_library",                        # True -> not educational
    "is_course",                         # True -> educational
    "is_demo",                           # True -> educational
    "has_docs",                          # True -> educational
    "is_education_related",              # True -> educational
    "is_eip_erc",                        # True -> not educational
    "has_examples",                      # True -> educational
    "is_feature_description",            # True -> educational
    "is_starter_project",                # True -> educational
    "is_guide",                          # True -> educational
    "is_hackathon_project",              # True -> educational
    "is_hello_world",                    # True -> educational
    "uses_json_rpc",                     # True -> not educational
    "is_interview_related",              # True -> educational
    "is_learning_material",              # True -> educational
    "is_mcp_server",                     # True -> not educational
    "is_plugin",                         # True -> not educational
    "is_sample_project",                 # True -> educational
    "is_sdk",                            # True -> not educational
    "is_security_related",               # True -> not educational
    "has_tests_testing",                 # True -> not educational
    "has_tips",                          # True -> educational
    "is_tooling",                        # True -> not educational
    "is_tutorial",                       # True -> educational
    "is_whitepaper",                     # True -> educational
    "is_workshop",                       # True -> educational
    "is_wrapper",                        # True -> not educational
    "is_experiment",                     # True -> educational
    "is_research",                       # True -> educational
    "name_is_example",                   # True -> educational
    "name_is_hello_world",               # True -> educational
    "name_is_whitepaper",                # True -> educational
    "name_is_tutorial",                  # True -> educational
    "name_is_boilerplate",               # True -> not educational
    "name_is_scaffold",                  # True -> not educational
    "name_is_template",                  # True -> not educational
    "name_is_kit",                       # True -> not educational
    "name_is_starter",                   # True -> educational
    "name_is_getting_started",           # True -> educational
    "name_is_quickstart",                # True -> educational
    "name_is_guide",                     # True -> educational
    "name_is_hackathon",                 # True -> educational
    "name_is_bootcamp",                  # True -> educational
    "name_is_course",                    # True -> educational
    "name_is_workshop",                  # True -> educational
    "name_is_interview",                 # True -> educational
]

# Feature matrix, cast to float so every column is numeric
# (boolean flags become 0.0 / 1.0)
X = merged_df[feature_columns].astype(float)

# Target vector: the manually assigned label
y = merged_df["is_educational"]

In [56]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so the
# train and test sets keep the same class ratio, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [57]:
## ----------------------------------------------------- Model Training ------------------------------------------------- ##

In [58]:
# set the n_estimators param
# (number of trees per forest; the same value is passed to every
# RandomForestClassifier in this notebook so the variants are compared at
# equal forest size)
n_estimators = 500

In [59]:
# Baseline model: a random forest of `n_estimators` trees.
# class_weight='balanced' reweights the classes inversely to their frequency in
# the training labels; the fixed seed makes the fitted forest reproducible.
model_balanced = RandomForestClassifier(
    n_estimators=n_estimators,
    class_weight='balanced',
    random_state=42,
)
model_balanced.fit(X_train, y_train)

In [60]:
## ----------------------------------------------------- Model Evaluation ------------------------------------------------- ##

In [61]:
# Make predictions on the test set
# (hard class labels from the class_weight='balanced' forest; reused by the
# accuracy, classification-report and confusion-matrix cells that follow)
y_pred_class_weight_balanced = model_balanced.predict(X_test)

In [62]:
# Overall accuracy of the balanced-weight forest on the held-out test set
accuracy = accuracy_score(y_test, y_pred_class_weight_balanced)
print("Model Accuracy: {:.4f}\n".format(accuracy))

# Per-class precision / recall / F1 breakdown
report_text = classification_report(y_test, y_pred_class_weight_balanced)
print("Classification Report:", report_text, sep="\n")

Model Accuracy: 0.8281

Classification Report:
              precision    recall  f1-score   support

       False       0.83      0.98      0.90        98
        True       0.83      0.33      0.48        30

    accuracy                           0.83       128
   macro avg       0.83      0.66      0.69       128
weighted avg       0.83      0.83      0.80       128



In [63]:
# --- Confusion Matrix with Labels ---
# Rows are the actual classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred_class_weight_balanced)

print("Confusion Matrix (class weight = balanced):")
print("                 Predicted")
print("                 False    True")
print(f"Actual False    {cm[0][0]:<8} {cm[0][1]:<8}")
print(f"       True     {cm[1][0]:<8} {cm[1][1]:<8}")
print("\n")

# Flatten the 2x2 matrix row by row and spell out what each cell means
tn, fp, fn, tp = cm.ravel()
for cell_name, cell_count, cell_meaning in (
    ("True Negatives (TN)", tn, "Correctly predicted not educational"),
    ("False Positives (FP)", fp, "Incorrectly predicted as educational"),
    ("False Negatives (FN)", fn, "Incorrectly predicted as not educational (missed)"),
    ("True Positives (TP)", tp, "Correctly predicted as educational"),
):
    print(f"{cell_name}: {cell_count} - {cell_meaning}")

Confusion Matrix (class weight = balanced):
                 Predicted
                 False    True
Actual False    96       2       
       True     20       10      


True Negatives (TN): 96 - Correctly predicted not educational
False Positives (FP): 2 - Incorrectly predicted as educational
False Negatives (FN): 20 - Incorrectly predicted as not educational (missed)
True Positives (TP): 10 - Correctly predicted as educational


In [64]:
# Column names of the training matrix, in the order the forest saw them
feature_names = X_train.columns

# Importance score of each feature in the fitted balanced-weight forest
importance = model_balanced.feature_importances_

# Label each score with its feature name, then rank from most to least important
feat_importances = pd.Series(data=importance, index=feature_names)
sorted_importances = feat_importances.sort_values(ascending=False)

# Print the ranked list under a header
print("--- Feature Importances (Sorted) ---", sorted_importances, sep="\n")

--- Feature Importances (Sorted) ---
is_bootcamp                         0.068801
is_demo                             0.049629
has_examples                        0.047536
name_is_tutorial                    0.047356
has_app_application                 0.045976
has_docs                            0.045536
has_readme                          0.042858
has_tests_testing                   0.040021
name_is_bootcamp                    0.039943
is_library                          0.037490
name_is_interview                   0.033234
name_is_example                     0.032555
is_cli_tool                         0.030680
is_starter_project                  0.028583
is_tooling                          0.027334
is_sample_project                   0.026928
is_security_related                 0.026628
is_learning_material                0.024780
is_eip_erc                          0.022396
is_course                           0.022204
is_tutorial                         0.019298
is_education_relat

In [65]:
# Get the predicted probabilities for the test set
# We only need the probability of the positive class, which is taken from
# column 1 of predict_proba's output
y_pred_proba = model_balanced.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]

# Set a new, lower threshold to be more "aggressive"
# NOTE(review): this threshold is judged against the same test set that is used
# for the final metrics; choosing it on a separate validation split would avoid
# an optimistic estimate.
new_threshold = 0.4  # Example: predict True if probability is 40% or more

# Apply the new threshold to get aggressive predictions
y_pred_aggressive = (positive_class_proba >= new_threshold).astype(int)

# The headers are built from new_threshold so the printed value can never
# disagree with the threshold that was actually applied
print(f"\n--- Aggressive Predictions (Threshold = {new_threshold}) ---")
cm = confusion_matrix(y_test, y_pred_aggressive)
print(classification_report(y_test, y_pred_aggressive))

print(f"Confusion Matrix (Lowered threshold = {new_threshold}):")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not educational")
print(f"False Positives (FP): {fp} - Incorrectly predicted as educational")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not educational (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as educational")


--- Aggressive Predictions (Threshold = 0.4) ---
              precision    recall  f1-score   support

       False       0.84      0.97      0.90        98
        True       0.80      0.40      0.53        30

    accuracy                           0.84       128
   macro avg       0.82      0.68      0.72       128
weighted avg       0.83      0.84      0.81       128

Confusion Matrix (Lowered threshold = 0.4):
                 Predicted
                 False    True
Actual False    95       3       
       True     18       12      


True Negatives (TN): 95 - Correctly predicted not educational
False Positives (FP): 3 - Incorrectly predicted as educational
False Negatives (FN): 18 - Incorrectly predicted as not educational (missed)
True Positives (TP): 12 - Correctly predicted as educational


In [None]:
## ------------------------------------------ does smote improve the results? ------------------------------- ##

In [66]:
# Oversampling step: SMOTE with the default 'auto' strategy, seeded for repeatability
# NOTE(review): every feature here is a 0/1 flag cast to float, and SMOTE
# synthesises samples by interpolation, so synthetic rows may hold fractional
# values. Confirm that is acceptable for this data.
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Same forest configuration as before, but without class_weight, because the
# resampling step is what addresses the class imbalance here
model_smote = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

# Chain the two steps in an imblearn pipeline: oversample first, then classify
pipeline = Pipeline(steps=[('smote', smote), ('classifier', model_smote)])

# Fit on the training split only, then predict the untouched test split
pipeline.fit(X_train, y_train)
y_pred_smote = pipeline.predict(X_test)

In [67]:
# Overall accuracy of the SMOTE pipeline on the held-out test set
accuracy = accuracy_score(y_test, y_pred_smote)
print("Model Accuracy: {:.4f}\n".format(accuracy))

# Per-class precision / recall / F1 breakdown
report_text = classification_report(y_test, y_pred_smote)
print("Classification Report:", report_text, sep="\n")

Model Accuracy: 0.8359

Classification Report:
              precision    recall  f1-score   support

       False       0.83      0.99      0.90        98
        True       0.91      0.33      0.49        30

    accuracy                           0.84       128
   macro avg       0.87      0.66      0.70       128
weighted avg       0.85      0.84      0.81       128



In [68]:
# --- Confusion Matrix with Labels ---
# Rows are the actual classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred_smote)

print("Confusion Matrix (using smote):")
print("                 Predicted")
print("                 False    True")
print(f"Actual False    {cm[0][0]:<8} {cm[0][1]:<8}")
print(f"       True     {cm[1][0]:<8} {cm[1][1]:<8}")
print("\n")

# Flatten the 2x2 matrix row by row and spell out what each cell means
tn, fp, fn, tp = cm.ravel()
for cell_name, cell_count, cell_meaning in (
    ("True Negatives (TN)", tn, "Correctly predicted not educational"),
    ("False Positives (FP)", fp, "Incorrectly predicted as educational"),
    ("False Negatives (FN)", fn, "Incorrectly predicted as not educational (missed)"),
    ("True Positives (TP)", tp, "Correctly predicted as educational"),
):
    print(f"{cell_name}: {cell_count} - {cell_meaning}")

Confusion Matrix (using smote):
                 Predicted
                 False    True
Actual False    97       1       
       True     20       10      


True Negatives (TN): 97 - Correctly predicted not educational
False Positives (FP): 1 - Incorrectly predicted as educational
False Negatives (FN): 20 - Incorrectly predicted as not educational (missed)
True Positives (TP): 10 - Correctly predicted as educational


In [None]:
## ------------------------------------------------ does penalizing false negatives help? ------------------------ ##

In [69]:
# Weight the positive class 5x relative to the negative class, so that errors
# on positive samples are penalised more heavily
# NOTE(review): the keys are 0/1 while the classification reports show the
# labels as False/True, and the metrics printed for this model are identical to
# the class_weight='balanced' run. Confirm the weights are applied to the
# intended classes.
manual_weights = {0: 1, 1: 5}

# Build the forest with the manual weights and fit it on the training split
model_penalize = RandomForestClassifier(
    n_estimators=n_estimators,
    class_weight=manual_weights,
    random_state=42,
).fit(X_train, y_train)

# Test-set predictions, used by the report cells below
y_pred_penalize = model_penalize.predict(X_test)

In [70]:
# Overall accuracy of the manually weighted forest on the held-out test set
accuracy = accuracy_score(y_test, y_pred_penalize)
print("Model Accuracy: {:.4f}\n".format(accuracy))

# Per-class precision / recall / F1 breakdown
report_text = classification_report(y_test, y_pred_penalize)
print("Classification Report:", report_text, sep="\n")

Model Accuracy: 0.8281

Classification Report:
              precision    recall  f1-score   support

       False       0.83      0.98      0.90        98
        True       0.83      0.33      0.48        30

    accuracy                           0.83       128
   macro avg       0.83      0.66      0.69       128
weighted avg       0.83      0.83      0.80       128



In [71]:
# --- Confusion Matrix with Labels ---
# Rows are the actual classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred_penalize)

print("Confusion Matrix (penalize):")
print("                 Predicted")
print("                 False    True")
print(f"Actual False    {cm[0][0]:<8} {cm[0][1]:<8}")
print(f"       True     {cm[1][0]:<8} {cm[1][1]:<8}")
print("\n")

# Flatten the 2x2 matrix row by row and spell out what each cell means
tn, fp, fn, tp = cm.ravel()
for cell_name, cell_count, cell_meaning in (
    ("True Negatives (TN)", tn, "Correctly predicted not educational"),
    ("False Positives (FP)", fp, "Incorrectly predicted as educational"),
    ("False Negatives (FN)", fn, "Incorrectly predicted as not educational (missed)"),
    ("True Positives (TP)", tp, "Correctly predicted as educational"),
):
    print(f"{cell_name}: {cell_count} - {cell_meaning}")

Confusion Matrix (penalize):
                 Predicted
                 False    True
Actual False    96       2       
       True     20       10      


True Negatives (TN): 96 - Correctly predicted not educational
False Positives (FP): 2 - Incorrectly predicted as educational
False Negatives (FN): 20 - Incorrectly predicted as not educational (missed)
True Positives (TP): 10 - Correctly predicted as educational
