In [None]:
# train a simple classification model to predict if a repo is developer tooling or not

## random forest

In [26]:
# run imports
from sqlalchemy import text
import sqlalchemy
import psycopg2
import os
import pandas as pd
from sqlalchemy import create_engine, text
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline # Use a pipeline to prevent data leakage
import os

In [27]:
# --- Database Connection and Feature Retrieval ---
cloud_sql_user = os.getenv("cloud_sql_user")
cloud_sql_password = os.getenv("cloud_sql_password")
cloud_sql_postgres_host = os.getenv("cloud_sql_postgres_host")
cloud_sql_postgres_db = os.getenv("cloud_sql_postgres_db")

# Construct the connection string
conn_str = (
    f"postgresql+psycopg2://"
    f"{cloud_sql_user}:{cloud_sql_password}@"
    f"{cloud_sql_postgres_host}/{cloud_sql_postgres_db}"
)

# Create the SQLAlchemy engine
try:
    cloud_sql_engine = create_engine(conn_str)
except Exception as e:
    raise Exception(f"Error creating database engine: {e}")

In [28]:
# Fetch the feature dataset from the database
try:
    with cloud_sql_engine.connect() as conn:
        query = text(
            """
            SELECT *
            FROM clean.project_repos_features
            """
        )
        repo_features_df = pd.read_sql(query, conn)
except Exception as e:
    raise Exception(f"Error fetching data from the database: {e}")

In [29]:
# --- Labeled Data Retrieval ---
gsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSTIjEmhgSpvITvd8BdnttCmGD05bylP9PDZW0WaeahdL0C2Fxfh5dZcd1-EmhbP_M2BJydgA81aKy1/pub?gid=1690796422&single=true&output=csv'
try:
    dev_tooling_df = pd.read_csv(gsheet_url)
except Exception as e:
    raise Exception(f"Error reading data from Google Sheets: {e}")

In [30]:
# --- Data Preparation ---
# Merge the feature data with the labeled data
merged_df = pd.merge(repo_features_df, dev_tooling_df, on='repo')

In [31]:
# drop rows where is_educational is null
merged_df = merged_df.dropna(subset=['is_dev_tooling'])

In [32]:
# print info about the merged dataframe
print(f"Merged dataframe info: {merged_df.info()}")

# print the first 5 rows of the merged dataframe
print(f"Merged dataframe first 5 rows: {merged_df.head()}")

# print the number of rows where is_dev_tooling is 1
print(f"Number of rows where is_dev_tooling is 1: {merged_df[merged_df['is_dev_tooling'] == 1].shape[0]}")

# print the number of rows where is_dev_tooling is 0
print(f"Number of rows where is_dev_tooling is 0: {merged_df[merged_df['is_dev_tooling'] == 0].shape[0]}")

# print the number of rows where is_dev_tooling is null
print(f"Number of rows where is_dev_tooling is null: {merged_df[merged_df['is_dev_tooling'].isnull()].shape[0]}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588 entries, 0 to 587
Data columns (total 75 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   repo                              588 non-null    object        
 1   data_timestamp                    588 non-null    datetime64[ns]
 2   is_fork                           588 non-null    bool          
 3   is_tooling                        588 non-null    bool          
 4   is_wrapper                        588 non-null    bool          
 5   name_is_interview                 588 non-null    bool          
 6   has_brand_icon_logo               588 non-null    bool          
 7   is_learning_material              588 non-null    bool          
 8   name_is_hackathon                 588 non-null    bool          
 9   is_tutorial                       588 non-null    bool          
 10  is_experiment                     588 non-null    

In [33]:
# Separate features (X) and the target variable (y)
feature_columns = [
    'has_readme', # if false, then false
    'has_description', # if false, then false
    'is_collection_of_learnings',
    'has_app_application', 
    'is_awesome_curated', 
    'has_benchmark', 
    'is_block_explorer', 
    'is_boilerplate_scaffold_template', # if true, then true
    'is_bootcamp', 
    'is_bot', 
    'has_bounty_program', 
    'has_brand_icon_logo', 
    'is_cli_tool', # if true, then true
    'is_library', # if true, then true
    'is_course', 
    'is_demo', 
    'has_docs', 
    'is_education_related', 
    'is_eip_erc', 
    'has_examples', 
    'is_feature_description', 
    'is_starter_project', 
    'is_guide', 
    'is_hackathon_project', 
    'is_hello_world', 
    'uses_json_rpc', # if true, then true; to do: update training data to point rpc to infra instead of dev tooling
    'is_interview_related', 
    'is_learning_material', 
    'is_mcp_server', 
    'is_plugin', # if true, then true
    'is_sample_project', 
    'is_sdk', # if true, then true
    'is_security_related', 
    'has_tests_testing', 
    'has_tips', 
    'is_tooling', # if true, then true
    'is_tutorial', 
    'is_whitepaper', 
    'is_workshop', 
    'is_wrapper', 
    'is_experiment',
    'is_research',
    'is_fork',
    'name_is_example', 
    'name_is_hello_world', 
    'name_is_whitepaper', 
    'name_is_tutorial', 
    'name_is_boilerplate', # if true, then true
    'name_is_scaffold', # if true, then true
    'name_is_template', # if true, then true
    'name_is_kit', # if true, then true
    'name_is_starter', 
    'name_is_getting_started', 
    'name_is_quickstart', 
    'name_is_guide', 
    'name_is_hackathon', 
    'name_is_bootcamp', 
    'name_is_course', 
    'name_is_workshop', 
    'name_is_interview', 
    'pm_has_main_entrypoint', # if true, then true
    'pm_has_bin_script', # if true, then true
    'pm_has_dependencies', # if true, then true
    'pm_has_version_control', # if true, then true
    'pm_has_author_cited', # if true, then true
    'pm_has_license', # if true, then true
    'pm_has_repository', # if true, then true
    # for dev tooling we introduce predicted values from other classification models
    # note, this means order of execution matters
    'predicted_is_scaffold',
    'predicted_is_educational'
] 

X = merged_df[feature_columns]

# Ensure all feature data is numeric (booleans will be treated as 0s and 1s)
X = X.astype(float)

y = merged_df['is_dev_tooling']

In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
## ----------------------------------------------------- Model Training ------------------------------------------------- ##

In [36]:
# set the n_estimators param
n_estimators = 750

In [37]:
# Initialize and train a Random Forest model
# n_estimators is the number of trees in the forest
model_balanced = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=n_estimators)
model_balanced.fit(X_train, y_train)

In [38]:
## ----------------------------------------------------- Model Evaluation ------------------------------------------------- ##

In [39]:
# Make predictions on the test set
y_pred_balanced = model_balanced.predict(X_test)

In [40]:
# Calculate and print the model's accuracy
accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Model Accuracy (balanced model): {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report (balanced model):")
print(classification_report(y_test, y_pred_balanced))

Model Accuracy (balanced model): 0.8729

Classification Report (balanced model):
              precision    recall  f1-score   support

       False       0.89      0.94      0.92        89
        True       0.79      0.66      0.72        29

    accuracy                           0.87       118
   macro avg       0.84      0.80      0.82       118
weighted avg       0.87      0.87      0.87       118



In [41]:
# --- Confusion Matrix with Labels ---
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred_balanced)

print("Confusion Matrix (balanced model)")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not dev tooling")
print(f"False Positives (FP): {fp} - Incorrectly predicted as dev tooling")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not dev tooling (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as dev tooling")

Confusion Matrix (balanced model)
                 Predicted
                 False    True
Actual False    84       5       
       True     10       19      


True Negatives (TN): 84 - Correctly predicted not dev tooling
False Positives (FP): 5 - Incorrectly predicted as dev tooling
False Negatives (FN): 10 - Incorrectly predicted as not dev tooling (missed)
True Positives (TP): 19 - Correctly predicted as dev tooling


In [42]:
## ---------------------------- feature importance

In [43]:
# Get importance scores
importance = model_balanced.feature_importances_

# Get the column names from training data
feature_names = X_train.columns

# Create a pandas Series to pair feature names with their importance scores
feat_importances = pd.Series(importance, index=feature_names)

# Sort the Series in descending order (most important features first)
sorted_importances = feat_importances.sort_values(ascending=False)

# 4. Print the sorted list
print("--- Feature Importances (Sorted) ---")
print(sorted_importances)

--- Feature Importances (Sorted) ---
predicted_is_educational      0.094141
predicted_is_scaffold         0.089532
pm_has_repository             0.071577
is_fork                       0.045256
has_description               0.043933
                                ...   
name_is_hello_world           0.000060
is_collection_of_learnings    0.000000
name_is_getting_started       0.000000
name_is_workshop              0.000000
name_is_guide                 0.000000
Length: 69, dtype: float64


In [44]:
## ------------------------------------------------------ tuning -------------------------------------------------- ##

In [45]:
# --- Model Evaluation with Adjusted Threshold ---

# Get the predicted probabilities for the 'True' class
# The second column [:, 1] corresponds to the probability of the positive class (True)
y_pred_proba = model_balanced.predict_proba(X_test)[:, 1]

# Set a new, lower threshold
# Start with something like 0.4 and see how it changes the result
new_threshold = 0.4

# Classify as True if the probability is above the new threshold
y_pred_adjusted = (y_pred_proba >= new_threshold).astype(int)


# --- re-evaluate using the adjusted predictions ---
print(f"Results for threshold = {new_threshold}\n")

# Calculate and print the model's accuracy
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Model Accuracy: {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_adjusted))

# Print a confusion matrix
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred_adjusted)

print("Confusion Matrix (w/lower threshold for match):")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not dev tooling")
print(f"False Positives (FP): {fp} - Incorrectly predicted as dev tooling")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not dev tooling (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as dev tooling")

Results for threshold = 0.4

Model Accuracy: 0.8475

Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.88      0.90        89
        True       0.67      0.76      0.71        29

    accuracy                           0.85       118
   macro avg       0.79      0.82      0.80       118
weighted avg       0.86      0.85      0.85       118

Confusion Matrix:
Confusion Matrix (w/lower threshold for match):
                 Predicted
                 False    True
Actual False    78       11      
       True     7        22      


True Negatives (TN): 78 - Correctly predicted not dev tooling
False Positives (FP): 11 - Incorrectly predicted as dev tooling
False Negatives (FN): 7 - Incorrectly predicted as not dev tooling (missed)
True Positives (TP): 22 - Correctly predicted as dev tooling


In [46]:
# Try giving the minority class ('True') a much higher weight
# The ratio is ~4:1, so let's try a weight of 4 or 5 for the 'True' class.
model_manual_weights = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 5}, n_estimators=n_estimators)
model_manual_weights.fit(X_train, y_train)

# Make predictions on the test set
y_pred_manual_weight = model_manual_weights.predict(X_test)

# Calculate and print the model's accuracy
accuracy = accuracy_score(y_test, y_pred_manual_weight)
print(f"Model Accuracy: {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_manual_weight))

# --- Confusion Matrix with Labels ---
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred_manual_weight)

print("Confusion Matrix (with manual weights):")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not dev tooling")
print(f"False Positives (FP): {fp} - Incorrectly predicted as dev tooling")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not dev tooling (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as dev tooling")

Model Accuracy: 0.8475

Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.91      0.90        89
        True       0.70      0.66      0.68        29

    accuracy                           0.85       118
   macro avg       0.80      0.78      0.79       118
weighted avg       0.84      0.85      0.85       118

Confusion Matrix (with manual weights):
                 Predicted
                 False    True
Actual False    81       8       
       True     10       19      


True Negatives (TN): 81 - Correctly predicted not dev tooling
False Positives (FP): 8 - Incorrectly predicted as dev tooling
False Negatives (FN): 10 - Incorrectly predicted as not dev tooling (missed)
True Positives (TP): 19 - Correctly predicted as dev tooling


In [47]:
## ------------------------------------------ does smote improve the results? ------------------------------- ##

In [48]:
print("Checking for NaN values...")
nan_counts = X_train.isnull().sum()

# Print columns that HAVE NaN values
print(nan_counts[nan_counts > 0])

Checking for NaN values...
Series([], dtype: int64)


In [49]:
# Define the model (again) -- this time leave out class_weight
model_smote = RandomForestClassifier(random_state=42, n_estimators=n_estimators) # No need for class_weight here

# Define the resampling method
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Create a pipeline that first applies SMOTE, then trains the model
pipeline = Pipeline([('smote', smote), ('classifier', model_smote)])

# Train the model on the resampled data
pipeline.fit(X_train, y_train)

# Now use the pipeline to predict
y_pred_smote = pipeline.predict(X_test)

In [50]:
# Calculate and print the model's accuracy
accuracy = accuracy_score(y_test, y_pred_smote)
print(f"Model Accuracy: {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_smote))

Model Accuracy: 0.8644

Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.92      0.91        89
        True       0.74      0.69      0.71        29

    accuracy                           0.86       118
   macro avg       0.82      0.81      0.81       118
weighted avg       0.86      0.86      0.86       118



In [51]:
# --- Confusion Matrix with Labels ---
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred_smote)

print("Confusion Matrix (using smote):")
print("                 Predicted")
print("                 False    True")
print("Actual False    {:<8} {:<8}".format(cm[0][0], cm[0][1]))
print("       True     {:<8} {:<8}".format(cm[1][0], cm[1][1]))
print("\n")

# Explanation of the terms
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn} - Correctly predicted not scaffold")
print(f"False Positives (FP): {fp} - Incorrectly predicted as scaffold")
print(f"False Negatives (FN): {fn} - Incorrectly predicted as not scaffold (missed)")
print(f"True Positives (TP): {tp} - Correctly predicted as scaffold")

Confusion Matrix (using smote):
                 Predicted
                 False    True
Actual False    82       7       
       True     9        20      


True Negatives (TN): 82 - Correctly predicted not scaffold
False Positives (FP): 7 - Incorrectly predicted as scaffold
False Negatives (FN): 9 - Incorrectly predicted as not scaffold (missed)
True Positives (TP): 20 - Correctly predicted as scaffold
