In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [30]:
df = pd.read_csv("RPGF3Results.csv")
df = df[df['Category: Developer Ecosystem'] == 1]
df.shape

(305, 58)

In [31]:
df.drop(columns=['Project_ID', 'Result: Quorum Reached', 'Meta: Applicant Type','Meta: Website','Meta: Bio','Meta: Payout Address',	'Category: Collective Governance','Category: Developer Ecosystem','Category: End User Experience and Adoption','Category: OP Stack','Keywords: Base','Keywords: Farcaster','Keywords: Zora','Link: Contract on Base','Link: Contract on OP Mainnet','Link: Dune','Link: Flipside','Link: GitHub','Link: GitHub (duneanalytics)','Link: GitHub (ethereum)','Link: GitHub (ethereum-optimism)','Link: NPM Package','Link: Optimism Gov','Link: Substack','Link: Twitter','GTP: VC Funding Amount','GTP: Has Token', 'GTP: Has VC Funding','Meta: Project Image'], inplace=True)
df.to_csv("Cleaned_Retro.csv")

Machine Learning Support Vector Regression (SVR)

In [32]:
df_SVR = df[['Result: # Ballots', 'Result: Received OP','OSO: # GitHub Repos','OSO: Total Stars','OSO: Total Forks','OSO: Total Contributors','OSO: Contributors Last 6 Months','OSO: Avg Monthly Active Devs Last 6 Months', 'OSO: Total Onchain Users', 'OSO: Onchain Users Last 6 Months', 'OSO: Total Downloads', 'OSO: Downloads Last 6 Months']]	
df_SVR.fillna(0, inplace=True)
# df_SVR.reset_index(inplace=True)
df_SVR.to_csv("RetroML.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_SVR.fillna(0, inplace=True)


In [33]:
sample_data = {
    'Result: # Ballots': [110, 105, 98, 115, 99],
    'OSO: # GitHub Repos': [500.0, 120.0, 50.0, 250.0, 300.0],
    'OSO: Total Stars': [55000.0, 45000.0, 32000.0, 48000.0, 60000.0],
    'OSO: Total Forks': [20000.0, 18000.0, 15000.0, 22000.0, 19000.0],
    'OSO: Total Contributors': [25000.0, 20000.0, 18000.0, 23000.0, 26000.0],
    'OSO: Contributors Last 6 Months': [2000.0, 1500.0, 1200.0, 1800.0, 2200.0],
    'OSO: Avg Monthly Active Devs Last 6 Months': [70.0, 55.0, 40.0, 65.0, 80.0],
    'OSO: Total Onchain Users': [5000.0, 3000.0, 2000.0, 4000.0, 6000.0],
    'OSO: Onchain Users Last 6 Months': [2000.0, 1000.0, 800.0, 1500.0, 2500.0],
    'OSO: Total Downloads': [20000000.0, 15000000.0, 12000000.0, 18000000.0, 22000000.0],
    'OSO: Downloads Last 6 Months': [8000000.0, 6000000.0, 5000000.0, 7000000.0, 9000000.0]
}

new_data = pd.DataFrame(sample_data)

## Bad Model SVR

In [36]:
X = df_SVR.drop(columns=['Result: Received OP'])
y = df_SVR['Result: Received OP']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svr_model = SVR(kernel='sigmoid')

svr_model.fit(X_train_scaled, y_train)

new_data_scaled = scaler.transform(new_data)

new_predictions = svr_model.predict(new_data_scaled)

print("Predictions for 'Received OP' on new data:")
print(new_predictions)

y_pred_train = svr_model.predict(X_train_scaled)
y_pred_test = svr_model.predict(X_test_scaled)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Training Set Metrics:")
print("Mean Squared Error (MSE):", mse_train)
print("Mean Absolute Error (MAE):", mae_train)
print("R-squared (R2):", r2_train)

print("\nTest Set Metrics:")
print("Mean Squared Error (MSE):", mse_test)
print("Mean Absolute Error (MAE):", mae_test)
print("R-squared (R2):", r2_test)

Predictions for 'Received OP' on new data:
[44802.63576242 44804.18057438 44800.98873747 44805.04099013
 44802.914014  ]
Training Set Metrics:
Mean Squared Error (MSE): 6774633430.581138
Mean Absolute Error (MAE): 47169.547337766264
R-squared (R2): -0.04620125723357105

Test Set Metrics:
Mean Squared Error (MSE): 4198733630.7469563
Mean Absolute Error (MAE): 44272.591723964455
R-squared (R2): -0.043595602105611864


In [37]:
from sklearn.ensemble import RandomForestRegressor

## Good Model Random Forest

In [38]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

rf_model = RandomForestRegressor(n_estimators=200, random_state=42)

rf_model.fit(X_scaled, y)

new_data_scaled = scaler.transform(new_data)

new_predictions = rf_model.predict(new_data_scaled)

print("Predictions for 'Received OP' on new data using Random Forest Regression:")
print(new_predictions)
y_pred = rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Predictions for 'Received OP' on new data using Random Forest Regression:
[567058.9369  541621.5789  431042.9252  567067.645   539623.50655]
Mean Squared Error (MSE): 303736035.4864538
Root Mean Squared Error (RMSE): 17428.02442867389
Mean Absolute Error (MAE): 11903.241086456073
R-squared (R2): 0.9245063824688811


## lightgbm Model

In [17]:
import lightgbm as lgb

In [40]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lgb_model = lgb.LGBMRegressor()

lgb_model.fit(X_scaled, y)

new_data_scaled = scaler.transform(new_data)

new_predictions = lgb_model.predict(new_data_scaled)

print("Predictions for 'Received OP' on new data using LightGBM Regression:")
print(new_predictions)
y_pred = lgb_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 305, number of used features: 11
[LightGBM] [Info] Start training from score 61288.599747
Predictions for 'Received OP' on new data using LightGBM Regression:
[315779.45118311 315779.45118311 315779.45118311 315779.45118311
 315779.45118311]
Mean Squared Error (MSE): 613721173.6903449
Root Mean Squared Error (RMSE): 24773.396490799256
Mean Absolute Error (MAE): 16665.35980508561
R-squared (R2): 0.8474595499242477


## XGBoost Model

In [19]:
import xgboost as xgb

In [41]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

xgb_model = xgb.XGBRegressor()

xgb_model.fit(X_scaled, y)

new_data_scaled = scaler.transform(new_data)
new_predictions = xgb_model.predict(new_data_scaled)

print("Predictions for 'Received OP' on new data using XGBoost Regression:")
print(new_predictions)
y_pred = xgb_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)

mae = mean_absolute_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Predictions for 'Received OP' on new data using XGBoost Regression:
[662919.94 634403.06 484263.75 634976.06 662897.44]
Mean Squared Error (MSE): 346060429.1243313
Root Mean Squared Error (RMSE): 18602.699511746443
Mean Absolute Error (MAE): 11872.252508044632
R-squared (R2): 0.9139866508195985
