In [1]:
import os
import pandas as pd
from config import DATA_DIR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Baseline model for first-time donors

In [2]:
# Load every contribution record from the sample extract under DATA_DIR.
df_all_donors = pd.read_csv(os.path.join(DATA_DIR, 'sample_contributions.csv'))

# Keep the rows whose `Repeat_Donor` flag is anything other than True
# (missing flags are retained, exactly as with `!= True`).
is_first_time = df_all_donors['Repeat_Donor'].ne(True)
df = df_all_donors.loc[is_first_time]

### Linear Regression Baseline

In [3]:
def simple_preprocessing(df, cap_amount=5000.0):
    """Build the modelling frame used by the baseline regressors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``amount``, ``scaled_amount``, ``date``,
        ``contributor.cfscore`` and ``candidate.cfscore``. It is not modified.
    cap_amount : float, default 5000.0
        Cap expressed on the raw ``amount`` scale. It is divided by the
        maximum of ``amount`` to obtain the cap applied to ``scaled_amount``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``scaled_amount``, ``date`` (parsed to datetime), the
        two cfscore columns (missing values replaced by the column mean), plus
        the derived columns ``amount_capped``, ``year`` and ``month``. Rows
        whose ``date`` could not be parsed are removed.
    """
    # Largest raw contribution; converts the raw cap onto the scaled axis.
    # NOTE(review): this presumes `scaled_amount` is `amount` divided by the
    # maximum of this same frame -- confirm against the upstream scaling step.
    scale_factor = df['amount'].astype(np.float64).max()

    df = df[['scaled_amount', 'date', 'contributor.cfscore', 'candidate.cfscore']].copy()
    # Unparseable dates become NaT and are dropped at the end.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and would silently leave the NaNs in place.
    for score_col in ('contributor.cfscore', 'candidate.cfscore'):
        df[score_col] = df[score_col].fillna(df[score_col].mean())

    # Vectorised upper cap (same result as a row-wise min(x, cap_threshold)).
    cap_threshold = cap_amount / scale_factor
    df['amount_capped'] = df['scaled_amount'].clip(upper=cap_threshold)

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    return df.dropna(subset=['date'])

In [4]:
# Replace the first-time-donor frame with its model-ready version.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own: the
# returned frame no longer has the `amount` column that simple_preprocessing
# reads. Re-run the loading cell first.
df = simple_preprocessing(df)

In [5]:
# Features: both cfscore columns plus the contribution year and month.
# Target: the capped, scaled contribution amount.
X = df[['contributor.cfscore', 'candidate.cfscore', 'year', 'month']]
y = df['amount_capped']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on
# every version. The value is in the units of `amount_capped` (scaled).
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
rmse

0.003068870609757294

### Tree-based Models Baseline

In [6]:
# Both tree ensembles reuse the train/test split from the linear baseline so
# the three RMSE values are directly comparable.
# RMSE is computed as sqrt(MSE) because mean_squared_error's `squared=False`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_rmse = float(np.sqrt(mean_squared_error(y_test, rf_pred)))

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_rmse = float(np.sqrt(mean_squared_error(y_test, gb_pred)))

rf_rmse, gb_rmse

(0.0031273299990147146, 0.0029621480220392254)