In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import gzip
import os

# Read the input table from the working directory.
data = pd.read_csv("out.csv")

# Predictor columns used by the model, and the regression target column.
selected_features = [
    'age',
    'income',
    'number_transactions',
    'loan_accounts',
    'months_customer',
]
X = data[selected_features]
y = data['rfm_score']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both partitions so no test information reaches the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Forest hyperparameters, gathered in one place. The tree count and depth are
# capped -- presumably to keep the serialized model small (see the size check
# at the end of this cell).
rf_params = {
    'n_estimators': 30,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 3,
    'random_state': 42,
}
rf = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)

# Score the fitted forest on the held-out rows and report each metric in turn.
y_pred = rf.predict(X_test_scaled)
for metric_label, metric_fn in (
    ("R² Score:", r2_score),
    ("Mean Squared Error:", mean_squared_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

# Persist the model and the scaler as gzip-compressed pickles. Both objects
# are written out because predictions on new rows need the same scaling.
artifacts = {
    'random_forest_model.pkl.gz': rf,
    'scaler.pkl.gz': scaler,
}
for artifact_path, artifact in artifacts.items():
    with gzip.open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# On-disk size of each artifact in MB (dict order: model first, then scaler).
bytes_per_mb = 1024 * 1024
rf_size, scaler_size = (
    os.path.getsize(artifact_path) / bytes_per_mb for artifact_path in artifacts
)
print(f"Compressed Random Forest Model Size: {rf_size:.2f} MB")
print(f"Compressed Scaler Size: {scaler_size:.2f} MB")

# Report whether both artifacts fit under the 25 MB ceiling used for deployment.
if rf_size <= 25 and scaler_size <= 25:
    print("Files are within the size limit for GitHub and Render deployment.")
else:
    print("Warning: One or more files exceed 25 MB. Consider further optimizations.")


R² Score: 0.9773821738560391
Mean Squared Error: 1.1755806231340957
Compressed Random Forest Model Size: 0.23 MB
Compressed Scaler Size: 0.00 MB
Files are within the size limit for GitHub and Render deployment.
