In [None]:
# %% [markdown]
# # Colab Setup

# %%
# Colab environment setup
!git clone https://github.com/neurodata/SPORF.git
%cd /content/SPORF/Python
!apt-get update
!apt-get install -y build-essential cmake python3-dev libomp-dev libeigen3-dev  # Ubuntu/Debian
!python setup.py clean --all
!pip install -e .



In [None]:
# %% [markdown]
# # Imports

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from rerf.RerF import fastPredict, fastPredictPost, fastRerF
from multiprocessing import cpu_count

In [None]:
# %% [markdown]
# # Part 1: Data Cleaning

# %%
# Read the raw data; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="./datasets/social_media_vs_productivity.csv")

# %%
# Column names
df.columns

# %%
# First five rows
df.head(n=5)

# %%
# Per-column dtype / non-null summary
df.info()

# %%
# Print the distinct values of every object- or bool-typed column.
# These are the columns that are converted to integer codes further down.
df_columns = df.columns

for column in df_columns:
    column_dtype = df[column].dtype
    needs_encoding = column_dtype == "object" or column_dtype == "bool"
    if needs_encoding:
        print(df[column].unique())

# %%
# Encode categorical columns as integers.
# - bool columns become 0/1 with True -> 1, regardless of which value appears first
#   (an order-of-appearance mapping could silently encode True as 0).
# - object/string columns get codes in order of first appearance; a missing value
#   is kept as a category of its own (use_na_sentinel=False) instead of relying on
#   NaN being usable as a dict key in Series.map.
for column in df.columns:
    series = df[column]
    if series.dtype == "bool":
        df[column] = series.astype(int)
    elif series.dtype == "object" or pd.api.types.is_string_dtype(series):
        codes, _ = pd.factorize(series, use_na_sentinel=False)
        df[column] = codes

# %%
# Preview after mapping
df.head()

# %%
# Impute every remaining missing value in the frame with 0.
# NOTE(review): this applies to all columns, including the score column that is
# later used as the model output — confirm that 0 is an acceptable stand-in there.
df = df.fillna(value=0)

# %%
# Preview the frame once the NaNs have been filled
df.head(n=5)

In [None]:
# %%
# Persist the cleaned frame. The row index is written too (pandas' default, spelled
# out here), which is why an unnamed leading column appears when the file is re-read.
df.to_csv(path_or_buf="/content/social_media_vs_productivity_cleaned.csv", index=True)

In [None]:
# %% [markdown]
# # Part 2: Regression Using SPORF

# %%
# Reload the cleaned data from disk
df = pd.read_csv(filepath_or_buffer='/content/social_media_vs_productivity_cleaned.csv')

# %%
# Columns kept out of the model inputs: ones judged unrelated to productivity,
# plus the output column itself.
input_invalid_columns = [
    'social_platform_preference',
    'perceived_productivity_score',
    'actual_productivity_score',  # the model output, see below
    'Unnamed: 0',  # unnamed index column picked up from the saved CSV
]

# %%
features = df.drop(labels=input_invalid_columns, axis="columns")
features.head(n=5)

# %%
# Model output (target) column
output = df.loc[:, 'actual_productivity_score']


In [None]:

# %%
# Materialise the inputs and the target as NumPy arrays
input_features, outputs = features.to_numpy(), output.to_numpy()

In [None]:
# %%
# Hold out 20% of all rows as the test set, then 25% of the remainder as the
# validation set (0.25 * 0.8 = 0.2 of the full data). Both splits use the same seed.
(X_train_full, X_test,
 y_train_full, y_test) = train_test_split(input_features, outputs, test_size=0.2, random_state=42)

(X_train, X_val,
 y_train, y_val) = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [None]:
# %%
# Report the shape of every split, one line each
print(
    f"X_train shape {X_train.shape}",
    f"X_test shape {X_test.shape}",
    f"y_train shape {y_train.shape}",
    f"y_test shape {y_test.shape}",
    "Validation:",
    f"X_val shape {X_val.shape}",
    f"y_val shape {y_val.shape}",
    sep="\n",
)


In [None]:
# %%
# Train the forest on the training split.
# NOTE(review): the metrics computed afterwards treat this as a regression problem;
# confirm in the SPORF documentation that fastRerF with forestType="rfBase" handles
# continuous targets rather than casting them to class labels.
forest = fastRerF(
    X=X_train,
    Y=y_train,
    forestType="rfBase",
    trees=500,
    maxDepth=20,
    minParent=5,
    numCores=cpu_count(),
    seed=42,  # fixed seed (same value as the splits' random_state) so runs are comparable
)

forest.printParameters()

# %%
# Predict on the training and test splits
train_pred = fastPredict(X_train, forest)
test_pred = fastPredict(X_test, forest)

# Error metrics.
# mean_squared_error returns the MSE; take its square root so the values stored and
# printed as "RMSE" really are root-mean-squared errors, in the units of the target.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
train_mae = mean_absolute_error(y_train, train_pred)
test_mae = mean_absolute_error(y_test, test_pred)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train MAE:", train_mae)
print("Test MAE:", test_mae)

# %%
# Summary statistics of the target array, useful for judging the error magnitudes
target_min, target_max = outputs.min(), outputs.max()
print("Min:", target_min)
print("Max:", target_max)
print("Range:", target_max - target_min)
print("Mean:", outputs.mean())
print("Std:", outputs.std())
