In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# CSV inputs that load_and_merge_data() reads and concatenates.
# NOTE(review): the file names suggest one CoinGecko snapshot per calendar day
# and the /content/drive prefix suggests a mounted Google Drive in Colab --
# confirm before running elsewhere.
FILE_PATHS = [
    '/content/drive/MyDrive/DS With Generative AI/Machine Learning/coin_gecko_2022-03-16.csv',
    '/content/drive/MyDrive/DS With Generative AI/Machine Learning/coin_gecko_2022-03-17.csv'
]
# Name of the engineered column used as the regression target.
TARGET_COLUMN = 'Liquidity_Index'
# Output path for the trained model written with joblib.dump().
MODEL_FILENAME = 'liquidity_predictor.pkl'

def load_and_merge_data(file_paths):
    """Read every CSV in *file_paths* and return them as one sorted frame.

    Paths that do not exist are reported on stdout and skipped; any other
    read error propagates to the caller.

    Args:
        file_paths: Iterable of CSV paths. Each file must provide the
            ``coin`` and ``date`` columns used for sorting.

    Returns:
        A DataFrame holding the rows of all readable files, ordered by
        ``coin`` and then ``date``. Index labels are the positions the rows
        had right after concatenation (they are not renumbered by the sort).

    Raises:
        ValueError: If none of the files could be loaded.
    """
    print("--- 1. Data Collection ---")

    loaded = []
    for path in file_paths:
        try:
            frame = pd.read_csv(path)
        except FileNotFoundError:
            print(f"ERROR: File not found: {path}")
            continue
        loaded.append(frame)
        print(f"Loaded: {path}")

    if not loaded:
        raise ValueError("No data files were loaded successfully.")

    merged = pd.concat(loaded, ignore_index=True).sort_values(by=['coin', 'date'])
    print(f"Total merged data points: {len(merged)}")
    return merged

def preprocess_data(df):
    """Drop identifier columns, impute missing numerics, keep numeric columns.

    The input frame is left untouched: every step builds a new DataFrame
    instead of modifying *df* in place, so callers do not need to pass a
    defensive copy.

    Args:
        df: Raw DataFrame. The ``coin``, ``symbol`` and ``date`` columns are
            removed when present; absent ones are ignored.

    Returns:
        A new DataFrame containing only the numeric columns of *df*, with
        missing values replaced by the median of their column.
    """
    print("\n--- 2. Data Preprocessing ---")

    # drop() without inplace returns a new frame, so the caller's data is safe.
    cleaned = df.drop(columns=['coin', 'symbol', 'date'], errors='ignore')

    # The count covers every remaining column, including non-numeric ones
    # that are discarded further down.
    print(f"Missing values before imputation: {cleaned.isnull().sum().sum()}")
    cleaned = cleaned.fillna(cleaned.median(numeric_only=True))
    print("Missing values imputed using median.")

    # Ensure all columns are numeric after dropping non-numeric ones
    return cleaned.select_dtypes(include=np.number)

def feature_engineering(df):
    """Add the target column and a volume/price feature to *df* in place.

    Reads the ``24h_volume``, ``mkt_cap``, ``24h`` and ``price`` columns.
    A temporary ``Liquidity_Ratio`` column (volume over market cap) is
    created, used to derive the target, and removed again before returning.

    Args:
        df: DataFrame to extend. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``TARGET_COLUMN`` and
        ``Vol_Price_Ratio`` columns.
    """
    print("\n--- 3. Feature Engineering ---")

    # Small constants in the denominators guard against division by zero.
    df['Liquidity_Ratio'] = df['24h_volume'].div(df['mkt_cap'] + 1e-6)
    df[TARGET_COLUMN] = df['Liquidity_Ratio'].div(df['24h'].abs() + 0.001)
    df['Vol_Price_Ratio'] = df['24h_volume'].div(df['price'] + 1e-6)

    # The intermediate ratio only exists to build the target.
    del df['Liquidity_Ratio']

    print(f"Target '{TARGET_COLUMN}' and new features created.")
    return df

def prepare_for_training(df):
    """Split *df* into train/test sets and min-max scale the features.

    The scaler is fitted on the training rows only and then applied to the
    test rows. Fitting it on the full dataset would let the test set's
    minimum and maximum influence the training features and the persisted
    scaler (data leakage).

    Args:
        df: DataFrame containing the ``TARGET_COLUMN`` plus feature columns.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, scaler)``. The feature
        frames are scaled DataFrames that keep the original column names and
        row labels, so they stay index-aligned with ``y_train``/``y_test``.
        ``scaler`` is the ``MinMaxScaler`` fitted on the training features.
    """
    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN]

    # Positional split: the first 80% of rows train, the last 20% test.
    # random_state only controls shuffling, so it is omitted with shuffle=False.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_raw),
        columns=X.columns,
        index=X_train_raw.index,
    )
    # Test features reuse the training min/max and may fall outside [0, 1].
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        columns=X.columns,
        index=X_test_raw.index,
    )

    return X_train, X_test, y_train, y_test, scaler

def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a random forest regressor and print its hold-out metrics.

    Args:
        X_train: Training features.
        X_test: Hold-out features used for evaluation.
        y_train: Training target values.
        y_test: Hold-out target values.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    print("\n--- 4. Model Training & Evaluation ---")

    forest_params = dict(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1,
    )
    model = RandomForestRegressor(**forest_params)
    model.fit(X_train, y_train)
    print("RandomForestRegressor Model Trained.")

    # Score the model on the hold-out rows.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("\nEvaluation Metrics:")
    print(f"  Root Mean Square Error (RMSE): {rmse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  R² Score: {r2:.4f}")

    return model

# Script entry point: runs the pipeline end to end (load -> preprocess ->
# feature engineering -> split/scale -> train/evaluate -> persist artifacts).
if __name__ == "__main__":
    try:

        # Step 1: read and merge the CSV files listed in FILE_PATHS.
        df_raw = load_and_merge_data(FILE_PATHS)

        # Step 2: pass a copy so df_raw keeps the unmodified merged data.
        df_processed = preprocess_data(df_raw.copy())

        # Step 3: feature_engineering adds columns in place, so it also
        # receives a copy to keep df_processed as it was.
        df_featured = feature_engineering(df_processed.copy())

        # Step 4: split into train/test sets and scale the features.
        X_train, X_test, y_train, y_test, scaler = prepare_for_training(df_featured)
        print(f"Data Split: Train={len(X_train)}, Test={len(X_test)}")

        # Step 5: fit the regressor and print its evaluation metrics.
        model = train_and_evaluate_model(X_train, X_test, y_train, y_test)

        # Persist both artifacts: the scaler is needed to transform new
        # inputs the same way before calling the saved model.
        joblib.dump(model, MODEL_FILENAME)
        joblib.dump(scaler, 'data_scaler.pkl')
        print(f"\nModel and scaler successfully saved as '{MODEL_FILENAME}' and 'data_scaler.pkl'.")

    # NOTE(review): this handler prints only the exception message; the
    # traceback is discarded and the exception is not re-raised.
    except Exception as e:
        print(f"\nAn error occurred during the pipeline execution: {e}")

--- 1. Data Collection ---
Loaded: /content/drive/MyDrive/DS With Generative AI/Machine Learning/coin_gecko_2022-03-16.csv
Loaded: /content/drive/MyDrive/DS With Generative AI/Machine Learning/coin_gecko_2022-03-17.csv
Total merged data points: 1000

--- 2. Data Preprocessing ---
Missing values before imputation: 29
Missing values imputed using median.

--- 3. Feature Engineering ---
Target 'Liquidity_Index' and new features created.
Data Split: Train=800, Test=200

--- 4. Model Training & Evaluation ---
RandomForestRegressor Model Trained.

Evaluation Metrics:
  Root Mean Square Error (RMSE): 40.0011
  Mean Absolute Error (MAE): 10.7598
  R² Score: 0.3146

Model and scaler successfully saved as 'liquidity_predictor.pkl' and 'data_scaler.pkl'.
