In [15]:
import datetime as dt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, BatchNormalization, Add, Activation, LSTM, Flatten, Dense, Dropout, Input, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
!wget --no-check-certificate https://github.com/allseenn/dipai/raw/main/exams_ecology.csv

--2024-08-12 20:09:38--  https://github.com/allseenn/dipai/raw/main/exams_ecology.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/allseenn/dipai/main/exams_ecology.csv [following]
--2024-08-12 20:09:38--  https://raw.githubusercontent.com/allseenn/dipai/main/exams_ecology.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1102554 (1.1M) [text/plain]
Saving to: ‘exams_ecology.csv’


2024-08-12 20:09:39 (22.4 MB/s) - ‘exams_ecology.csv’ saved [1102554/1102554]



In [6]:
# Load data
data = pd.read_csv('./exams_ecology.csv')

# Convert the 'start' and 'end' columns to integer Unix epoch seconds.
# Parsing with utc=True and subtracting the UTC epoch makes the result
# independent of the kernel's local time zone (naive values are read as UTC),
# and the arithmetic is vectorized instead of a per-row Python lambda.
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
for time_column in ('start', 'end'):
    parsed = pd.to_datetime(data[time_column], utc=True)
    data[time_column] = (parsed - unix_epoch) // pd.Timedelta(seconds=1)

In [16]:
%%time
# Combined ecological index per row: 1 - prod(value / 1000), taken over the
# columns at positions 3..20 of the frame. The product is computed column-wise
# in one vectorized call instead of a Python lambda per row.
ecological_columns = data.columns[3:21]
data['combined_ecological_index'] = 1 - (data[ecological_columns] / 1000).prod(axis=1)

# Prepare the feature set including 'global_id' as an important feature:
# every column except the target 'stupid' is used. Index.difference returns the
# labels sorted (when they are sortable), so feature order is not the CSV order.
train_columns = data.columns.difference(['stupid'])
num_features = len(train_columns)

X = data[train_columns].values
y = data['stupid'].values

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit the scaler on the training rows only, then reuse its statistics on the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Append a trailing channel axis, (samples, features) -> (samples, features, 1),
# which is the input layout the Conv1D / LSTM stack is built with
X_train_scaled = X_train_scaled[..., np.newaxis]
X_test_scaled = X_test_scaled[..., np.newaxis]

# Transformer block
def transformer_block(x, num_heads, ff_dim, dropout_rate):
    """Apply one Transformer encoder block (self-attention + feed-forward) to ``x``.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization.

    Args:
        x: Input tensor; its last dimension is the width that the block's
            output keeps.
        num_heads: Number of attention heads.
        ff_dim: Used both as the per-head ``key_dim`` of the attention layer
            and as the hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied inside the attention layer and
            after each sub-layer.

    Returns:
        The layer-normalized output tensor of the block.
    """
    # The feed-forward projection must come back to this width for the residual add
    model_width = x.shape[-1]

    # Self-attention sub-layer: x is passed as both query and value
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim, dropout=dropout_rate)
    attended = self_attention(x, x)
    attended = Dropout(dropout_rate)(attended)
    attention_sum = Add()([x, attended])
    normed_attention = LayerNormalization(epsilon=1e-6)(attention_sum)

    # Feed-forward sub-layer: expand to ff_dim, project back, then residual + norm
    hidden = Dense(ff_dim, activation='relu')(normed_attention)
    projected = Dense(model_width)(hidden)
    projected = Dropout(dropout_rate)(projected)
    feed_forward_sum = Add()([normed_attention, projected])
    return LayerNormalization(epsilon=1e-6)(feed_forward_sum)

# Build a super advanced model
def build_super_advanced_model(num_features):
    """Build and compile the Conv1D -> Transformer -> LSTM -> Dense regressor.

    Args:
        num_features: Length of the feature axis; the model input has shape
            ``(num_features, 1)``.

    Returns:
        A Keras ``Model`` with a single linear output unit, compiled with the
        Adam optimizer and mean-squared-error loss.
    """
    inputs = Input(shape=(num_features, 1))

    # Convolutional front end: two same-padded Conv1D layers, each batch-normalized
    x = inputs
    for filters in (256, 512):
        x = Conv1D(filters, kernel_size=3, padding='same', activation='relu')(x)
        x = BatchNormalization()(x)

    # Stack of Transformer encoder blocks
    encoder_depth = 4
    for _ in range(encoder_depth):
        x = transformer_block(x, num_heads=4, ff_dim=256, dropout_rate=0.3)

    # Recurrent stack; every LSTM returns the full sequence
    for units in (512, 256, 128):
        x = LSTM(units, return_sequences=True)(x)

    # Collapse the sequence axis by averaging
    x = GlobalAveragePooling1D()(x)

    # Dense head: two L2-regularized layers with dropout, then a plain layer
    for width in (1024, 512):
        x = Dense(width, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
        x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)
    outputs = Dense(1)(x)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

# Ensemble learning: train multiple models and average predictions
def train_and_evaluate_ensemble(X_train, y_train, X_test, y_test, num_models=5,
                                epochs=3000, batch_size=16, patience=10):
    """Train ``num_models`` independent models and score their averaged prediction.

    Each model is built with ``build_super_advanced_model`` and fitted with
    early stopping on ``val_loss``; Keras holds out 10% of the training data
    for validation. The per-model test predictions are averaged, and MSE, MAE
    and R^2 of that average against ``y_test`` are printed.

    Args:
        X_train: Training inputs; axis 1 is the feature axis that sizes the model.
        y_train: Training targets.
        X_test: Test inputs.
        y_test: Test targets.
        num_models: Number of ensemble members to train (at least 1).
        epochs: Upper bound on training epochs per model.
        batch_size: Mini-batch size passed to ``fit``.
        patience: Early-stopping patience, in epochs without ``val_loss`` improvement.

    Returns:
        None. The metrics are printed.

    Raises:
        ValueError: If ``num_models`` is smaller than 1.
    """
    if num_models < 1:
        raise ValueError("num_models must be at least 1")

    # Size the network from the data actually passed in rather than from a
    # notebook-level global, so the function works for any input width.
    feature_count = np.shape(X_train)[1]

    predictions = []
    for _ in range(num_models):
        model = build_super_advanced_model(feature_count)
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

        model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.1,
            callbacks=[early_stopping],
            verbose=1
        )

        predictions.append(model.predict(X_test))

    # Average predictions from all models (axis 0 indexes the ensemble members)
    y_pred_ensemble = np.mean(predictions, axis=0)

    # Evaluate
    mse_tf = mean_squared_error(y_test, y_pred_ensemble)
    mae_tf = mean_absolute_error(y_test, y_pred_ensemble)
    r2_tf = r2_score(y_test, y_pred_ensemble)

    print("Ensemble Model Evaluation:")
    print(f"Mean Squared Error (MSE): {mse_tf:.4f}")
    print(f"Mean Absolute Error (MAE): {mae_tf:.4f}")
    print(f"Coefficient of Determination (R^2): {r2_tf:.4f}")

# Fit a three-member ensemble on the scaled, channel-expanded inputs and
# print its metrics on the held-out test rows
train_and_evaluate_ensemble(
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    num_models=3,
)


Epoch 1/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 49ms/step - loss: 6.7981 - val_loss: 1.9342
Epoch 2/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 46ms/step - loss: 3.2227 - val_loss: 1.5756
Epoch 3/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 46ms/step - loss: 2.6297 - val_loss: 1.4874
Epoch 4/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 46ms/step - loss: 3.2206 - val_loss: 1.3087
Epoch 5/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 46ms/step - loss: 2.8766 - val_loss: 1.2843
Epoch 6/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 45ms/step - loss: 2.5954 - val_loss: 1.2799
Epoch 7/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 46ms/step - loss: 2.5051 - val_loss: 1.2676
Epoch 8/3000
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 46ms/step - loss: 2.3170 - val_loss: 1.2955
Epoch 9/