In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def generate_complex_trends(n_timesteps, n_features, trend_type='mixed', randomness=0.1, seed=None):
    """
    Generate complex trends for features over time with added randomness.

    Column ``i`` follows a curve of order ``i + 1`` over ``time`` in [0, 1]:
    - 'sin':   sin(2 * pi * (i + 1) * time)
    - 'exp':   exp(time * (i + 1))
    - 'poly':  time ** (i + 1)
    - 'mixed': sin term + exp term - poly term
    Gaussian noise scaled by ``randomness`` is then added to every column.

    Parameters:
    - n_timesteps: int, number of time steps
    - n_features: int, number of features
    - trend_type: str, type of trend ('sin', 'exp', 'poly', or 'mixed')
    - randomness: float, level of randomness to add to the trends
    - seed: int or None, random seed for reproducibility. When given, NumPy's
      global generator is re-seeded with it. When None the global generator is
      left as it is, so any seeding done by the caller stays in effect.

    Returns:
    - trends: np.ndarray of shape (n_timesteps, n_features), generated trends

    Raises:
    - ValueError: if trend_type is not one of the supported names
    """
    # Validate before doing any work so a bad name is rejected even when
    # n_features is 0 and the per-feature loop below never runs.
    if trend_type not in ('sin', 'exp', 'poly', 'mixed'):
        raise ValueError("Unknown trend type. Choose from 'sin', 'exp', 'poly', or 'mixed'.")

    # Only touch the global generator when a seed was actually requested;
    # re-seeding from entropy here would silently undo the caller's own seed.
    if seed is not None:
        np.random.seed(seed)

    time = np.linspace(0, 1, n_timesteps)
    trends = np.zeros((n_timesteps, n_features))

    for i in range(n_features):
        order = i + 1  # frequency / growth rate / polynomial degree of this column
        if trend_type == 'sin':
            trends[:, i] = np.sin(2 * np.pi * order * time)
        elif trend_type == 'exp':
            trends[:, i] = np.exp(time * order)
        elif trend_type == 'poly':
            trends[:, i] = time ** order
        else:  # 'mixed'
            trends[:, i] = (np.sin(2 * np.pi * order * time) +
                            np.exp(time * order) -
                            time ** order)

        trends[:, i] += randomness * np.random.randn(n_timesteps)

    return trends

def generate_longitudinal_data(n_samples=1000, n_timesteps=10, n_features=5, n_redundant_features=3, class_sep=1.0, trend_type='mixed', randomness=0.1, seed=None):
    """
    Generate longitudinal classification data with complex trends, time dependencies, and added randomness.

    Each sample is assigned class 0 or 1 at random. Its relevant features at
    every time step are the sum of a per-sample base signal, a class offset
    (``class_label * class_sep``), the trend of its class at that time step,
    and Gaussian noise scaled by ``1 - class_sep``. Redundant features are
    pure standard-normal noise.

    Parameters:
    - n_samples: int, number of samples
    - n_timesteps: int, number of time steps
    - n_features: int, number of relevant features
    - n_redundant_features: int, number of redundant features
    - class_sep: float, parameter to adjust the separability of classes
    - trend_type: str, type of trend ('sin', 'exp', 'poly', or 'mixed')
    - randomness: float, level of randomness to add to the trends
    - seed: int or None, random seed for reproducibility. When given, NumPy's
      global generator is seeded with it; when None this function does not
      seed the generator itself.

    Returns:
    - data: pd.DataFrame in long format, one row per (sample, time step), with
      columns feature_0 .. feature_{n_features + n_redundant_features - 1},
      'time', 'class' and 'sample_id'
    """
    if seed is not None:
        np.random.seed(seed)

    # Every class gets its own trend seed. Handing the same seed to both calls
    # makes generate_complex_trends re-seed identically and return the exact
    # same array twice, leaving the class offset as the only class difference.
    class_trends = {}
    for label in (0, 1):
        # np.random.seed only accepts 0 .. 2**32 - 1, hence the wrap-around.
        trend_seed = None if seed is None else (int(seed) + label) % 2**32
        class_trends[label] = generate_complex_trends(n_timesteps, n_features, trend_type, randomness, trend_seed)

    n_columns = n_features + n_redundant_features
    columns = [f'feature_{j}' for j in range(n_columns)]

    # NOTE(review): the noise scale is (1 - class_sep), so noise vanishes at
    # class_sep == 1 and the factor turns negative above 1 -- confirm intended.
    noise_scale = 1 - class_sep

    frames = []
    for i in range(n_samples):
        class_label = np.random.choice([0, 1])
        base_signal = np.random.randn(n_features)
        class_offset = class_label * class_sep

        # One draw per sample: the first n_features columns become the signal
        # noise, the remaining columns are the redundant features.
        draws = np.random.randn(n_timesteps, n_columns)
        noise = draws[:, :n_features] * noise_scale
        time_features = base_signal + class_offset + class_trends[class_label] + noise
        combined_features = np.concatenate([time_features, draws[:, n_features:]], axis=1)

        sample_data = pd.DataFrame(combined_features, columns=columns)
        sample_data['time'] = range(n_timesteps)
        sample_data['class'] = class_label
        sample_data['sample_id'] = i
        frames.append(sample_data)

    if not frames:
        # pd.concat refuses an empty list; return an empty frame with the
        # same columns instead.
        return pd.DataFrame(columns=columns + ['time', 'class', 'sample_id'])

    return pd.concat(frames, ignore_index=True)

# Visualize the dataset
def visualize_samples(data, n_samples=5):
    """
    Plot the feature trajectories of a few randomly chosen samples.

    Parameters:
    - data: pd.DataFrame with a 'sample_id', 'time' and 'class' column; every
      column except the last three is plotted as a feature
    - n_samples: int, number of distinct samples to draw (without replacement)

    Returns:
    - None; one matplotlib figure is shown per selected sample
    """
    chosen_ids = np.random.choice(data['sample_id'].unique(), n_samples, replace=False)

    for chosen in chosen_ids:
        rows = data[data['sample_id'] == chosen]

        plt.figure(figsize=(12, 6))
        # The trailing three columns are skipped; everything before them is
        # drawn as one line per feature.
        for column in rows.columns[:-3]:
            plt.plot(rows['time'], rows[column], label=column)

        plt.title(f'Sample ID: {chosen}, Class: {rows["class"].iloc[0]}')
        plt.xlabel('Time')
        plt.ylabel('Feature Value')
        plt.legend()
        plt.show()

def prepare_data_for_classification(data, n_timesteps=10):
    """
    Collapse each sample's time series into per-feature mean and std columns.

    Parameters:
    - data: pd.DataFrame with 'sample_id', 'class' and feature columns (any
      column whose name contains 'feature')
    - n_timesteps: int, accepted for interface compatibility; not used

    Returns:
    - agg_data: pd.DataFrame with one row per (sample_id, class) and columns
      'sample_id', 'class', then '<feature>_mean' / '<feature>_std' pairs
    """
    feature_names = [name for name in data.columns if 'feature' in name]

    per_sample = data.groupby(['sample_id', 'class'])[feature_names]
    summary = per_sample.agg(['mean', 'std']).reset_index()

    # Flatten the two-level column index into single '<name>_<stat>' strings.
    flat_names = []
    for parts in summary.columns.values:
        flat_names.append('_'.join(parts).strip())
    summary.columns = flat_names

    # The grouping keys have an empty second level and end up with a trailing
    # underscore after joining; restore their plain names.
    return summary.rename(columns={'sample_id_': 'sample_id', 'class_': 'class'})

# Extract features from the first time step
def extract_first_timestep_features(data, n_timesteps=10):
    """
    Return each sample's feature values at time step 0.

    Parameters:
    - data: pd.DataFrame with 'sample_id', 'class', 'time' and feature columns
      (any column whose name contains 'feature')
    - n_timesteps: int, accepted for interface compatibility; not used

    Returns:
    - first_timestep_data: pd.DataFrame with one row per (sample_id, class)
      holding the feature values of the rows where 'time' equals 0
    """
    feature_names = [name for name in data.columns if 'feature' in name]

    initial_rows = data[data['time'] == 0]
    per_sample = initial_rows.groupby(['sample_id', 'class'])[feature_names].first()
    return per_sample.reset_index()

def split_data(agg_data, first_timestep_data):
    """
    Split both feature tables into train and test parts.

    Each table is split independently with test_size=0.2 and random_state=42,
    after separating the 'class' column as the target and dropping
    'sample_id'.

    Parameters:
    - agg_data: pd.DataFrame of aggregated features with 'sample_id' and 'class'
    - first_timestep_data: pd.DataFrame of first-time-step features with
      'sample_id' and 'class'

    Returns:
    - tuple of eight items: X_train, X_test, y_train, y_test for the
      aggregated table, followed by the same four for the first-time-step table
    """
    def _holdout(frame):
        # Separate predictors from the target, then make the 80/20 split.
        predictors = frame.drop(columns=['sample_id', 'class'])
        target = frame['class']
        return train_test_split(predictors, target, test_size=0.2, random_state=42)

    agg_parts = _holdout(agg_data)
    first_parts = _holdout(first_timestep_data)
    return (*agg_parts, *first_parts)

def evaluate_classifier(X_train, X_test, y_train, y_test):
    """
    Fit a random forest on the training split and score it on the test split.

    Parameters:
    - X_train, y_train: training features and labels
    - X_test, y_test: held-out features and labels

    Returns:
    - accuracy: accuracy_score of the predictions on X_test against y_test
    """
    # random_state is fixed so repeated fits on the same data agree.
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# Sweep class separability; for each setting compare a random forest trained on
# per-sample mean/std aggregates against one trained on the first time step only.
for class_sep in (0.1, 0.5, 1.0, 1.5):
    print(f"\nClass Separation: {class_sep}")

    # A new data seed is drawn from NumPy's global generator for every setting.
    seed = np.random.randint(0, 10000)
    data = generate_longitudinal_data(
        n_samples=100,
        n_timesteps=100,
        n_features=10,
        n_redundant_features=5,
        class_sep=class_sep,
        trend_type='mixed',
        randomness=0.1,
        seed=seed,
    )

    #visualize_samples(data)

    agg_data = prepare_data_for_classification(data)
    first_timestep_data = extract_first_timestep_features(data)
    (
        X_train_agg, X_test_agg, y_train_agg, y_test_agg,
        X_train_first, X_test_first, y_train_first, y_test_first,
    ) = split_data(agg_data, first_timestep_data)

    accuracy_agg = evaluate_classifier(X_train_agg, X_test_agg, y_train_agg, y_test_agg)
    accuracy_first = evaluate_classifier(X_train_first, X_test_first, y_train_first, y_test_first)

    print(f'Accuracy with aggregated features: {accuracy_agg:.2f}')
    print(f'Accuracy with first time step features: {accuracy_first:.2f}')

    # Aggregated columns come in (mean, std) pairs per feature, so the first
    # n_features features occupy the first n_features * 2 columns.
    for n_features in (1, 2, 3, 4, 5, 8, 10, 15):
        X_train_subset = X_train_agg.iloc[:, :n_features*2]
        X_test_subset = X_test_agg.iloc[:, :n_features*2]
        accuracy = evaluate_classifier(X_train_subset, X_test_subset, y_train_agg, y_test_agg)
        print(f'Accuracy with {n_features} aggregated features: {accuracy:.2f}')