In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
from scipy.stats import pearsonr

# Load data
df = pd.read_csv('Features/train/lexicalFeatures_train.csv')
test_df = pd.read_csv('Features/test/lexicalFeatures_test.csv')

X = df.drop(columns=['gs']).values
y = df['gs'].values

X_test = test_df.drop(columns=['gs']).values
y_test = test_df['gs'].values

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Define model configurations
def create_model(input_dim, config):
    model = Sequential()
    model.add(Input((input_dim,)))
    
    # Add hidden layers based on configuration
    for units in config['hidden_layers']:
        model.add(Dense(units, activation='relu'))
        if config['dropout'] > 0:
            model.add(Dropout(config['dropout']))
    
    model.add(Dense(1))
    
    # Choose optimizer
    optimizer = Adam(learning_rate=config['learning_rate']) if config['optimizer'] == 'adam' else \
                RMSprop(learning_rate=config['learning_rate'])
    
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Model configurations to try
configurations = [
    {
        'hidden_layers': [128, 64],
        'dropout': 0.2,
        'learning_rate': 0.001,
        'optimizer': 'adam',
        'epochs': 50
    },
    {
        'hidden_layers': [256, 128, 64],
        'dropout': 0.3,
        'learning_rate': 0.0005,
        'optimizer': 'rmsprop',
        'epochs': 75
    },
    {
        'hidden_layers': [64],
        'dropout': 0,
        'learning_rate': 0.01,
        'optimizer': 'adam',
        'epochs': 30
    }
]

# n-fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
best_model = None
best_pearson = -np.inf

# Try different configurations
for config in configurations:
    config_pearson_scores = []
    
    for train_idx, val_idx in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Create the model for each fold
        model = create_model(X_train.shape[1], config)
        
        # Train the model
        model.fit(X_train, y_train, epochs=config['epochs'], batch_size=32, verbose=0)
        
        # Predict on the validation set
        y_pred = model.predict(X_val)
        
        # Calculate the Pearson correlation
        corr, _ = pearsonr(y_val, y_pred.flatten())
        config_pearson_scores.append(corr)
    
    # Average Pearson correlation for this configuration
    mean_corr = np.mean(config_pearson_scores)
    
    # Save the best model
    if mean_corr > best_pearson:
        best_pearson = mean_corr
        best_model = create_model(X_scaled.shape[1], config)
        best_model.fit(X_scaled, y, epochs=config['epochs'], batch_size=32, verbose=0)

# Test the best model on the separate testing data
y_test_pred = best_model.predict(X_test_scaled)
test_corr, _ = pearsonr(y_test, y_test_pred.flatten())

print(f'Best Pearson Correlation on Validation Set: {best_pearson}')
print(f'Pearson Correlation on Testing Set: {test_corr}')

# Clear the session to free memory
K.clear_session()

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3