# Multi-Layer Perceptron (MLP)

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K
from scipy.stats import pearsonr

# Define model configurations
def create_model(input_dim, config):
    model = Sequential()
    model.add(Input((input_dim,)))
    
    # Add hidden layers based on configuration
    for units in config['hidden_layers']:
        model.add(Dense(units, activation='relu'))
    
    model.add(Dense(1))
    
    # Choose optimizer
    optimizer = SGD(learning_rate=0.01)
    
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Model configurations to try
configurations = [
    {
        'hidden_layers': [64]
    },
    {
        'hidden_layers': [128, 64]
    },
    {
        'hidden_layers': [256, 128, 64]
    }
]

best_correlations = {}

def evaluate_model(X_train, y_train, X_test, y_test):
    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # n-fold cross-validation
    n_folds = 5
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=1)
    best_model = None
    best_pearson = -np.inf

    # Try different configurations
    for config in configurations:
        config_pearson_scores = []
        
        for train_idx, val_idx in kf.split(X_scaled):
            X_train_fold, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_train_fold, y_val = y_train[train_idx], y_train[val_idx]
            
            # Create the model for each fold
            model = create_model(X_train_fold.shape[1], config)
            
            # Train the model
            model.fit(X_train_fold, y_train_fold, epochs=75, batch_size=32, verbose=0)
            
            # Predict on the validation set
            y_pred = model.predict(X_val)
            
            # Calculate the Pearson correlation
            corr, _ = pearsonr(y_val, y_pred.flatten())
            config_pearson_scores.append(corr)
        
        # Average Pearson correlation for this configuration
        mean_corr = np.mean(config_pearson_scores)
        
        # Save the best model
        if mean_corr > best_pearson:
            best_pearson = mean_corr
            best_model = model

    # Test the best model on the separate testing data
    y_test_pred = best_model.predict(X_test_scaled)
    test_corr, _ = pearsonr(y_test, y_test_pred.flatten())

    print(f'Best Model Summary:')
    best_model.summary()

    print(f'Best Pearson Correlation on Validation Set: {best_pearson}')
    print(f'Pearson Correlation on Testing Set: {test_corr}')

    return test_corr

In [7]:
# Load Lexical Features
train_df_lexical = pd.read_csv('../Features/train/lexicalFeatures_train.csv')
test_df_lexical = pd.read_csv('../Features/test/lexicalFeatures_test.csv')

# Prepare the features and target
X_train_lexical = train_df_lexical.drop(columns=['gs']).values
y_train = train_df_lexical['gs'].values

X_test_lexical = test_df_lexical.drop(columns=['gs']).values
y_test = test_df_lexical['gs'].values

In [28]:
best_correlations['Lexical'] = evaluate_model(X_train_lexical, y_train, X_test_lexical, y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━

Best Pearson Correlation on Validation Set: 0.8081974690526733
Pearson Correlation on Testing Set: 0.6068271339011042


In [None]:
# Load Syntactic Features
train_df_syntactic = pd.read_csv('../Features/train/syntacticFeatures_train.csv')
test_df_syntactic = pd.read_csv('../Features/test/syntacticFeatures_test.csv')

X_train_syntactic = train_df_syntactic.drop(columns=['gs']).values
X_test_syntactic = test_df_syntactic.drop(columns=['gs']).values

In [30]:
best_correlations['Syntactic'] = evaluate_model(X_train_syntactic, y_train, X_test_syntactic, y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━

Best Pearson Correlation on Validation Set: 0.766487042463684
Pearson Correlation on Testing Set: 0.6664096255458494


In [None]:
# Load String Features
train_df_strings = pd.read_csv('../Features/train/stringFeatures_train.csv')
test_df_strings = pd.read_csv('../Features/test/stringFeatures_test.csv')

X_train_strings = train_df_strings.drop(columns=['gs']).values
X_test_strings = test_df_strings.drop(columns=['gs']).values

In [32]:
best_correlations['Strings'] = evaluate_model(X_train_strings, y_train, X_test_strings, y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━

Best Pearson Correlation on Validation Set: 0.8039910591959079
Pearson Correlation on Testing Set: 0.6743072052517306


In [33]:
# Join all features
train_df_unrestricted = pd.concat([train_df_lexical, train_df_syntactic, train_df_strings], axis=1)
test_df_unrestricted = pd.concat([test_df_lexical, test_df_syntactic, test_df_strings], axis=1)

train_df_unrestricted = train_df_unrestricted.drop(columns=['gs'])
test_df_unrestricted = test_df_unrestricted.drop(columns=['gs'])

X_train_unrestricted = train_df_unrestricted.values
X_test_unrestricted = test_df_unrestricted.values

In [34]:
best_correlations['Unrestricted'] = evaluate_model(X_train_unrestricted, y_train, X_test_unrestricted, y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━

Best Pearson Correlation on Validation Set: 0.8188385719517612
Pearson Correlation on Testing Set: 0.6524603314293124


In [35]:
# Feature Selection

# Calculate correlations of all training features with the Gold Standard
feature_correlations = {}
for column in train_df_unrestricted.columns:
    if column == 'gs':
        continue
    corr, _ = pearsonr(train_df_unrestricted[column], train_df_lexical['gs'])
    feature_correlations[column] = corr

min_correlations = [0.3, 0.4, 0.5]

In [36]:
best_correlation = 0.0

for min_correlation in min_correlations:
    selected_features = [k for k, v in feature_correlations.items() if v > min_correlation]

    reduced_train_df = train_df_unrestricted[selected_features]
    reduced_test_df = test_df_unrestricted[selected_features]

    X_train_reduced = reduced_train_df.values
    X_test_reduced = reduced_test_df.values

    print(f'Evaluating model with {len(selected_features)} features (corr > {min_correlation})')
    correlation = evaluate_model(X_train_reduced, y_train, X_test_reduced, y_test)
    print()

    if correlation > best_correlation:
        best_correlation = correlation

best_correlations['FeatureSelection'] = best_correlation

Evaluating model with 46 features (corr > 0.3)
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

Best Pearson Correlation on Validation Set: 0.8352232358070365
Pearson Correlation on Testing Set: 0.7198778258511708

Evaluating model with 37 features (corr > 0.4)
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━

Best Pearson Correlation on Validation Set: 0.8418391748725025
Pearson Correlation on Testing Set: 0.7436574395744853

Evaluating model with 28 features (corr > 0.5)
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m14/14[0m [32m━

Best Pearson Correlation on Validation Set: 0.8343627117760064
Pearson Correlation on Testing Set: 0.7303377795369059



In [37]:
# Transform dictionary into a DataFrame
best_correlations_df = pd.DataFrame(list(best_correlations.items()), columns=["Features", "Correlation"])

best_correlations_df

Unnamed: 0,Features,Correlation
0,Lexical,0.606827
1,Syntactic,0.66641
2,Strings,0.674307
3,Unrestricted,0.65246
4,FeatureSelection,0.743657
