In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from scipy.stats import pearsonr

# Define parameter grid
param_grid = {
    'C': [1, 10],
    'kernel': ['rbf', 'linear']
}

# Custom scoring function for Pearson correlation
def pearson_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

def evaluate_model(X_train, y_train, X_test, y_test):
    # Create GridSearchCV
    grid_search = GridSearchCV(
        SVR(), 
        param_grid, 
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring=pearson_scorer,
        n_jobs=4  # Use all available cores
    )

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the grid search
    grid_search.fit(X_scaled, y_train)

    # Best model
    best_svr = grid_search.best_estimator_

    # Predict on test set
    y_test_pred = best_svr.predict(X_test_scaled)

    # Calculate Pearson correlation
    test_corr, _ = pearsonr(y_test, y_test_pred)

    print("Best Parameters:", grid_search.best_params_)

    print(f'Best Pearson Correlation on Validation Set: {grid_search.best_score_}')
    print(f'Pearson Correlation on Testing Set: {test_corr}')

In [3]:
# Load Lexical Features
train_df_lexical = pd.read_csv('Features/train/lexicalFeatures_train.csv')
test_df_lexical = pd.read_csv('Features/test/lexicalFeatures_test.csv')

# Prepare the features and target
X_train_lexical = train_df_lexical.drop(columns=['gs']).values
y_train = train_df_lexical['gs'].values

X_test_lexical = test_df_lexical.drop(columns=['gs']).values
y_test = test_df_lexical['gs'].values

In [36]:
evaluate_model(X_train_lexical, y_train, X_test_lexical, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8144895687913902
Pearson Correlation on Testing Set: 0.6811583469128282


In [None]:
# Load Syntactic Features
train_df_syntactic = pd.read_csv('Features/train/syntacticFeatures_train.csv')
test_df_syntactic = pd.read_csv('Features/test/syntacticFeatures_test.csv')

X_train_syntactic = train_df_syntactic.drop(columns=['gs']).values
X_test_syntactic = test_df_syntactic.drop(columns=['gs']).values

In [None]:
evaluate_model(X_train_syntactic, y_train, X_test_syntactic, y_test)

In [5]:
# Load String Features
train_df_strings = pd.read_csv('Features/train/stringFeatures_train.csv')
test_df_strings = pd.read_csv('Features/test/stringFeatures_test.csv')

X_train_strings = train_df_strings.drop(columns=['gs']).values
X_test_strings = test_df_strings.drop(columns=['gs']).values

In [37]:
evaluate_model(X_train_strings, y_train, X_test_strings, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8003102544300591
Pearson Correlation on Testing Set: 0.6760961338294003


In [7]:
# Join all features
# train_df_unrestricted = pd.concat([train_df_lexical, train_df_syntactic, train_df_strings], axis=1)
# test_df_unrestricted = pd.concat([test_df_lexical, test_df_syntactic, test_df_strings], axis=1)
train_df_unrestricted = pd.concat([train_df_lexical, train_df_strings], axis=1)
test_df_unrestricted = pd.concat([test_df_lexical, test_df_strings], axis=1)

X_train_unrestricted = train_df_unrestricted.drop(columns=['gs']).values
X_test_unrestricted = test_df_unrestricted.drop(columns=['gs']).values

In [23]:
evaluate_model(X_train_unrestricted, y_train, X_test_unrestricted, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.839411848070973
Pearson Correlation on Testing Set: 0.7190683003814442


In [32]:
# Feature Selection

# Calculate correlations of all training features with the Gold Standard
correlations = {}
for column in train_df_unrestricted.columns:
    if column == 'gs':
        continue
    corr, _ = pearsonr(train_df_unrestricted[column], train_df_lexical['gs'])
    correlations[column] = corr

min_correlations = [0.3, 0.4, 0.5]

In [35]:
for min_correlation in min_correlations:
    selected_features = [k for k, v in correlations.items() if v > min_correlation]

    reduced_train_df = train_df_unrestricted[selected_features]
    reduced_test_df = test_df_unrestricted[selected_features]

    X_train_reduced = reduced_train_df.values
    X_test_reduced = reduced_test_df.values

    print(f'Evaluating model with {len(selected_features)} features (corr > {min_correlation})')
    evaluate_model(X_train_reduced, y_train, X_test_reduced, y_test)
    print()

Evaluating model with 42 features (corr > 0.3)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8451851693817535
Pearson Correlation on Testing Set: 0.735804080800094

Evaluating model with 36 features (corr > 0.4)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8431752611461757
Pearson Correlation on Testing Set: 0.7402074709511671

Evaluating model with 27 features (corr > 0.5)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8351593738807448
Pearson Correlation on Testing Set: 0.7345223266279477

