In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from scipy.stats import pearsonr

# Define parameter grid
param_grid = {
    'C': [1, 10],
    'kernel': ['rbf', 'linear']
}

# Custom scoring function for Pearson correlation
def pearson_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return pearsonr(y, y_pred)[0]

best_correlations = {}

def evaluate_model(X_train, y_train, X_test, y_test):
    # Create GridSearchCV
    grid_search = GridSearchCV(
        SVR(), 
        param_grid, 
        cv=KFold(n_splits=5, shuffle=True, random_state=1),
        scoring=pearson_scorer,
        n_jobs=4  # Use all available cores
    )

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the grid search
    grid_search.fit(X_scaled, y_train)

    # Best model
    best_svr = grid_search.best_estimator_

    # Predict on test set
    y_test_pred = best_svr.predict(X_test_scaled)

    # Calculate Pearson correlation
    test_corr, _ = pearsonr(y_test, y_test_pred)

    print("Best Parameters:", grid_search.best_params_)

    print(f'Best Pearson Correlation on Validation Set: {grid_search.best_score_}')
    print(f'Pearson Correlation on Testing Set: {test_corr}')

    return test_corr

In [19]:
# Load Lexical Features
train_df_lexical = pd.read_csv('Features/train/lexicalFeatures_train.csv')
test_df_lexical = pd.read_csv('Features/test/lexicalFeatures_test.csv')

# Prepare the features and target
X_train_lexical = train_df_lexical.drop(columns=['gs']).values
y_train = train_df_lexical['gs'].values

X_test_lexical = test_df_lexical.drop(columns=['gs']).values
y_test = test_df_lexical['gs'].values

In [20]:
best_correlations['Lexical'] = evaluate_model(X_train_lexical, y_train, X_test_lexical, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8148733748308142
Pearson Correlation on Testing Set: 0.6811583469128282


In [21]:
# Load Syntactic Features
train_df_syntactic = pd.read_csv('Features/train/syntacticFeatures_train.csv')
test_df_syntactic = pd.read_csv('Features/test/syntacticFeatures_test.csv')

X_train_syntactic = train_df_syntactic.drop(columns=['gs']).values
X_test_syntactic = test_df_syntactic.drop(columns=['gs']).values

In [22]:
best_correlations['Syntactic'] = evaluate_model(X_train_syntactic, y_train, X_test_syntactic, y_test)

Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.7685498799667123
Pearson Correlation on Testing Set: 0.6378577291995338


In [23]:
# Load String Features
train_df_strings = pd.read_csv('Features/train/stringFeatures_train.csv')
test_df_strings = pd.read_csv('Features/test/stringFeatures_test.csv')

X_train_strings = train_df_strings.drop(columns=['gs']).values
X_test_strings = test_df_strings.drop(columns=['gs']).values

In [24]:
best_correlations['Strings'] = evaluate_model(X_train_strings, y_train, X_test_strings, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8022661745972973
Pearson Correlation on Testing Set: 0.6760961338294003


In [25]:
# Join all features
train_df_unrestricted = pd.concat([train_df_lexical, train_df_syntactic, train_df_strings], axis=1)
test_df_unrestricted = pd.concat([test_df_lexical, test_df_syntactic, test_df_strings], axis=1)

X_train_unrestricted = train_df_unrestricted.drop(columns=['gs']).values
X_test_unrestricted = test_df_unrestricted.drop(columns=['gs']).values

In [26]:
best_correlations['Unrestricted'] = evaluate_model(X_train_unrestricted, y_train, X_test_unrestricted, y_test)

Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8480726539849996
Pearson Correlation on Testing Set: 0.7489987140897867


In [27]:
# Feature Selection

# Calculate correlations of all training features with the Gold Standard
feature_correlations = {}
for column in train_df_unrestricted.columns:
    if column == 'gs':
        continue
    corr, _ = pearsonr(train_df_unrestricted[column], train_df_lexical['gs'])
    feature_correlations[column] = corr

min_correlations = [0.3, 0.4, 0.5]

In [28]:
best_correlation = 0.0

for min_correlation in min_correlations:
    selected_features = [k for k, v in feature_correlations.items() if v > min_correlation]

    reduced_train_df = train_df_unrestricted[selected_features]
    reduced_test_df = test_df_unrestricted[selected_features]

    X_train_reduced = reduced_train_df.values
    X_test_reduced = reduced_test_df.values

    print(f'Evaluating model with {len(selected_features)} features (corr > {min_correlation})')
    correlation = evaluate_model(X_train_reduced, y_train, X_test_reduced, y_test)
    print()

    if correlation > best_correlation:
        best_correlation = correlation

best_correlations['FeatureSelection'] = best_correlation

Evaluating model with 46 features (corr > 0.3)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8476897479101991
Pearson Correlation on Testing Set: 0.7395700461339365

Evaluating model with 38 features (corr > 0.4)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8457348717800205
Pearson Correlation on Testing Set: 0.7428518518025874

Evaluating model with 28 features (corr > 0.5)
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8383692280528361
Pearson Correlation on Testing Set: 0.7364306075079398



In [None]:
# Transform dictionary into a DataFrame to display results
best_correlations_df = pd.DataFrame(list(best_correlations.items()), columns=["Features", "Correlation"])

best_correlations_df

Unnamed: 0,Features,Correlation
0,Lexical,0.681158
1,Syntactic,0.637858
2,Strings,0.676096
3,Unrestricted,0.748999
4,FeatureSelection,0.742852
