In [None]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Read the lexical feature tables for the train and test splits
df = pd.read_csv('Features/train/lexicalFeatures_train.csv')
test_df = pd.read_csv('Features/test/lexicalFeatures_test.csv')


def _features_and_target(frame):
    """Split *frame* into (feature matrix, target vector).

    The target is the 'gs' column; every other column is used as a feature.
    """
    target = frame['gs'].values
    features = frame.drop(columns=['gs']).values
    return features, target


# Separate inputs from the 'gs' target for both splits
X, y = _features_and_target(df)
X_test, y_test = _features_and_target(test_df)

# Standardize: statistics are learned from the training features only and
# then applied unchanged to both the training and the test features
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameter candidates explored by the grid search
param_grid = {'C': [1, 10], 'kernel': ['rbf', 'linear']}

# Scoring callable with the (estimator, X, y) signature, based on Pearson's r
def pearson_scorer(estimator, X, y):
    """Return the Pearson correlation between ``y`` and the predictions for ``X``.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Samples to predict on.
        y: Reference values, one per sample in ``X``.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.
    """
    predictions = estimator.predict(X)
    correlation, _p_value = pearsonr(y, predictions)
    return correlation

# Hyper-parameter search for the SVR.
#
# The scaler lives inside a Pipeline so that, within every CV fold, it is
# fitted on the training part of the fold only. Searching over data that was
# standardized up front (X_scaled) lets validation-fold statistics leak into
# training and makes the cross-validated score optimistic.
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Pipeline parameters are addressed as '<step>__<param>'
pipeline_param_grid = {
    f'svr__{name}': values for name, values in param_grid.items()
}

grid_search = GridSearchCV(
    svr_pipeline,
    pipeline_param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring=pearson_scorer,
    n_jobs=4,  # 4 parallel workers; use -1 to run on every available core
)

# Fit on the raw features; scaling happens inside the pipeline, per fold
grid_search.fit(X, y)

# Best pipeline (scaler + SVR) as returned by the search
best_model = grid_search.best_estimator_
# The SVR step on its own, kept under its original name; it was trained on
# standardized features, so it must be fed standardized input
best_svr = best_model.named_steps['svr']

# Predict on the raw test features; the pipeline applies its own scaler
y_test_pred = best_model.predict(X_test)

# Pearson correlation between the reference scores and the test predictions
test_corr, _ = pearsonr(y_test, y_test_pred)

# Report parameters under their plain SVR names (without the 'svr__' prefix)
best_params = {
    name.split('__', 1)[-1]: value
    for name, value in grid_search.best_params_.items()
}

print("Best Parameters:", best_params)
print(f'Best Pearson Correlation on Validation Set: {grid_search.best_score_}')
print(f'Pearson Correlation on Testing Set: {test_corr}')

Best Parameters: {'C': 1, 'epsilon': 0.1, 'kernel': 'rbf'}
Best Pearson Correlation on Validation Set: 0.8144895687913903
Pearson Correlation on Testing Set: 0.6811583469128275
