In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from scipy.stats import pearsonr

# Load the data
df = pd.read_csv('Features/train/lexicalFeatures_train.csv')
test_df = pd.read_csv('Features/test/lexicalFeatures_test.csv')

# Prepare the features and target
X = df.drop(columns=['gs']).values
y = df['gs'].values

X_test = test_df.drop(columns=['gs']).values
y_test = test_df['gs'].values

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Define cross-validation strategy
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize the best model and performance trackers
best_model = None
best_pearson = -np.inf

# Perform cross-validation with different SVR parameters
params = [
    {'kernel': 'rbf', 'C': 1.0, 'epsilon': 0.1},
    {'kernel': 'rbf', 'C': 10.0, 'epsilon': 0.01},
    {'kernel': 'linear', 'C': 1.0, 'epsilon': 0.1}
]

for param in params:
    # Create SVR model with current parameters
    svr = SVR(**param)
    
    # Perform cross-validation
    cv_scores = []
    for train_idx, val_idx in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Fit the model
        svr.fit(X_train, y_train)
        
        # Predict and calculate Pearson correlation
        y_pred = svr.predict(X_val)
        corr, _ = pearsonr(y_val, y_pred)
        cv_scores.append(corr)
    
    # Calculate mean Pearson correlation
    mean_corr = np.mean(cv_scores)
    
    # Update best model if current model performs better
    if mean_corr > best_pearson:
        best_pearson = mean_corr
        best_model = SVR(**param)

# Fit the best model on entire training data
best_model.fit(X_scaled, y)

# Predict on test data
y_test_pred = best_model.predict(X_test_scaled)

# Calculate final Pearson correlation
test_corr, _ = pearsonr(y_test, y_test_pred)

print(f'Best Pearson Correlation on Validation Set: {best_pearson}')
print(f'Pearson Correlation on Testing Set: {test_corr}')
print(f'Best Model Parameters: {best_model.get_params()}')

Best Pearson Correlation on Validation Set: 0.8144895687913903
Pearson Correlation on Testing Set: 0.6811583469128275
Best Model Parameters: {'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
