# Random Forest Regressor (RFR)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr

# Result registries shared by the cells below, keyed by feature-set name:
# one holds the test-set Pearson correlation, the other the importances array.
feature_importances = dict()
best_correlations = dict()

def evaluate_model(X_train, y_train, X_test, y_test):
    """Train a random forest on standardized features and score it on the test split.

    A ``StandardScaler`` is fitted on ``X_train`` and applied to both splits,
    a ``RandomForestRegressor(random_state=1)`` is trained on the scaled
    training data, and the Pearson correlation between ``y_test`` and the
    predictions for ``X_test`` is computed and printed.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Target values for the training and test splits.

    Returns
    -------
    tuple
        ``(test_corr, feature_importance)``: the Pearson correlation on the
        test split and the fitted forest's ``feature_importances_``.
    """
    # Scaling statistics are learned from the training split only and then
    # reused unchanged for the test split.
    scaler = StandardScaler().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    # Fixed seed so repeated runs build the same forest.
    forest = RandomForestRegressor(random_state=1).fit(train_scaled, y_train)
    predictions = forest.predict(test_scaled)

    # Only the correlation coefficient is reported; the p-value is discarded.
    test_corr, _p_value = pearsonr(y_test, predictions)
    print(f'Pearson Correlation on Testing Set: {test_corr}')

    return test_corr, forest.feature_importances_


In [3]:
# Load Lexical Features
# Read the lexical feature tables for both splits.
train_df_lexical = pd.read_csv('../Features/train/lexicalFeatures_train.csv')
test_df_lexical = pd.read_csv('../Features/test/lexicalFeatures_test.csv')

# The 'gs' column is the regression target for each split.
y_train = train_df_lexical['gs'].values
y_test = test_df_lexical['gs'].values

# Every remaining column is a feature.
X_train_lexical = train_df_lexical.drop('gs', axis=1).values
X_test_lexical = test_df_lexical.drop('gs', axis=1).values

In [4]:
# Evaluate on lexical features only, then record the score and the importances.
lexical_results = evaluate_model(X_train_lexical, y_train, X_test_lexical, y_test)
best_correlations['Lexical'], feature_importances['Lexical'] = lexical_results

Pearson Correlation on Testing Set: 0.7284623458322289


In [4]:
# Load Syntactic Features
train_df_syntactic = pd.read_csv('../Features/train/syntacticFeatures_train.csv')
test_df_syntactic = pd.read_csv('../Features/test/syntacticFeatures_test.csv')

# The 'gs' column of these files is dropped below and the syntactic model is
# evaluated against y_train / y_test taken from the lexical files, so the rows
# must carry the same targets in the same order. Fail loudly instead of
# silently scoring against misaligned targets.
assert len(train_df_syntactic) == len(y_train) and np.allclose(train_df_syntactic['gs'].values, y_train), \
    "Syntactic train 'gs' does not match the lexical train targets"
assert len(test_df_syntactic) == len(y_test) and np.allclose(test_df_syntactic['gs'].values, y_test), \
    "Syntactic test 'gs' does not match the lexical test targets"

# Everything except the target column is a feature.
X_train_syntactic = train_df_syntactic.drop(columns=['gs']).values
X_test_syntactic = test_df_syntactic.drop(columns=['gs']).values

In [76]:
# Evaluate on syntactic features only, then record the score and the importances.
syntactic_results = evaluate_model(X_train_syntactic, y_train, X_test_syntactic, y_test)
best_correlations['Syntactic'], feature_importances['Syntactic'] = syntactic_results

Pearson Correlation on Testing Set: 0.6607771212389034


In [5]:
# Load String Features
train_df_strings = pd.read_csv('../Features/train/stringFeatures_train.csv')
test_df_strings = pd.read_csv('../Features/test/stringFeatures_test.csv')

# The 'gs' column of these files is dropped below and the string-feature model
# is evaluated against y_train / y_test taken from the lexical files, so the
# rows must carry the same targets in the same order. Fail loudly instead of
# silently scoring against misaligned targets.
assert len(train_df_strings) == len(y_train) and np.allclose(train_df_strings['gs'].values, y_train), \
    "String train 'gs' does not match the lexical train targets"
assert len(test_df_strings) == len(y_test) and np.allclose(test_df_strings['gs'].values, y_test), \
    "String test 'gs' does not match the lexical test targets"

# Everything except the target column is a feature.
X_train_strings = train_df_strings.drop(columns=['gs']).values
X_test_strings = test_df_strings.drop(columns=['gs']).values

In [78]:
# Evaluate on string features only, then record the score and the importances.
strings_results = evaluate_model(X_train_strings, y_train, X_test_strings, y_test)
best_correlations['Strings'], feature_importances['Strings'] = strings_results

Pearson Correlation on Testing Set: 0.6846350383049339


In [6]:
# Join all features
# Put the three feature tables side by side, then remove the 'gs' target
# columns so that only feature columns remain.
train_df_unrestricted = pd.concat(
    [train_df_lexical, train_df_syntactic, train_df_strings], axis=1
).drop(columns=['gs'])
test_df_unrestricted = pd.concat(
    [test_df_lexical, test_df_syntactic, test_df_strings], axis=1
).drop(columns=['gs'])

# Plain arrays for the model.
X_train_unrestricted = train_df_unrestricted.values
X_test_unrestricted = test_df_unrestricted.values

In [8]:
# Evaluate on the combined feature set, then record the score and the importances.
unrestricted_results = evaluate_model(X_train_unrestricted, y_train, X_test_unrestricted, y_test)
best_correlations['Unrestricted'], feature_importances['Unrestricted'] = unrestricted_results

Pearson Correlation on Testing Set: 0.7570230462350941


In [14]:
# Feature Importance

# Keep only the features whose importance in the all-features model is
# strictly greater than this threshold.
min_importance = 0.01

# (column position, importance) pairs for the retained features, sorted by
# importance in descending order.
important_features = sorted(
    (
        (idx, importance)
        for idx, importance in enumerate(feature_importances['Unrestricted'])
        if importance > min_importance
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

# Map each retained position back to its column name in the joined table.
important_columns = [train_df_unrestricted.columns[idx] for idx, _ in important_features]

# Display the results. The count in the header is derived from the selection
# rather than hard-coded, so it stays correct if the threshold or data change.
print(f"There are {len(important_features)} features with importance > {min_importance}:")
print(f"Top {len(important_features)} feature importances:")
for (_, importance), column in zip(important_features, important_columns):
    print(f"    - {column}: {importance}")

# Select the relevant columns from both splits
importance_train_df = train_df_unrestricted[important_columns]
importance_test_df = test_df_unrestricted[important_columns]

X_train_importance = importance_train_df.values
X_test_importance = importance_test_df.values

There are 10 features with importance > 0.01:
Top 10 feature importances:
    - lemmas_wn_aug_overlap: 0.4325020377546965
    - normal_char_2gram: 0.16297134111640896
    - chunk_sim_s: 0.0412800853629427
    - lemmas_weighted_overlap: 0.02753037523823193
    - normal_char_5gram: 0.01989175950164162
    - wsd_wn_aug_overlap: 0.016492690513259033
    - wsd_lin_similarity: 0.014270287965658636
    - sw_gst_5: 0.014165231257718914
    - lemmas_resnik_similarity: 0.010444430980915709
    - wsd_resnik_similarity: 0.010332560312099703


In [13]:
# Evaluate on the importance-selected features, then record the score and the importances.
importance_results = evaluate_model(X_train_importance, y_train, X_test_importance, y_test)
best_correlations['FeatureSelection'], feature_importances['FeatureSelection'] = importance_results

Pearson Correlation on Testing Set: 0.7446161791147365


In [83]:
# Build a two-column summary table: one row per feature set, in insertion order.
best_correlations_df = pd.DataFrame(
    {
        "Features": list(best_correlations.keys()),
        "Correlation": list(best_correlations.values()),
    }
)

best_correlations_df

Unnamed: 0,Features,Correlation
0,Lexical,0.728462
1,Syntactic,0.660777
2,Strings,0.684635
3,Unrestricted,0.757023
4,FeatureSelection,0.744606
