# The Feature Extraction Approach for Personality Score Prediction
This colab is written in **Python** to illustrate the *feature extraction approach*: TF-IDF scores are extracted from texts and fed to a random forest regressor to predict personality scores.

### **Step 1 Text Preprocessing**
In the text preprocessing phase, we (1) removed special characters and punctuation, (2) lowercased all texts, (3) tokenized the texts, and (4) removed stop words.

In [None]:
# Mount Google drive to get access to the data
# The drive is mounted at /content/drive; every data path used later in this
# notebook lives under that mount point.
# force_remount=True asks Colab to redo the mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# import required packages
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
string.punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# import raw data
csv_file = '/content/drive/MyDrive/Text Selection Paper Codes/data/all_text_latent_extract_10.csv' # path to data file
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the file; confirm this matches the file's real encoding.
# The later cells read the 'All_response' text column and the score columns from df.
df = pd.read_csv(csv_file, encoding= 'unicode_escape')


In [None]:
# Translation table that deletes every character listed in string.punctuation.
# It is built once so that each call is a single C-level pass over the text
# instead of a per-character Python loop that first materialises a list.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    dropped; all other characters (letters, digits, whitespace, non-ASCII
    symbols) are kept in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
# Store the punctuation-free version of every response in a new column
df['clean_text'] = df['All_response'].apply(remove_punctuation)

In [None]:
# Lowercase the punctuation-free texts so later matching is case-insensitive
df['msg_lower'] = df['clean_text'].apply(str.lower)

In [None]:
# Split every lowercased text into a list of word tokens
df['msg_tokenied'] = df['msg_lower'].map(word_tokenize)

In [None]:
# Load NLTK's pre-defined English stop-word list
stop_words = stopwords.words('english')
# Extend the stop-word list with some high-frequency words in the current data.
# Entries are written in lowercase because the texts are lowercased before the
# stop-word filter runs; an uppercase entry such as "X" could never match a token.
stop_words.extend(["would", "dont", "could", "id", "x"])

In [None]:
# define remove stop words function
def remove_english_stopwords_func(text, stop_word_list=None):
    """Drop stop words from a token sequence and join the rest with spaces.

    The comparison is case-insensitive on both sides: tokens and stop words
    are lowercased before matching, so an entry like "X" still removes "x".

    Args:
        text: Iterable of string tokens (for example one tokenized response).
        stop_word_list: Optional iterable of stop words. When omitted, the
            module-level ``stop_words`` list is used.

    Returns:
        One string made of the surviving tokens, in their original order and
        original casing, separated by single spaces.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # A set gives O(1) membership tests instead of scanning the list per token.
    blocked = {word.lower() for word in stop_word_list}
    return ' '.join(token for token in text if token.lower() not in blocked)

In [None]:
# Filter the stop words out of every token list; the result is one
# space-joined string per response
df['No_Stop_Words'] = df['msg_tokenied'].map(remove_english_stopwords_func)

### **Step 2 feature extraction**
In the feature extraction phase, we generate the TF-IDF vectors.

In [None]:
# The cleaned, stop-word-free texts are the corpus handed to the vectorizer
document = df['No_Stop_Words']

In [None]:
# generate TF-IDF vectors with 5000 features
vectorizer = TfidfVectorizer(max_features=5000)
vectors = vectorizer.fit_transform(document)
feature_names = vectorizer.get_feature_names_out()
# Convert the sparse matrix straight to a NumPy array and wrap it in a
# DataFrame. Going through a nested Python list first would box every cell as a
# separate float object, which is far more memory-hungry for a documents x 5000 table.
dense = vectors.toarray()
tfidf = pd.DataFrame(dense, columns=feature_names)

In [None]:
# Add labels to the TF-IDF vectors.
# Keys are the column names created in ``tfidf``; values are the columns of
# ``df`` they are copied from. The dict order is the order the columns are added.
label_sources = {
    'ascore': 'ascore',
    'cscore': 'cscore',
    'nscore': 'nscore',
    'escore': 'escore',
    'oscore': 'oscore',
    'lead_task': 'task',
    'lead_people': 'people',
    'lead_char': 'char',
    'lead_ethic': 'ethic',
}
for target_column, source_column in label_sources.items():
    tfidf[target_column] = df[source_column]

In [None]:
# Save the TF-IDF scores. After the file has been saved, it was further split
# into a training, an evaluation and a testing set.
# The file is named tfidf_5000.csv to match the 5000 features generated by the
# vectorizer and the path that the score-prediction step reads back.
# index=False keeps the row index out of the CSV, so the leading columns of the
# file are exactly the TF-IDF features when they are selected by position.
tfidf.to_csv('/content/drive/MyDrive/Text Selection Paper Codes/data/tfidf_5000.csv', index=False)

### **Step 3 Score Prediction**
In the score prediction phase, we used a random forest model to predict personality scores based on TF-IDF vectors. The current code sample uses the prediction of Openness scores (the `oscore` column) as an example. Other predictions can be achieved by changing the label column.

In [None]:
# import required packages
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# specify the first 2000 columns as features, and the personality scores as labels
#train_features = train_set.iloc[:,0:2000]
#test_features = test_set.iloc[:,0:2000]
#train_labels = train_set.escore
#test_labels = test_set.escore
#feature_list = list(train_features.columns)

In [None]:
tfidf_set = pd.read_csv('/content/drive/MyDrive/Text Selection Paper Codes/data/tfidf_5000.csv', encoding= 'unicode_escape')
# Select the TF-IDF features by name instead of by position. A positional slice
# (first 5000 columns) silently goes wrong in two cases:
#   * the CSV was written with its row index, which is read back as a leading
#     'Unnamed: 0' column and would be used as a feature;
#   * the vocabulary has fewer than 5000 terms, so the slice would reach into
#     the label columns and leak them into the features.
label_columns = [
    'ascore', 'cscore', 'nscore', 'escore', 'oscore',
    'lead_task', 'lead_people', 'lead_char', 'lead_ethic',
]
feature_columns = [
    column for column in tfidf_set.columns
    if column not in label_columns and not column.startswith('Unnamed: ')
]
X = tfidf_set[feature_columns]
# The label predicted in this sample; swap the column to predict another score
y = tfidf_set.oscore

In [None]:
# Nested cross-validation: the outer folds give held-out test sets, the inner
# folds tune the random forest hyperparameters on each outer training set.

# Number of folds
num_folds = 5

# One seed for the fold splitters and the forest, so a rerun reproduces the scores
random_seed = 42

# Initialize KFold for the outer loop (train-test split)
kf_outer = KFold(n_splits=num_folds, shuffle=True, random_state=random_seed)

# Initialize KFold for the inner loop (cross-validation within the training data).
# It does not depend on the outer fold, so it is created once.
kf_inner = KFold(n_splits=4, shuffle=True, random_state=random_seed)

# Parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [200, 500, 800],
    'max_depth': [20, 50, 100],
    # Add other hyperparameters you want to tune
}


def pearson_correlation(y_true, y_pred):
    """Return the Pearson correlation between observed and predicted scores."""
    return np.corrcoef(y_true, y_pred)[0, 1]


# Use Pearson correlation as the model-selection score (higher is better)
pearson_scorer = make_scorer(pearson_correlation, greater_is_better=True)

# One DataFrame of test-set predictions per outer fold
all_fold_scores = []

# Outer loop for train-test split
for fold_index, (train_index, test_index) in enumerate(kf_outer.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Criterion measures for the held-out rows of this fold
    task = tfidf_set.lead_task.iloc[test_index]
    people = tfidf_set.lead_people.iloc[test_index]
    char = tfidf_set.lead_char.iloc[test_index]
    ethic = tfidf_set.lead_ethic.iloc[test_index]

    # Initialize RandomForestRegressor (seeded for reproducibility)
    rf_regressor = RandomForestRegressor(random_state=random_seed)

    # Initialize GridSearchCV with RandomForestRegressor and the parameter grid
    grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=kf_inner, scoring=pearson_scorer)

    # Fit the model on the training data with grid search for hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters found by GridSearchCV
    best_params = grid_search.best_params_

    # GridSearchCV (default refit=True) has already retrained the best
    # configuration on the entire training set, so reuse that model instead of
    # fitting the largest forest a second time.
    rf_regressor = grid_search.best_estimator_

    # Predictions on the test set
    y_pred = rf_regressor.predict(X_test)

    # Keep the predictions of this fold together with the original row positions
    fold_results = pd.DataFrame({'fold_index': fold_index, 'test_index': test_index, 'predicted_scores': y_pred})

    # Append the DataFrame to the list
    all_fold_scores.append(fold_results)

    # Calculate Pearson correlation on the test set (convergent validity)
    correlation, _ = pearsonr(y_pred, y_test)

    # calculate criterion validity
    criterion_task = np.corrcoef(y_pred,task)[0, 1]
    criterion_people = np.corrcoef(y_pred,people)[0, 1]
    criterion_char = np.corrcoef(y_pred,char)[0, 1]
    criterion_ethic = np.corrcoef(y_pred,ethic)[0, 1]

    # incremental validity (not implemented)
    #regression1 = LinearRegression()
    #regression1.fit(y_test, task)

    print(f"best parameters are:{best_params}")
    print(f"Pearson Correlation on Test Set: {correlation}")
    print(f"Criterion Correlation on task: {criterion_task}")
    print(f"Criterion Correlation on people: {criterion_people}")
    print(f"Criterion Correlation on char: {criterion_char}")
    print(f"Criterion Correlation on ethic: {criterion_ethic}")
    print("-" * 30)


In [None]:
# Stack the per-fold prediction tables into one DataFrame; ignore_index=True
# renumbers the rows consecutively instead of repeating each fold's own index.
all_results = pd.concat(all_fold_scores, ignore_index=True)

In [None]:
# Write the pooled test-fold predictions to Drive. index=False is not passed,
# so the DataFrame index is written as the first CSV column.
all_results.to_csv('/content/drive/MyDrive/Text Selection Paper Codes/final saved outputs/TFIDF/Oscore.csv')