In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.express as px
import plotly.graph_objects as go
import re
from pathlib import Path
import os
import time
import csv
import openai
import tiktoken
from statistics import mean
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import VotingClassifier, BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, get_scorer_names, mean_squared_error, r2_score, mean_squared_error, roc_auc_score, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, KFold, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# import nltk
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# words = set(nltk.corpus.words.words())



In [2]:
# Relative directories used by the notebook: one holds the input / embedding
# CSVs, the other receives the per-trait grid-search result CSVs.
training_data_path = Path('training_data')
model_selection_record_path = Path('model_selection_record')

## data processing

In [4]:
# Load the raw essays table; the file is decoded as cp1252 (Windows-1252).
essays = pd.read_csv(training_data_path / 'essays.csv', encoding='cp1252')
# mypersonality = pd.read_csv(root_path / 'mypersonality_final.csv', encoding='cp1252')

In [4]:
# Recode the label columns at positions 2..6 from 'n'/'y' to the strings
# '0'/'1' (values stay strings here; no numeric cast is performed).
for col in essays.columns[2:7]:
    essays[col] = essays[col].replace({'n': '0', 'y': '1'})

In [5]:
# for col in mypersonality.columns[7:12]:
#     mypersonality[col] = mypersonality[col].replace('n', '0')
#     mypersonality[col] = mypersonality[col].replace('y', '1')

In [5]:
def clear_text(sentences, rmpunc, rmsw):
    """Normalise raw texts and build the matching embedding file stem.

    Every text is lower-cased and stripped of digit runs.  Depending on the
    flags, punctuation is replaced by spaces and stand-alone single ASCII
    letters are dropped (``rmpunc``), and stop words are removed with
    ``remove_stopwords`` (``rmsw``).  Finally, runs of spaces are collapsed
    to a single space.

    Parameters
    ----------
    sentences : iterable of str
        Raw texts to clean.
    rmpunc : bool
        Remove punctuation and single-letter tokens.
    rmsw : bool
        Remove stop words.

    Returns
    -------
    tuple of (str, list of str)
        ``file_name`` -- ``'essays_embedding'`` plus ``'_rmpunc'`` and/or
        ``'_rmsw'`` for the enabled options -- and the cleaned texts, in
        input order.
    """
    # Raw-string patterns ("\d" in a plain string is an invalid escape
    # sequence), compiled once instead of once per sentence.
    digits = re.compile(r'\d+')
    punctuation = re.compile(r'[^\w\s]')
    single_letter = re.compile(r'\b[a-zA-Z]\b')

    clean_sentences = []
    for sentence in sentences:
        sentence = digits.sub('', sentence.lower())

        # punc
        if rmpunc:
            # Punctuation becomes a space so that e.g. "mid-day" splits into
            # two words; the leftovers of contractions ("can't" -> "can t")
            # are then removed as single-letter tokens.
            sentence = punctuation.sub(' ', sentence)
            sentence = single_letter.sub('', sentence)

        # stopwords
        if rmsw:
            sentence = remove_stopwords(sentence)

        # Collapse the runs of spaces left behind by the substitutions.
        sentence = ' '.join(w for w in sentence.split(' ') if w != '')
        clean_sentences.append(sentence)

    file_name = 'essays_embedding'
    if rmpunc:
        file_name += '_rmpunc'
    if rmsw:
        file_name += '_rmsw'

    return file_name, clean_sentences

In [6]:
# Clean the raw TEXT column with both punctuation and stop-word removal
# enabled; file_name encodes those options.
file_name, essays['clean_text'] = clear_text(essays['TEXT'], True, True)

In [84]:
# Preview the first rows, including the new clean_text column.
essays.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,clean_text
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,well right now just woke up from mid day nap i...
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0,well here we go with the stream of consciousne...
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1,an open keyboard and buttons to push the thing...
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0,can believe it it really happening my pulse is...
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1,well here go with the good old stream of consc...


In [93]:
# remove data which is too short
# essays['clean_text_length'] = [len(t.split(' ')) for t in essays.clean_text]
# essays = essays[essays.clean_text_length >= 100]

In [62]:
# (rows, columns) of the essays table.
essays.shape

(2468, 8)

## embedding

In [65]:
# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be typed into (and saved with) the notebook; the placeholder is kept as
# the fallback when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your-api-key')

In [66]:
def get_embedding(text):
    """Request the ``text-embedding-ada-002`` embedding for ``text``.

    Returns the ``'embedding'`` entry of the first item in the response's
    ``'data'`` list.
    """
    result = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    first_item = result['data'][0]
    return first_item['embedding']

In [81]:
# One get_embedding call per cleaned essay, then persist the whole table so
# the embeddings can be reloaded from CSV later.
essays['clean_text_embedding'] = list(map(get_embedding, essays['clean_text']))
essays.to_csv(
    training_data_path / f'{file_name}.csv',
    index=False,
    encoding='cp1252',
)

In [34]:
# mypersonality['clean_text_embedding'] = [get_embedding(ct) for ct in mypersonality.clean_text]
# mypersonality.to_csv(root_path / 'mypersonality_final_embedding.csv', encoding = 'cp1252') 

In [26]:
# Preview the first rows, including the clean_text_embedding column.
essays.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,clean_text,clean_text_embedding
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,right woke mid day nap sort weird remember sta...,"[0.0056798579171299934, 0.003075016662478447, ..."
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0,stream consciousness essay like high school pr...,"[0.005028173327445984, 0.00402528652921319, 0...."
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1,open keyboard button push thing finally worked...,"[-0.01287815161049366, 0.008358887396752834, 0..."
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0,believe happening pulse racing like mad like f...,"[-0.02485239878296852, -0.0013808832736685872,..."
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1,good old stream consciousness assignment feel ...,"[0.016445200890302658, -0.0015039060963317752,..."


## SVM classifier - model selection

In [277]:
def the_best_svc(y_col, kernel, gamma, s, e, n):
    """Grid-search the SVC regularisation strength ``C`` for one label column.

    Uses the notebook-level ``essays_embedding`` frame: the
    ``clean_text_embedding`` column supplies the features and ``y_col`` the
    labels.  A stratified 20 % split is held out (and not used here); the
    remaining 80 % is passed to ``GridSearchCV`` with its default
    cross-validation and scoring.

    Side effects
    ------------
    * Stores the best ``C`` in the notebook-level ``best_params`` dict under
      the key ``y_col``.
    * Writes one row per candidate ``C`` (mean train/test CV score and their
      gap, plus the overall best score / best ``C``) to
      ``model_selection_record_path / 'svc_{file_name}_{y_col}_{kernel}_{gamma}_{n}.csv'``.

    Parameters
    ----------
    y_col : str
        Label column of ``essays_embedding`` to predict.
    kernel, gamma
        Passed straight to ``SVC``.
    s, e, n
        Candidate values are ``np.logspace(s, e, n)``, i.e. ``n`` values
        between ``10**s`` and ``10**e``.

    Returns
    -------
    None
    """
    # Fail fast: create the output directory *before* the (slow) search so a
    # missing folder cannot discard the results at the final to_csv call.
    model_selection_record_path.mkdir(parents=True, exist_ok=True)

    X = list(essays_embedding['clean_text_embedding'])
    y = essays_embedding[y_col]

    # The held-out 20 % is deliberately left untouched by model selection.
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              stratify=y,
                                              test_size=0.2,
                                              random_state=42)

    Cs = np.logspace(s, e, n)

    svc = SVC(kernel=kernel, gamma=gamma, random_state=42)
    search = GridSearchCV(svc, {'C': Cs}, return_train_score=True, n_jobs=-1)
    search.fit(X_train, y_train)

    best_C = search.best_params_['C']
    mean_train = search.cv_results_['mean_train_score']
    mean_test = search.cv_results_['mean_test_score']

    # Scalars are broadcast over the len(Cs) rows; key order is column order.
    record = pd.DataFrame({
        'y': y_col,
        'kernel': svc.kernel,
        'gamma': svc.gamma,
        'C': Cs,
        'mean_train_score': mean_train,
        'mean_test_score': mean_test,
        'score_gap': mean_train - mean_test,
        'best_score': search.best_score_,
        'best_params': best_C,
    })

    best_params.update({y_col: best_C})

    record.to_csv(model_selection_record_path / f'svc_{file_name}_{y_col}_{kernel}_{gamma}_{len(Cs)}.csv', index=False)

In [242]:
# Choose which saved embedding CSV to work with; the suffixes name the
# cleaning options (_rmpunc: punctuation removed, _rmsw: stop words removed).
file_name = 'essays_embedding_rmpunc_rmsw'
# file_name = 'essays_embedding_rmpunc'
# file_name = 'essays_embedding'
# file_name = 'essays_embedding_rmsw'

In [243]:
# Reload the saved essays + embeddings table for the selected file_name.
essays_embedding = pd.read_csv(training_data_path / f'{file_name}.csv', encoding = 'cp1252') 
# mypersonality_embedding = pd.read_csv(training_data_path / 'mypersonality_final_embedding.csv', encoding = 'cp1252') 

In [None]:
# Scratch cell: only create an empty frame when nothing has been loaded yet,
# so running the notebook top-to-bottom does not wipe the table that was
# just read from CSV.
if 'essays_embedding' not in globals():
    essays_embedding = pd.DataFrame()

In [244]:
def _parse_embedding(raw):
    """Turn one stored embedding back into a 1-D float array.

    Values read from the CSV are text of the form ``"[v1, v2, ...]"``.
    Only the surrounding brackets are stripped, so every digit of the last
    component is preserved.  Values that are no longer strings (the cell was
    already run) are passed through as float arrays, which makes the cell
    safe to re-run.
    """
    if not isinstance(raw, str):
        return np.asarray(raw, dtype=float)
    return np.array([float(tok) for tok in raw.strip().strip('[]').split(',')])


essays_embedding['clean_text_embedding'] = [
    _parse_embedding(x) for x in essays_embedding['clean_text_embedding']
]

# mypersonality_embedding['clean_text_embedding'] = [
#     np.array([float(n) for n in x[1:-2].split(', ')])
#     for x in mypersonality_embedding['clean_text_embedding']
# ]

In [74]:
# data = pd.concat([
#     essays_embedding[['cEXT','clean_text_embedding']],
#     mypersonality_embedding[['cEXT','clean_text_embedding']]
# ])

In [59]:
# mypersonality_embedding['clean_text_embedding'] = [
#     [float(n) for n in x[1:-2].split(', ')]
#     for x in mypersonality_embedding['clean_text_embedding']
# ]

In [278]:
# Label columns to tune, one grid search each.
y_cols = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
# y_cols = ['cEXT']

# Filled by the_best_svc: label column -> best C.
best_params = {}

for y_col in y_cols:
    print(f'running y = {y_col}, file_name = {file_name} at {time.ctime()}')
    # Linear kernel; C is searched over np.logspace(0, 0.33, 50).
    the_best_svc(y_col, kernel='linear', gamma='scale', s=0, e=0.33, n=50)

print('done')

running y = cEXT, file_name = essays_embedding_rmpunc_rmsw at Wed Aug 30 12:14:10 2023

running y = cNEU, file_name = essays_embedding_rmpunc_rmsw at Wed Aug 30 12:19:52 2023
running y = cAGR, file_name = essays_embedding_rmpunc_rmsw at Wed Aug 30 12:25:26 2023
running y = cCON, file_name = essays_embedding_rmpunc_rmsw at Wed Aug 30 12:31:01 2023
running y = cOPN, file_name = essays_embedding_rmpunc_rmsw at Wed Aug 30 12:36:36 2023
done


## load the model selection records and pick the best C

In [10]:
y_cols = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

# Read every matching grid-search record, then concatenate once.  Growing a
# frame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame triggers pandas' empty-entry concat deprecation.
record_frames = []
# sorted(): glob order is filesystem dependent; keep the row order stable.
for file in sorted(model_selection_record_path.glob('*svc_essays_embedding*linear*50*')):
    if file.suffix != '.csv':
        # The pattern has no extension, so skip stray non-CSV matches
        # instead of trying to open a "<name>.csv" that does not exist.
        continue
    print(f'opening {file.stem} at {time.ctime()}')

    temp = pd.read_csv(file)
    # Keep the part of the file name after the common prefix as a tag.
    temp['file'] = file.stem.split('svc_essays_embedding_')[1]
    record_frames.append(temp)

all_records = pd.concat(record_frames) if record_frames else pd.DataFrame()

opening svc_essays_embedding_rmpunc_rmsw_cCON_linear_scale_50 at Thu Jan 11 21:28:06 2024
opening svc_essays_embedding_rmpunc_rmsw_cOPN_linear_scale_50 at Thu Jan 11 21:28:06 2024
opening svc_essays_embedding_rmpunc_rmsw_cNEU_linear_scale_50 at Thu Jan 11 21:28:06 2024
opening svc_essays_embedding_rmpunc_rmsw_cEXT_linear_scale_50 at Thu Jan 11 21:28:06 2024
opening svc_essays_embedding_rmpunc_rmsw_cAGR_linear_scale_50 at Thu Jan 11 21:28:06 2024


In [12]:
# Swap the short label codes for readable trait names (whole-cell matches
# only, in any column).
all_records = all_records.replace({
    'cCON': 'Conscientiousness',
    'cEXT': 'Extraversion',
    'cNEU': 'Neuroticism',
    'cAGR': 'Agreeableness',
    'cOPN': 'Openness',
})

In [16]:
new_y_cols = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

# One row per trait, in the order of new_y_cols.  The per-trait slices are
# concatenated in a single call; starting from an empty, object-typed frame
# and growing it in a loop is deprecated in pandas and can turn the numeric
# columns into object dtype.
best = pd.concat(
    [all_records.loc[all_records.y == trait, ['y', 'best_params', 'best_score']]
     for trait in new_y_cols]
).drop_duplicates()
best

Unnamed: 0,y,best_params,best_score
0,Openness,2.105064,0.611951
0,Conscientiousness,1.114661,0.558752
0,Extraversion,1.830851,0.553697
0,Agreeableness,1.617244,0.548114
0,Neuroticism,1.097509,0.587152


In [18]:
# write into a yaml file
import yaml

# trait name -> best C.  Cast to built-in str/float so the file contains
# plain YAML scalars even if the frame hands back NumPy scalar types.
best_params = {str(row.y): float(row.best_params) for row in best.itertuples()}

with open('the_best_params.yaml', 'w') as f:
    # safe_dump emits standard YAML tags only, so any YAML loader can read it.
    yaml.safe_dump(best_params, f)

## visualize the record

In [26]:
colors = ['darksalmon', 'lightsteelblue', 'darkseagreen', 'navajowhite', 'indianred']
# (score column, legend prefix, marker symbol) for the two curves per trait.
curves = [
    ('mean_train_score', 'Mean Train Score', 'circle'),
    ('mean_test_score', 'Mean Test Score', 'x'),
]
fig = go.Figure()

# zip() pairs each trait with its colour instead of relying on a hard-coded
# range(0, 5); train and test curves of a trait share the colour and differ
# only in marker symbol.
for trait, color in zip(new_y_cols, colors):
    trait_records = all_records[all_records.y == trait]

    for score_col, label, symbol in curves:
        fig.add_trace(
            go.Scatter(
                x=trait_records['C'],
                y=trait_records[score_col],
                name=f'{label} of {trait}',
                mode='lines+markers',
                marker=dict(
                    symbol=symbol,
                    size=5),
                line=dict(
                    width=2,
                    color=color
                )))

# Last expression of the cell: update_layout's return value is what gets
# rendered.
fig.update_layout(
    title='Performance of Each Personality',
    xaxis_title='C',  # x-axis name
    yaxis_title='Accuracy',  # y-axis name
    template = 'plotly_dark',
    paper_bgcolor='black',
#     bgcolor="#494b5a"
)