In [19]:
import pandas as pd
import numpy as np
import json
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import re
from pathlib import Path
import os
import time
import yaml
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import VotingClassifier, BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, get_scorer_names, mean_squared_error, r2_score, mean_squared_error, roc_auc_score, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from statistics import mean
import warnings
# Silences every warning for the rest of the session, including deprecation
# and convergence warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [15]:
# Directory that holds the essays training CSV read below.
training_data_path = Path('training_data')
# Directory searched later for the per-movie embedding JSON files.
embedding_path = Path('Movie-Script-Database/scripts/embedding')
# Directory searched later for the SVC model-selection record CSVs.
model_selection_record_path = Path('model_selection_record')

## import training data

In [16]:
# Read the essays table (raw text, five binary trait labels, cleaned text and
# its embedding) from the training-data directory.
file_name = 'essays_embedding_rmpunc_rmsw'
essays_csv_path = training_data_path / (file_name + '.csv')
essays_embedding = pd.read_csv(essays_csv_path, encoding='cp1252')

In [17]:
# Preview the first rows of the essays table.
essays_embedding.head()

Unnamed: 0.1,Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,clean_text,clean_text_embedding
0,0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,right woke mid day nap sort weird moved texas ...,"[-0.0008707201923243701, 0.002650993410497904,..."
1,1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0,stream consciousness essay things like high sc...,"[0.001168464426882565, 0.0072226133197546005, ..."
2,2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1,open keyboard buttons push thing finally worke...,"[-0.00998779572546482, 0.0114965895190835, 0.0..."
3,3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0,believe happening pulse racing like mad like f...,"[-0.02526545524597168, -0.007222685497254133, ..."
4,4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1,good old stream consciousness assignment feel ...,"[0.00883069634437561, -0.004120076075196266, 0..."


## Import model-selection records

In [4]:
# Load every SVC model-selection record for the linear kernel / 50-step runs
# and stack them into one table. Frames are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
record_frames = []
record_stems = []

for file in model_selection_record_path.glob('*svc_essays_embedding*linear*50*'):
    print(f'opening {file.stem} at {time.ctime()}')

    temp = pd.read_csv(model_selection_record_path / f'{file.stem}.csv')
    # Keep the part of the file name after the common prefix as a run label.
    temp['file'] = file.stem.split('svc_essays_embedding_')[1]
    record_frames.append(temp)
    record_stems.append(file.stem)

if not record_frames:
    # Without this guard the kernel/gamma lookup below would fail with a
    # NameError on a fresh kernel, or silently reuse a stale `file` value.
    raise FileNotFoundError(
        f'no model selection records found in {model_selection_record_path}'
    )

all_records = pd.concat(record_frames)

# File stems end in "<kernel>_<gamma>_<n>", e.g. "..._linear_scale_50";
# the kernel and gamma of the last file read are reused for prediction.
stem_parts = record_stems[-1].split('_')
kernel = stem_parts[-3]
gamma = stem_parts[-2]
print(kernel, gamma)

opening svc_essays_embedding_rmpunc_rmsw_cCON_linear_scale_50 at Thu Jan 11 21:43:54 2024
opening svc_essays_embedding_rmpunc_rmsw_cOPN_linear_scale_50 at Thu Jan 11 21:43:54 2024
opening svc_essays_embedding_rmpunc_rmsw_cNEU_linear_scale_50 at Thu Jan 11 21:43:54 2024
opening svc_essays_embedding_rmpunc_rmsw_cEXT_linear_scale_50 at Thu Jan 11 21:43:54 2024
opening svc_essays_embedding_rmpunc_rmsw_cAGR_linear_scale_50 at Thu Jan 11 21:43:54 2024
linear scale


In [6]:
# Preview the first rows of the combined model-selection records.
all_records.head()

Unnamed: 0,y,kernel,gamma,C,mean_train_score,mean_test_score,score_gap,best_score,best_params,file
0,cCON,linear,scale,1.0,0.626646,0.558247,0.068399,0.558752,1.114661,rmpunc_rmsw_cCON_linear_scale_50
1,cCON,linear,scale,1.015628,0.626899,0.558247,0.068652,0.558752,1.114661,rmpunc_rmsw_cCON_linear_scale_50
2,cCON,linear,scale,1.0315,0.627153,0.556727,0.070426,0.558752,1.114661,rmpunc_rmsw_cCON_linear_scale_50
3,cCON,linear,scale,1.047621,0.627659,0.556727,0.070932,0.558752,1.114661,rmpunc_rmsw_cCON_linear_scale_50
4,cCON,linear,scale,1.063993,0.628673,0.556221,0.072452,0.558752,1.114661,rmpunc_rmsw_cCON_linear_scale_50


In [8]:
# Spell out the Big Five trait codes wherever one appears as a whole cell value.
trait_full_names = {
    'cCON': 'Conscientiousness',
    'cEXT': 'Extraversion',
    'cNEU': 'Neuroticism',
    'cAGR': 'Agreeableness',
    'cOPN': 'Openness',
}
all_records = all_records.replace(trait_full_names)

In [9]:
# Load the per-trait value that is later passed to the SVC as its C parameter.
best_params = yaml.safe_load(Path('the_best_params.yaml').read_text())

best_params

{'Agreeableness': 1.6172438571077898,
 'Conscientiousness': 1.114661130737398,
 'Extraversion': 1.830851251413804,
 'Neuroticism': 1.0975091843604738,
 'Openness': 2.105064009445578}

In [11]:
# Trait names taken from best_params; the prediction loop iterates over them
# and adds one output column per trait.
y_cols = best_params.keys()
y_cols

dict_keys(['Agreeableness', 'Conscientiousness', 'Extraversion', 'Neuroticism', 'Openness'])

## predict

In [123]:
def predict_personality(y_col):
    """Fit an SVC for one trait and score every row of the test set.

    The function reads the notebook-level variables ``kernel``, ``gamma``,
    ``best_params``, ``X_train``, ``y_train`` and ``X_test``; all of them
    must be assigned before it is called.

    Parameters
    ----------
    y_col
        Key into ``best_params``; the stored value is used as the SVC's ``C``.

    Returns
    -------
    The second column of ``predict_proba(X_test)``, one value per test row
    (presumably the probability of label 1, given 0/1 labels; confirm).
    """
    classifier = SVC(
        C=best_params[y_col],
        kernel=kernel,
        gamma=gamma,
        probability=True,
        random_state=42,
    )
    classifier.fit(X_train, y_train)

    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]

In [21]:
# Collect one [movie, character, embedding] record per character from every
# embedding JSON file, then build the table in a single step.
movie_rows = []
for embedding_file in embedding_path.glob('*embedding*'):
    with open(embedding_file) as ef:
        data = json.load(ef)

    # Only the first top-level key of each file is read as the movie name.
    movie_name = list(data.keys())[0]
    movie_rows.extend(
        [movie_name, char, char_data['clean_line_embedding']]
        for char, char_data in data[movie_name].items()
    )

movie_embedding = pd.DataFrame(
    movie_rows,
    columns=['movie_name', 'char', 'clean_line_embedding'],
)
print(movie_embedding.shape)

(3279, 3)


In [22]:
# The essays table labels the traits with short codes, while y_cols (taken
# from best_params) uses the full trait names. Map one to the other so the
# loop below does not depend on a column rename done outside this notebook.
trait_label_cols = {
    'Agreeableness': 'cAGR',
    'Conscientiousness': 'cCON',
    'Extraversion': 'cEXT',
    'Neuroticism': 'cNEU',
    'Openness': 'cOPN',
}

# Each embedding is stored in the CSV as the text of a list ("[a, b, ...]").
# Only the surrounding brackets are stripped: slicing off a fixed two trailing
# characters would also cut the last digit of the final component.
X_train = [
    np.array([float(n) for n in x.strip().strip('[]').split(',') if n.strip()])
    for x in essays_embedding['clean_text_embedding']
]

X_test = [np.array(x) for x in movie_embedding['clean_line_embedding']]

for y_col in y_cols:
    print(f'predicting {y_col} at {time.ctime()}')
    # Use the column as-is when the table already carries the full trait
    # name, otherwise fall back to the short label code.
    label_col = y_col if y_col in essays_embedding.columns else trait_label_cols[y_col]
    # predict_personality reads X_train, y_train and X_test as globals.
    y_train = np.array(essays_embedding[label_col])
    movie_embedding[y_col] = predict_personality(y_col)

# Drop the bulky embedding column and save one row per movie character.
movie_char_big_five_predict_result = movie_embedding.drop(columns='clean_line_embedding')
movie_char_big_five_predict_result.to_csv('movie_char_big_five_predict_result.csv', index=False)

predicting Agreeableness at Thu Jan 11 21:57:39 2024
predicting Conscientiousness at Thu Jan 11 21:57:39 2024
predicting Extraversion at Thu Jan 11 21:57:39 2024
predicting Neuroticism at Thu Jan 11 21:57:39 2024
predicting Openness at Thu Jan 11 21:57:39 2024


## Organize the prediction results

In [151]:
# Show every movie name whose last dash-separated token is 'The'.
names_ending_in_the = [
    name
    for name in movie_char_big_five_predict_result['movie_name']
    if name.rsplit('-', 1)[-1] == 'The'
]
for name in names_ending_in_the:
    print(name)

In [159]:
# Tidy the display names with vectorised string operations instead of
# row-by-row .loc writes.
#
# 1. Drop a trailing "-The" token. The pattern is anchored to the end of the
#    name, so a "-The" inside the title (e.g. "...-The-Pines-The") is kept;
#    an unanchored replace would delete that one as well.
# 2. Turn the remaining dashes into spaces.
# 3. Capitalise the character name (first letter upper, the rest lower).
movie_char_big_five_predict_result['movie_name'] = (
    movie_char_big_five_predict_result['movie_name']
    .str.replace(r'-The\Z', '', regex=True)
    .str.replace('-', ' ', regex=False)
)
movie_char_big_five_predict_result['char'] = (
    movie_char_big_five_predict_result['char'].str.capitalize()
)

In [162]:
# Show the distinct movie names after cleanup.
# NOTE(review): the three names below presumably still carry a misplaced
# article because "The" is not their last token. Confirm whether they need
# manual fixing.
# 'Avengers The (2012)',
# 'Avventura L (The Adventure)',
# 'Majestic The (The Bijou)'
set(movie_char_big_five_predict_result.movie_name)

{'10 Things I Hate About You',
 '12 Monkeys',
 '12 Years a Slave',
 '12 and Holding',
 '127 Hours',
 '1492 Conquest of Paradise',
 '15 Minutes',
 '17 Again',
 '187',
 '2001 A Space Odyssey',
 '2012',
 '28 Days Later',
 '30 Minutes or Less',
 '42',
 '44 Inch Chest',
 '50 50',
 '500 Days of Summer',
 '8MM',
 'A Few Good Men',
 'A Most Violent Year',
 'A Prayer Before Dawn',
 'A Scanner Darkly',
 'A Serious Man',
 'Above the Law',
 'Absolute Power',
 'Abyss',
 'Adaptation',
 'Adjustment Bureau',
 'Adventures of Buckaroo Banzai Across the Eighth Dimension',
 'Affliction',
 'After School Special',
 'AfterLife',
 'Agnes of God',
 'Airplane',
 'Airplane 2 The Sequel',
 'Ali',
 'Alien 3',
 'Alien Nation',
 'Alien vs Predator',
 'Aliens',
 'All About Steve',
 'All the Kings Men',
 'All the Presidents Men',
 'Alone in the Dark',
 'Amadeus',
 'Amelia',
 'American',
 'American Beauty',
 'American Gangster',
 'American Graffiti',
 'American History X',
 'American Hustle',
 'American Milkshake',
 'A

In [176]:
# Order the rows by movie name, then by character name.
movie_char_big_five_predict_result = movie_char_big_five_predict_result.sort_values(by=['movie_name', 'char'])

In [177]:
# Save the cleaned, sorted table without writing the index column.
movie_char_big_five_predict_result.to_csv('movie_char_big_five_predict_result_organized.csv', index=False)