In [57]:
from datasets import Dataset
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
import torch
import pickle

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
def preprocess(examples, focused_trait="conscientiousness"):
    label = examples[focused_trait]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    examples['label'] = float(label)
    return examples



def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)    
    
    return {"mse": mse, "mae": mae, "r2": r2}



class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [6]:
traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']

# using disilroberta as a quick baseline model
# https://huggingface.co/docs/transformers/tasks/sequence_classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)


for focused_trait in traits:
    print(f"start working on f{focused_trait}")

    
    training_feature_df = pd.read_parquet("liwc_training_dataset_350words.parquet")
    ds = Dataset.from_dict(training_feature_df.to_dict('list'))
    tokenized_ds = ds.map(preprocess, fn_kwargs={"focused_trait": focused_trait})#remove_columns=['post_count','word_count','user'])
    train_test = tokenized_ds.train_test_split(test_size=0.3, seed=42)
    test_eval = train_test['test'].train_test_split(test_size=0.5, seed=42)


    LEARNING_RATE = 2e-5
    MAX_LENGTH = 256
    BATCH_SIZE = 16
    EPOCHS = 20

    training_args = TrainingArguments(
        output_dir=f"../models/fine-tuned-regression-{focused_trait}-training-text-350",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        metric_for_best_model="r2",
        load_best_model_at_end=True,
        weight_decay=0.01,
    )


    trainer = RegressionTrainer(
        model=model,
        args=training_args,
        train_dataset=train_test["train"],
        eval_dataset=test_eval["train"],
        compute_metrics=compute_metrics_for_regression,
    )

    trainer.train()

    trainer.eval_dataset=test_eval["test"]
    eval_ret = trainer.evaluate()

    # save dictionary to person_data.pkl file
    with open(f'../models/fine-tuned-regression-{focused_trait}-training-text-350/eval_result.pkl', 'wb') as fp:
        pickle.dump(eval_ret, fp)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


start working on fconscientiousness


Map: 100%|██████████| 1804/1804 [00:02<00:00, 756.61 examples/s]
  5%|▌         | 79/1580 [01:21<25:33,  1.02s/it] 
  5%|▌         | 79/1580 [01:26<25:33,  1.02s/it]

{'eval_loss': 2172.096435546875, 'eval_mse': 2172.09619140625, 'eval_mae': 45.63023376464844, 'eval_r2': -23.080770548738933, 'eval_runtime': 5.1912, 'eval_samples_per_second': 52.204, 'eval_steps_per_second': 3.275, 'epoch': 1.0}


 10%|█         | 158/1580 [02:47<22:23,  1.06it/s] 
 10%|█         | 158/1580 [02:52<22:23,  1.06it/s]

{'eval_loss': 1795.1483154296875, 'eval_mse': 1795.1484375, 'eval_mae': 41.293212890625, 'eval_r2': -18.90176805904045, 'eval_runtime': 5.0918, 'eval_samples_per_second': 53.223, 'eval_steps_per_second': 3.339, 'epoch': 2.0}


 15%|█▌        | 237/1580 [04:12<21:08,  1.06it/s]  
 15%|█▌        | 237/1580 [04:17<21:08,  1.06it/s]

{'eval_loss': 1441.5977783203125, 'eval_mse': 1441.5977783203125, 'eval_mae': 36.76367950439453, 'eval_r2': -14.982156577835157, 'eval_runtime': 5.0879, 'eval_samples_per_second': 53.264, 'eval_steps_per_second': 3.341, 'epoch': 3.0}


 20%|██        | 316/1580 [05:36<19:40,  1.07it/s]  
 20%|██        | 316/1580 [05:41<19:40,  1.07it/s]

{'eval_loss': 1115.658935546875, 'eval_mse': 1115.6590576171875, 'eval_mae': 32.02546691894531, 'eval_r2': -11.36866366554671, 'eval_runtime': 5.0531, 'eval_samples_per_second': 53.63, 'eval_steps_per_second': 3.364, 'epoch': 4.0}


 25%|██▌       | 395/1580 [07:01<18:21,  1.08it/s]  
 25%|██▌       | 395/1580 [07:06<18:21,  1.08it/s]

{'eval_loss': 823.2000732421875, 'eval_mse': 823.2000732421875, 'eval_mae': 27.076873779296875, 'eval_r2': -8.12634128170828, 'eval_runtime': 5.1759, 'eval_samples_per_second': 52.359, 'eval_steps_per_second': 3.284, 'epoch': 5.0}


 30%|███       | 474/1580 [08:26<17:59,  1.02it/s]  
 30%|███       | 474/1580 [08:31<17:59,  1.02it/s]

{'eval_loss': 580.0154418945312, 'eval_mse': 580.0154418945312, 'eval_mae': 22.208711624145508, 'eval_r2': -5.430294088749579, 'eval_runtime': 5.0781, 'eval_samples_per_second': 53.367, 'eval_steps_per_second': 3.348, 'epoch': 6.0}


 32%|███▏      | 500/1580 [08:59<17:37,  1.02it/s]

{'loss': 1449.8055, 'learning_rate': 1.3670886075949368e-05, 'epoch': 6.33}


 35%|███▌      | 553/1580 [09:51<16:15,  1.05it/s]
 35%|███▌      | 553/1580 [09:57<16:15,  1.05it/s]

{'eval_loss': 388.0548400878906, 'eval_mse': 388.0548400878906, 'eval_mae': 17.950153350830078, 'eval_r2': -3.3021385865743387, 'eval_runtime': 5.0781, 'eval_samples_per_second': 53.367, 'eval_steps_per_second': 3.348, 'epoch': 7.0}


 40%|████      | 632/1580 [11:16<15:24,  1.03it/s]
 40%|████      | 632/1580 [11:22<15:24,  1.03it/s]

{'eval_loss': 246.49546813964844, 'eval_mse': 246.4954376220703, 'eval_mae': 14.129340171813965, 'eval_r2': -1.7327520123971247, 'eval_runtime': 5.0867, 'eval_samples_per_second': 53.276, 'eval_steps_per_second': 3.342, 'epoch': 8.0}


 45%|████▌     | 711/1580 [12:41<13:34,  1.07it/s]
 45%|████▌     | 711/1580 [12:46<13:34,  1.07it/s]

{'eval_loss': 152.7140350341797, 'eval_mse': 152.71405029296875, 'eval_mae': 10.696043014526367, 'eval_r2': -0.6930519018463375, 'eval_runtime': 5.0547, 'eval_samples_per_second': 53.614, 'eval_steps_per_second': 3.363, 'epoch': 9.0}


 50%|█████     | 790/1580 [14:05<12:12,  1.08it/s]
 50%|█████     | 790/1580 [14:10<12:12,  1.08it/s]

{'eval_loss': 85.9196548461914, 'eval_mse': 85.91963958740234, 'eval_mae': 7.49049186706543, 'eval_r2': 0.04745877365438789, 'eval_runtime': 5.0806, 'eval_samples_per_second': 53.34, 'eval_steps_per_second': 3.346, 'epoch': 10.0}


 55%|█████▌    | 869/1580 [15:31<11:07,  1.06it/s]
 55%|█████▌    | 869/1580 [15:36<11:07,  1.06it/s]

{'eval_loss': 55.11273193359375, 'eval_mse': 55.112728118896484, 'eval_mae': 5.837935447692871, 'eval_r2': 0.38899719224715046, 'eval_runtime': 5.1286, 'eval_samples_per_second': 52.84, 'eval_steps_per_second': 3.315, 'epoch': 11.0}


 60%|██████    | 948/1580 [16:56<10:21,  1.02it/s]
 60%|██████    | 948/1580 [17:01<10:21,  1.02it/s]

{'eval_loss': 44.86359786987305, 'eval_mse': 44.86359786987305, 'eval_mae': 5.1784772872924805, 'eval_r2': 0.5026233352610671, 'eval_runtime': 5.0598, 'eval_samples_per_second': 53.559, 'eval_steps_per_second': 3.36, 'epoch': 12.0}


 63%|██████▎   | 1000/1580 [17:55<09:42,  1.00s/it]

{'loss': 178.4975, 'learning_rate': 7.341772151898735e-06, 'epoch': 12.66}


 65%|██████▌   | 1027/1580 [18:21<08:46,  1.05it/s]
 65%|██████▌   | 1027/1580 [18:26<08:46,  1.05it/s]

{'eval_loss': 44.42943572998047, 'eval_mse': 44.42942810058594, 'eval_mae': 5.225341320037842, 'eval_r2': 0.5074367090603555, 'eval_runtime': 5.0787, 'eval_samples_per_second': 53.361, 'eval_steps_per_second': 3.347, 'epoch': 13.0}


 70%|███████   | 1106/1580 [19:47<07:48,  1.01it/s]
 70%|███████   | 1106/1580 [19:52<07:48,  1.01it/s]

{'eval_loss': 34.15024185180664, 'eval_mse': 34.15024185180664, 'eval_mae': 4.455232620239258, 'eval_r2': 0.6213960690563576, 'eval_runtime': 5.0288, 'eval_samples_per_second': 53.89, 'eval_steps_per_second': 3.381, 'epoch': 14.0}


 75%|███████▌  | 1185/1580 [21:09<05:56,  1.11it/s]
 75%|███████▌  | 1185/1580 [21:14<05:56,  1.11it/s]

{'eval_loss': 32.23193359375, 'eval_mse': 32.23193359375, 'eval_mae': 4.370184898376465, 'eval_r2': 0.6426632605010975, 'eval_runtime': 5.0083, 'eval_samples_per_second': 54.11, 'eval_steps_per_second': 3.394, 'epoch': 15.0}


 80%|████████  | 1264/1580 [22:30<04:43,  1.11it/s]
 80%|████████  | 1264/1580 [22:35<04:43,  1.11it/s]

{'eval_loss': 30.633792877197266, 'eval_mse': 30.633792877197266, 'eval_mae': 4.301383972167969, 'eval_r2': 0.6603809445723332, 'eval_runtime': 4.993, 'eval_samples_per_second': 54.276, 'eval_steps_per_second': 3.405, 'epoch': 16.0}


 85%|████████▌ | 1343/1580 [23:55<03:59,  1.01s/it]
 85%|████████▌ | 1343/1580 [24:01<03:59,  1.01s/it]

{'eval_loss': 30.124427795410156, 'eval_mse': 30.12442970275879, 'eval_mae': 4.2514262199401855, 'eval_r2': 0.666027942458806, 'eval_runtime': 5.0814, 'eval_samples_per_second': 53.331, 'eval_steps_per_second': 3.346, 'epoch': 17.0}


 90%|█████████ | 1422/1580 [25:21<02:36,  1.01it/s]
 90%|█████████ | 1422/1580 [25:26<02:36,  1.01it/s]

{'eval_loss': 29.3746395111084, 'eval_mse': 29.3746395111084, 'eval_mae': 4.226375102996826, 'eval_r2': 0.6743404370820235, 'eval_runtime': 5.058, 'eval_samples_per_second': 53.579, 'eval_steps_per_second': 3.361, 'epoch': 18.0}


 95%|█████████▍| 1500/1580 [26:45<01:21,  1.02s/it]

{'loss': 26.7343, 'learning_rate': 1.0126582278481013e-06, 'epoch': 18.99}


 95%|█████████▌| 1501/1580 [26:45<01:18,  1.00it/s]
 95%|█████████▌| 1501/1580 [26:51<01:18,  1.00it/s]

{'eval_loss': 29.66636085510254, 'eval_mse': 29.66636085510254, 'eval_mae': 4.2612128257751465, 'eval_r2': 0.6711062645047519, 'eval_runtime': 5.0816, 'eval_samples_per_second': 53.33, 'eval_steps_per_second': 3.345, 'epoch': 19.0}


100%|██████████| 1580/1580 [28:10<00:00,  1.02s/it]
100%|██████████| 1580/1580 [28:16<00:00,  1.02s/it]

{'eval_loss': 29.271371841430664, 'eval_mse': 29.27137565612793, 'eval_mae': 4.232879638671875, 'eval_r2': 0.6754852565027011, 'eval_runtime': 5.0717, 'eval_samples_per_second': 53.433, 'eval_steps_per_second': 3.352, 'epoch': 20.0}


100%|██████████| 1580/1580 [28:17<00:00,  1.07s/it]


{'train_runtime': 1697.446, 'train_samples_per_second': 14.869, 'train_steps_per_second': 0.931, 'train_loss': 524.5957484764389, 'epoch': 20.0}


100%|██████████| 17/17 [00:04<00:00,  3.56it/s]


start working on fopenness


Map: 100%|██████████| 1804/1804 [00:02<00:00, 756.09 examples/s]
  5%|▌         | 79/1580 [01:20<24:19,  1.03it/s] 
  5%|▌         | 79/1580 [01:25<24:19,  1.03it/s]

{'eval_loss': 35.65291213989258, 'eval_mse': 35.65291213989258, 'eval_mae': 4.892955780029297, 'eval_r2': 0.01922750623351932, 'eval_runtime': 5.1013, 'eval_samples_per_second': 53.123, 'eval_steps_per_second': 3.332, 'epoch': 1.0}


 10%|█         | 158/1580 [02:47<24:42,  1.04s/it] 
 10%|█         | 158/1580 [02:52<24:42,  1.04s/it]

{'eval_loss': 17.5617618560791, 'eval_mse': 17.561758041381836, 'eval_mae': 3.260486364364624, 'eval_r2': 0.516895297801152, 'eval_runtime': 5.6007, 'eval_samples_per_second': 48.387, 'eval_steps_per_second': 3.035, 'epoch': 2.0}


 15%|█▌        | 237/1580 [04:17<21:52,  1.02it/s]  
 15%|█▌        | 237/1580 [04:23<21:52,  1.02it/s]

{'eval_loss': 18.47286605834961, 'eval_mse': 18.472862243652344, 'eval_mae': 3.42873215675354, 'eval_r2': 0.4918318058172545, 'eval_runtime': 5.0862, 'eval_samples_per_second': 53.282, 'eval_steps_per_second': 3.342, 'epoch': 3.0}


 20%|██        | 316/1580 [05:43<20:09,  1.04it/s]  
 20%|██        | 316/1580 [05:48<20:09,  1.04it/s]

{'eval_loss': 14.776266098022461, 'eval_mse': 14.776266098022461, 'eval_mae': 3.064649820327759, 'eval_r2': 0.5935211199150436, 'eval_runtime': 5.0503, 'eval_samples_per_second': 53.66, 'eval_steps_per_second': 3.366, 'epoch': 4.0}


 25%|██▌       | 395/1580 [07:08<19:01,  1.04it/s]  
 25%|██▌       | 395/1580 [07:13<19:01,  1.04it/s]

{'eval_loss': 14.902702331542969, 'eval_mse': 14.902702331542969, 'eval_mae': 3.1294126510620117, 'eval_r2': 0.5900430013747971, 'eval_runtime': 5.0661, 'eval_samples_per_second': 53.493, 'eval_steps_per_second': 3.356, 'epoch': 5.0}


 30%|███       | 474/1580 [08:32<17:40,  1.04it/s]
 30%|███       | 474/1580 [08:37<17:40,  1.04it/s]

{'eval_loss': 15.897226333618164, 'eval_mse': 15.897226333618164, 'eval_mae': 3.2061946392059326, 'eval_r2': 0.5626847288402312, 'eval_runtime': 5.0201, 'eval_samples_per_second': 53.983, 'eval_steps_per_second': 3.386, 'epoch': 6.0}


 32%|███▏      | 500/1580 [09:05<17:39,  1.02it/s]

{'loss': 17.9527, 'learning_rate': 1.3670886075949368e-05, 'epoch': 6.33}


 35%|███▌      | 553/1580 [09:58<16:46,  1.02it/s]
 35%|███▌      | 553/1580 [10:03<16:46,  1.02it/s]

{'eval_loss': 14.717012405395508, 'eval_mse': 14.717013359069824, 'eval_mae': 3.0796070098876953, 'eval_r2': 0.5951511369223602, 'eval_runtime': 5.0249, 'eval_samples_per_second': 53.932, 'eval_steps_per_second': 3.383, 'epoch': 7.0}


 40%|████      | 632/1580 [11:23<15:11,  1.04it/s]
 40%|████      | 632/1580 [11:28<15:11,  1.04it/s]

{'eval_loss': 14.408207893371582, 'eval_mse': 14.408207893371582, 'eval_mae': 3.087844133377075, 'eval_r2': 0.6036460056316852, 'eval_runtime': 5.0475, 'eval_samples_per_second': 53.69, 'eval_steps_per_second': 3.368, 'epoch': 8.0}


 45%|████▌     | 711/1580 [12:48<14:15,  1.02it/s]
 45%|████▌     | 711/1580 [12:53<14:15,  1.02it/s]

{'eval_loss': 13.79648494720459, 'eval_mse': 13.79648494720459, 'eval_mae': 2.994239091873169, 'eval_r2': 0.6204738381462976, 'eval_runtime': 5.0672, 'eval_samples_per_second': 53.481, 'eval_steps_per_second': 3.355, 'epoch': 9.0}


 50%|█████     | 790/1580 [14:20<13:46,  1.05s/it]
 50%|█████     | 790/1580 [14:26<13:46,  1.05s/it]

{'eval_loss': 14.360794067382812, 'eval_mse': 14.360795021057129, 'eval_mae': 3.0599372386932373, 'eval_r2': 0.6049502959171937, 'eval_runtime': 5.5895, 'eval_samples_per_second': 48.484, 'eval_steps_per_second': 3.041, 'epoch': 10.0}


 55%|█████▌    | 869/1580 [15:52<11:39,  1.02it/s]
 55%|█████▌    | 869/1580 [15:58<11:39,  1.02it/s]

{'eval_loss': 15.432473182678223, 'eval_mse': 15.432473182678223, 'eval_mae': 3.158289670944214, 'eval_r2': 0.5754696167942395, 'eval_runtime': 5.1814, 'eval_samples_per_second': 52.302, 'eval_steps_per_second': 3.281, 'epoch': 11.0}


 60%|██████    | 948/1580 [17:20<10:44,  1.02s/it]
 60%|██████    | 948/1580 [17:25<10:44,  1.02s/it]

{'eval_loss': 15.729812622070312, 'eval_mse': 15.72981071472168, 'eval_mae': 3.2121992111206055, 'eval_r2': 0.5672901534734571, 'eval_runtime': 5.1118, 'eval_samples_per_second': 53.014, 'eval_steps_per_second': 3.326, 'epoch': 12.0}


 63%|██████▎   | 1000/1580 [18:23<09:44,  1.01s/it]

{'loss': 4.8948, 'learning_rate': 7.341772151898735e-06, 'epoch': 12.66}


 65%|██████▌   | 1027/1580 [18:50<08:46,  1.05it/s]
 65%|██████▌   | 1027/1580 [18:55<08:46,  1.05it/s]

{'eval_loss': 14.385638236999512, 'eval_mse': 14.385637283325195, 'eval_mae': 3.067190647125244, 'eval_r2': 0.6042668874527017, 'eval_runtime': 5.1012, 'eval_samples_per_second': 53.125, 'eval_steps_per_second': 3.333, 'epoch': 13.0}


 70%|███████   | 1106/1580 [20:15<07:29,  1.05it/s]
 70%|███████   | 1106/1580 [20:20<07:29,  1.05it/s]

{'eval_loss': 13.92337417602539, 'eval_mse': 13.92337417602539, 'eval_mae': 2.994760751724243, 'eval_r2': 0.6169832727905227, 'eval_runtime': 5.0376, 'eval_samples_per_second': 53.796, 'eval_steps_per_second': 3.375, 'epoch': 14.0}


 75%|███████▌  | 1185/1580 [21:40<06:33,  1.00it/s]
 75%|███████▌  | 1185/1580 [21:45<06:33,  1.00it/s]

{'eval_loss': 13.882478713989258, 'eval_mse': 13.882477760314941, 'eval_mae': 2.9846689701080322, 'eval_r2': 0.6181082744344624, 'eval_runtime': 5.1083, 'eval_samples_per_second': 53.051, 'eval_steps_per_second': 3.328, 'epoch': 15.0}


 80%|████████  | 1264/1580 [23:09<05:33,  1.06s/it]
 80%|████████  | 1264/1580 [23:15<05:33,  1.06s/it]

{'eval_loss': 14.004043579101562, 'eval_mse': 14.004042625427246, 'eval_mae': 3.020711898803711, 'eval_r2': 0.6147641654369796, 'eval_runtime': 5.3837, 'eval_samples_per_second': 50.337, 'eval_steps_per_second': 3.158, 'epoch': 16.0}


 85%|████████▌ | 1343/1580 [24:36<03:53,  1.02it/s]
 85%|████████▌ | 1343/1580 [24:41<03:53,  1.02it/s]

{'eval_loss': 13.787270545959473, 'eval_mse': 13.787269592285156, 'eval_mae': 2.999082088470459, 'eval_r2': 0.6207273207834152, 'eval_runtime': 5.0832, 'eval_samples_per_second': 53.313, 'eval_steps_per_second': 3.344, 'epoch': 17.0}


 90%|█████████ | 1422/1580 [26:06<02:36,  1.01it/s]
 90%|█████████ | 1422/1580 [26:11<02:36,  1.01it/s]

{'eval_loss': 14.36164665222168, 'eval_mse': 14.361645698547363, 'eval_mae': 3.0614120960235596, 'eval_r2': 0.604926871078717, 'eval_runtime': 5.1821, 'eval_samples_per_second': 52.295, 'eval_steps_per_second': 3.281, 'epoch': 18.0}


 95%|█████████▍| 1500/1580 [27:33<01:20,  1.01s/it]

{'loss': 3.043, 'learning_rate': 1.0126582278481013e-06, 'epoch': 18.99}


 95%|█████████▌| 1501/1580 [27:34<01:17,  1.01it/s]
 95%|█████████▌| 1501/1580 [27:40<01:17,  1.01it/s]

{'eval_loss': 13.815346717834473, 'eval_mse': 13.815346717834473, 'eval_mae': 2.9978933334350586, 'eval_r2': 0.6199549779911773, 'eval_runtime': 5.2051, 'eval_samples_per_second': 52.065, 'eval_steps_per_second': 3.266, 'epoch': 19.0}


100%|██████████| 1580/1580 [29:02<00:00,  1.03it/s]
100%|██████████| 1580/1580 [29:07<00:00,  1.03it/s]

{'eval_loss': 14.051982879638672, 'eval_mse': 14.051981925964355, 'eval_mae': 3.030395030975342, 'eval_r2': 0.6134453940858648, 'eval_runtime': 5.1848, 'eval_samples_per_second': 52.268, 'eval_steps_per_second': 3.279, 'epoch': 20.0}


100%|██████████| 1580/1580 [29:09<00:00,  1.11s/it]


{'train_runtime': 1749.3671, 'train_samples_per_second': 14.428, 'train_steps_per_second': 0.903, 'train_loss': 8.321989363658277, 'epoch': 20.0}


100%|██████████| 17/17 [00:04<00:00,  3.48it/s]


start working on fneuroticism


Map: 100%|██████████| 1804/1804 [00:02<00:00, 724.31 examples/s]
  5%|▌         | 79/1580 [01:22<25:51,  1.03s/it] 
  5%|▌         | 79/1580 [01:27<25:51,  1.03s/it]

{'eval_loss': 47.670166015625, 'eval_mse': 47.670162200927734, 'eval_mae': 5.185153007507324, 'eval_r2': 0.3484757289203857, 'eval_runtime': 5.1963, 'eval_samples_per_second': 52.153, 'eval_steps_per_second': 3.272, 'epoch': 1.0}


 10%|█         | 158/1580 [02:50<23:53,  1.01s/it] 
 10%|█         | 158/1580 [02:55<23:53,  1.01s/it]

{'eval_loss': 40.17042541503906, 'eval_mse': 40.17042922973633, 'eval_mae': 4.861671447753906, 'eval_r2': 0.4509771487179408, 'eval_runtime': 5.1783, 'eval_samples_per_second': 52.334, 'eval_steps_per_second': 3.283, 'epoch': 2.0}


 15%|█▌        | 237/1580 [04:17<22:33,  1.01s/it]  
 15%|█▌        | 237/1580 [04:23<22:33,  1.01s/it]

{'eval_loss': 30.715272903442383, 'eval_mse': 30.71527671813965, 'eval_mae': 4.300620079040527, 'eval_r2': 0.5802039690904927, 'eval_runtime': 5.1968, 'eval_samples_per_second': 52.147, 'eval_steps_per_second': 3.271, 'epoch': 3.0}


 20%|██        | 316/1580 [05:45<21:02,  1.00it/s]  
 20%|██        | 316/1580 [05:51<21:02,  1.00it/s]

{'eval_loss': 29.794424057006836, 'eval_mse': 29.794424057006836, 'eval_mae': 4.114986896514893, 'eval_r2': 0.5927895304996431, 'eval_runtime': 5.2132, 'eval_samples_per_second': 51.983, 'eval_steps_per_second': 3.261, 'epoch': 4.0}


 25%|██▌       | 395/1580 [07:13<19:32,  1.01it/s]  
 25%|██▌       | 395/1580 [07:18<19:32,  1.01it/s]

{'eval_loss': 26.994029998779297, 'eval_mse': 26.994029998779297, 'eval_mae': 4.04832649230957, 'eval_r2': 0.6310634734890493, 'eval_runtime': 5.1199, 'eval_samples_per_second': 52.931, 'eval_steps_per_second': 3.32, 'epoch': 5.0}


 30%|███       | 474/1580 [08:35<17:24,  1.06it/s]
 30%|███       | 474/1580 [08:40<17:24,  1.06it/s]

{'eval_loss': 28.797073364257812, 'eval_mse': 28.797073364257812, 'eval_mae': 4.09811544418335, 'eval_r2': 0.6064206796237026, 'eval_runtime': 5.0237, 'eval_samples_per_second': 53.945, 'eval_steps_per_second': 3.384, 'epoch': 6.0}


 32%|███▏      | 500/1580 [09:06<16:55,  1.06it/s]

{'loss': 24.0345, 'learning_rate': 1.3670886075949368e-05, 'epoch': 6.33}


 35%|███▌      | 553/1580 [09:56<16:45,  1.02it/s]
 35%|███▌      | 553/1580 [10:01<16:45,  1.02it/s]

{'eval_loss': 32.478912353515625, 'eval_mse': 32.478912353515625, 'eval_mae': 4.385518550872803, 'eval_r2': 0.5560997169910968, 'eval_runtime': 5.0483, 'eval_samples_per_second': 53.682, 'eval_steps_per_second': 3.368, 'epoch': 7.0}


 40%|████      | 632/1580 [11:17<14:16,  1.11it/s]
 40%|████      | 632/1580 [11:22<14:16,  1.11it/s]

{'eval_loss': 27.400836944580078, 'eval_mse': 27.400836944580078, 'eval_mae': 3.9547832012176514, 'eval_r2': 0.6255035045407571, 'eval_runtime': 5.1079, 'eval_samples_per_second': 53.055, 'eval_steps_per_second': 3.328, 'epoch': 8.0}


 45%|████▌     | 711/1580 [12:38<13:00,  1.11it/s]
 45%|████▌     | 711/1580 [12:43<13:00,  1.11it/s]

{'eval_loss': 25.785564422607422, 'eval_mse': 25.785566329956055, 'eval_mae': 3.9133589267730713, 'eval_r2': 0.6475799514186513, 'eval_runtime': 5.031, 'eval_samples_per_second': 53.866, 'eval_steps_per_second': 3.379, 'epoch': 9.0}


 50%|█████     | 790/1580 [13:59<11:54,  1.11it/s]
 50%|█████     | 790/1580 [14:04<11:54,  1.11it/s]

{'eval_loss': 30.573314666748047, 'eval_mse': 30.573318481445312, 'eval_mae': 4.221816539764404, 'eval_r2': 0.582144170410194, 'eval_runtime': 5.0591, 'eval_samples_per_second': 53.566, 'eval_steps_per_second': 3.36, 'epoch': 10.0}


 55%|█████▌    | 869/1580 [15:20<10:39,  1.11it/s]
 55%|█████▌    | 869/1580 [15:25<10:39,  1.11it/s]

{'eval_loss': 28.551176071166992, 'eval_mse': 28.551176071166992, 'eval_mae': 4.270818710327148, 'eval_r2': 0.609781448116979, 'eval_runtime': 4.9862, 'eval_samples_per_second': 54.35, 'eval_steps_per_second': 3.409, 'epoch': 11.0}


 60%|██████    | 948/1580 [16:42<09:26,  1.12it/s]
 60%|██████    | 948/1580 [16:47<09:26,  1.12it/s]

{'eval_loss': 26.2995662689209, 'eval_mse': 26.2995662689209, 'eval_mae': 4.005341053009033, 'eval_r2': 0.6405549608469588, 'eval_runtime': 4.9943, 'eval_samples_per_second': 54.261, 'eval_steps_per_second': 3.404, 'epoch': 12.0}


 63%|██████▎   | 1000/1580 [17:37<09:00,  1.07it/s]

{'loss': 4.8704, 'learning_rate': 7.341772151898735e-06, 'epoch': 12.66}


 65%|██████▌   | 1027/1580 [18:02<08:18,  1.11it/s]
 65%|██████▌   | 1027/1580 [18:07<08:18,  1.11it/s]

{'eval_loss': 26.725996017456055, 'eval_mse': 26.725997924804688, 'eval_mae': 4.077067852020264, 'eval_r2': 0.6347267569085865, 'eval_runtime': 4.986, 'eval_samples_per_second': 54.352, 'eval_steps_per_second': 3.41, 'epoch': 13.0}


 70%|███████   | 1106/1580 [19:23<07:06,  1.11it/s]
 70%|███████   | 1106/1580 [19:28<07:06,  1.11it/s]

{'eval_loss': 27.180797576904297, 'eval_mse': 27.180797576904297, 'eval_mae': 4.035276889801025, 'eval_r2': 0.6285108724569155, 'eval_runtime': 5.032, 'eval_samples_per_second': 53.856, 'eval_steps_per_second': 3.378, 'epoch': 14.0}


 75%|███████▌  | 1185/1580 [20:44<05:56,  1.11it/s]
 75%|███████▌  | 1185/1580 [20:49<05:56,  1.11it/s]

{'eval_loss': 26.990859985351562, 'eval_mse': 26.990861892700195, 'eval_mae': 4.00455379486084, 'eval_r2': 0.6311068025984536, 'eval_runtime': 5.0545, 'eval_samples_per_second': 53.616, 'eval_steps_per_second': 3.363, 'epoch': 15.0}


 80%|████████  | 1264/1580 [22:05<04:46,  1.10it/s]
 80%|████████  | 1264/1580 [22:10<04:46,  1.10it/s]

{'eval_loss': 28.49349594116211, 'eval_mse': 28.49349594116211, 'eval_mae': 4.078741073608398, 'eval_r2': 0.6105697927047318, 'eval_runtime': 5.0085, 'eval_samples_per_second': 54.108, 'eval_steps_per_second': 3.394, 'epoch': 16.0}


 85%|████████▌ | 1343/1580 [23:26<03:37,  1.09it/s]
 85%|████████▌ | 1343/1580 [23:31<03:37,  1.09it/s]

{'eval_loss': 26.91554069519043, 'eval_mse': 26.915542602539062, 'eval_mae': 4.014644145965576, 'eval_r2': 0.6321362013353131, 'eval_runtime': 5.0015, 'eval_samples_per_second': 54.183, 'eval_steps_per_second': 3.399, 'epoch': 17.0}


 90%|█████████ | 1422/1580 [24:47<02:27,  1.07it/s]
 90%|█████████ | 1422/1580 [24:52<02:27,  1.07it/s]

{'eval_loss': 27.080747604370117, 'eval_mse': 27.080747604370117, 'eval_mae': 4.006688117980957, 'eval_r2': 0.6298782511340035, 'eval_runtime': 5.0134, 'eval_samples_per_second': 54.055, 'eval_steps_per_second': 3.391, 'epoch': 18.0}


 95%|█████████▍| 1500/1580 [26:07<01:16,  1.04it/s]

{'loss': 2.3563, 'learning_rate': 1.0126582278481013e-06, 'epoch': 18.99}


 95%|█████████▌| 1501/1580 [26:08<01:13,  1.07it/s]
 95%|█████████▌| 1501/1580 [26:13<01:13,  1.07it/s]

{'eval_loss': 27.04754066467285, 'eval_mse': 27.047542572021484, 'eval_mae': 4.002447605133057, 'eval_r2': 0.6303321208708141, 'eval_runtime': 5.0967, 'eval_samples_per_second': 53.171, 'eval_steps_per_second': 3.335, 'epoch': 19.0}


100%|██████████| 1580/1580 [27:29<00:00,  1.10it/s]
100%|██████████| 1580/1580 [27:34<00:00,  1.10it/s]

{'eval_loss': 26.646480560302734, 'eval_mse': 26.6464786529541, 'eval_mae': 3.9952738285064697, 'eval_r2': 0.6358135829396709, 'eval_runtime': 5.0345, 'eval_samples_per_second': 53.829, 'eval_steps_per_second': 3.377, 'epoch': 20.0}


100%|██████████| 1580/1580 [27:36<00:00,  1.05s/it]


{'train_runtime': 1656.2595, 'train_samples_per_second': 15.239, 'train_steps_per_second': 0.954, 'train_loss': 9.993685130831562, 'epoch': 20.0}


100%|██████████| 17/17 [00:04<00:00,  3.63it/s]


start working on fagreeableness


Map: 100%|██████████| 1804/1804 [00:02<00:00, 763.99 examples/s]
  5%|▌         | 79/1580 [01:16<23:41,  1.06it/s] 
  5%|▌         | 79/1580 [01:21<23:41,  1.06it/s]

{'eval_loss': 31.61092758178711, 'eval_mse': 31.61092758178711, 'eval_mae': 4.600525379180908, 'eval_r2': 0.5653215709070725, 'eval_runtime': 5.0083, 'eval_samples_per_second': 54.11, 'eval_steps_per_second': 3.394, 'epoch': 1.0}


 10%|█         | 158/1580 [02:37<22:17,  1.06it/s] 
 10%|█         | 158/1580 [02:42<22:17,  1.06it/s]

{'eval_loss': 25.9593563079834, 'eval_mse': 25.9593563079834, 'eval_mae': 4.244950294494629, 'eval_r2': 0.6430357609610896, 'eval_runtime': 5.0367, 'eval_samples_per_second': 53.805, 'eval_steps_per_second': 3.375, 'epoch': 2.0}


 15%|█▌        | 237/1580 [03:58<20:14,  1.11it/s]  
 15%|█▌        | 237/1580 [04:03<20:14,  1.11it/s]

{'eval_loss': 35.288997650146484, 'eval_mse': 35.288997650146484, 'eval_mae': 4.880640983581543, 'eval_r2': 0.5147448574937213, 'eval_runtime': 5.0518, 'eval_samples_per_second': 53.645, 'eval_steps_per_second': 3.365, 'epoch': 3.0}


 20%|██        | 316/1580 [05:20<19:01,  1.11it/s]  
 20%|██        | 316/1580 [05:25<19:01,  1.11it/s]

{'eval_loss': 24.443571090698242, 'eval_mse': 24.44356918334961, 'eval_mae': 4.063493728637695, 'eval_r2': 0.6638791381831188, 'eval_runtime': 5.0577, 'eval_samples_per_second': 53.582, 'eval_steps_per_second': 3.361, 'epoch': 4.0}


 25%|██▌       | 395/1580 [06:42<17:59,  1.10it/s]  
 25%|██▌       | 395/1580 [06:47<17:59,  1.10it/s]

{'eval_loss': 23.323583602905273, 'eval_mse': 23.32358741760254, 'eval_mae': 3.923997402191162, 'eval_r2': 0.6792799013700241, 'eval_runtime': 5.0413, 'eval_samples_per_second': 53.756, 'eval_steps_per_second': 3.372, 'epoch': 5.0}


 30%|███       | 474/1580 [08:03<16:39,  1.11it/s]  
 30%|███       | 474/1580 [08:08<16:39,  1.11it/s]

{'eval_loss': 24.26069450378418, 'eval_mse': 24.260696411132812, 'eval_mae': 4.078507423400879, 'eval_r2': 0.6663938116971054, 'eval_runtime': 5.0399, 'eval_samples_per_second': 53.771, 'eval_steps_per_second': 3.373, 'epoch': 6.0}


 32%|███▏      | 500/1580 [08:35<16:50,  1.07it/s]

{'loss': 25.7401, 'learning_rate': 1.3670886075949368e-05, 'epoch': 6.33}


 35%|███▌      | 553/1580 [09:25<15:25,  1.11it/s]
 35%|███▌      | 553/1580 [09:30<15:25,  1.11it/s]

{'eval_loss': 23.0474910736084, 'eval_mse': 23.04749298095703, 'eval_mae': 3.912661552429199, 'eval_r2': 0.6830764288298408, 'eval_runtime': 5.0225, 'eval_samples_per_second': 53.957, 'eval_steps_per_second': 3.385, 'epoch': 7.0}


 40%|████      | 632/1580 [10:47<14:14,  1.11it/s]
 40%|████      | 632/1580 [10:52<14:14,  1.11it/s]

{'eval_loss': 25.1740665435791, 'eval_mse': 25.17406463623047, 'eval_mae': 4.051014423370361, 'eval_r2': 0.6538341677707086, 'eval_runtime': 5.006, 'eval_samples_per_second': 54.135, 'eval_steps_per_second': 3.396, 'epoch': 8.0}


 45%|████▌     | 711/1580 [12:07<13:00,  1.11it/s]
 45%|████▌     | 711/1580 [12:12<13:00,  1.11it/s]

{'eval_loss': 26.473445892333984, 'eval_mse': 26.473445892333984, 'eval_mae': 4.134324073791504, 'eval_r2': 0.6359665619843603, 'eval_runtime': 4.9861, 'eval_samples_per_second': 54.351, 'eval_steps_per_second': 3.409, 'epoch': 9.0}


 50%|█████     | 790/1580 [13:29<11:54,  1.11it/s]
 50%|█████     | 790/1580 [13:34<11:54,  1.11it/s]

{'eval_loss': 26.128419876098633, 'eval_mse': 26.128419876098633, 'eval_mae': 4.124363899230957, 'eval_r2': 0.6407109410765506, 'eval_runtime': 5.0177, 'eval_samples_per_second': 54.009, 'eval_steps_per_second': 3.388, 'epoch': 10.0}


 55%|█████▌    | 869/1580 [14:49<10:54,  1.09it/s]
 55%|█████▌    | 869/1580 [14:55<10:54,  1.09it/s]

{'eval_loss': 24.135887145996094, 'eval_mse': 24.13588523864746, 'eval_mae': 3.930624485015869, 'eval_r2': 0.6681100873048094, 'eval_runtime': 5.234, 'eval_samples_per_second': 51.777, 'eval_steps_per_second': 3.248, 'epoch': 11.0}


 60%|██████    | 948/1580 [16:19<10:55,  1.04s/it]
 60%|██████    | 948/1580 [16:25<10:55,  1.04s/it]

{'eval_loss': 25.39498519897461, 'eval_mse': 25.39498519897461, 'eval_mae': 4.056507110595703, 'eval_r2': 0.6507963240185182, 'eval_runtime': 5.2453, 'eval_samples_per_second': 51.665, 'eval_steps_per_second': 3.241, 'epoch': 12.0}


 63%|██████▎   | 1000/1580 [17:23<12:41,  1.31s/it]

{'loss': 7.1657, 'learning_rate': 7.341772151898735e-06, 'epoch': 12.66}


 65%|██████▌   | 1027/1580 [17:52<09:44,  1.06s/it]
 65%|██████▌   | 1027/1580 [17:58<09:44,  1.06s/it]

{'eval_loss': 25.44738006591797, 'eval_mse': 25.44738006591797, 'eval_mae': 4.073889255523682, 'eval_r2': 0.650075831930891, 'eval_runtime': 5.6362, 'eval_samples_per_second': 48.082, 'eval_steps_per_second': 3.016, 'epoch': 13.0}


 70%|███████   | 1106/1580 [19:27<08:18,  1.05s/it]
 70%|███████   | 1106/1580 [19:32<08:18,  1.05s/it]

{'eval_loss': 25.058504104614258, 'eval_mse': 25.058504104614258, 'eval_mae': 4.019362926483154, 'eval_r2': 0.6554232482283654, 'eval_runtime': 5.3157, 'eval_samples_per_second': 50.981, 'eval_steps_per_second': 3.198, 'epoch': 14.0}


 75%|███████▌  | 1185/1580 [20:59<06:46,  1.03s/it]
 75%|███████▌  | 1185/1580 [21:04<06:46,  1.03s/it]

{'eval_loss': 24.875864028930664, 'eval_mse': 24.875865936279297, 'eval_mae': 4.000621795654297, 'eval_r2': 0.6579346586618198, 'eval_runtime': 5.4656, 'eval_samples_per_second': 49.583, 'eval_steps_per_second': 3.11, 'epoch': 15.0}


 80%|████████  | 1264/1580 [22:31<05:29,  1.04s/it]
 80%|████████  | 1264/1580 [22:37<05:29,  1.04s/it]

{'eval_loss': 24.249130249023438, 'eval_mse': 24.24913215637207, 'eval_mae': 3.985459566116333, 'eval_r2': 0.6665528303888654, 'eval_runtime': 5.4471, 'eval_samples_per_second': 49.751, 'eval_steps_per_second': 3.121, 'epoch': 16.0}


 85%|████████▌ | 1343/1580 [24:04<04:07,  1.04s/it]
 85%|████████▌ | 1343/1580 [24:10<04:07,  1.04s/it]

{'eval_loss': 22.352067947387695, 'eval_mse': 22.352067947387695, 'eval_mae': 3.763871669769287, 'eval_r2': 0.6926391535044554, 'eval_runtime': 5.7311, 'eval_samples_per_second': 47.286, 'eval_steps_per_second': 2.966, 'epoch': 17.0}


 90%|█████████ | 1422/1580 [25:40<02:54,  1.10s/it]
 90%|█████████ | 1422/1580 [25:46<02:54,  1.10s/it]

{'eval_loss': 23.11667251586914, 'eval_mse': 23.116674423217773, 'eval_mae': 3.8846781253814697, 'eval_r2': 0.682125149152688, 'eval_runtime': 5.6539, 'eval_samples_per_second': 47.931, 'eval_steps_per_second': 3.007, 'epoch': 18.0}


 95%|█████████▍| 1500/1580 [27:13<01:26,  1.08s/it]

{'loss': 4.8055, 'learning_rate': 1.0126582278481013e-06, 'epoch': 18.99}


 95%|█████████▌| 1501/1580 [27:14<01:22,  1.05s/it]
 95%|█████████▌| 1501/1580 [27:19<01:22,  1.05s/it]

{'eval_loss': 23.26560401916504, 'eval_mse': 23.265605926513672, 'eval_mae': 3.8768017292022705, 'eval_r2': 0.6800772059120358, 'eval_runtime': 5.534, 'eval_samples_per_second': 48.97, 'eval_steps_per_second': 3.072, 'epoch': 19.0}


100%|██████████| 1580/1580 [28:44<00:00,  1.06it/s]
100%|██████████| 1580/1580 [28:49<00:00,  1.06it/s]

{'eval_loss': 24.918832778930664, 'eval_mse': 24.91883087158203, 'eval_mae': 4.016841411590576, 'eval_r2': 0.6573438478734592, 'eval_runtime': 5.0558, 'eval_samples_per_second': 53.602, 'eval_steps_per_second': 3.362, 'epoch': 20.0}


100%|██████████| 1580/1580 [28:50<00:00,  1.10s/it]


{'train_runtime': 1730.886, 'train_samples_per_second': 14.582, 'train_steps_per_second': 0.913, 'train_loss': 12.148155366921726, 'epoch': 20.0}


100%|██████████| 17/17 [00:04<00:00,  3.57it/s]


start working on fextraversion


Map: 100%|██████████| 1804/1804 [00:02<00:00, 750.54 examples/s]
  5%|▌         | 79/1580 [01:18<23:47,  1.05it/s] 
  5%|▌         | 79/1580 [01:23<23:47,  1.05it/s]

{'eval_loss': 39.29164505004883, 'eval_mse': 39.29164123535156, 'eval_mae': 5.000274658203125, 'eval_r2': 0.6150519251182355, 'eval_runtime': 5.0972, 'eval_samples_per_second': 53.166, 'eval_steps_per_second': 3.335, 'epoch': 1.0}


 10%|█         | 158/1580 [02:41<21:53,  1.08it/s] 
 10%|█         | 158/1580 [02:46<21:53,  1.08it/s]

{'eval_loss': 35.97232437133789, 'eval_mse': 35.97232437133789, 'eval_mae': 4.774276256561279, 'eval_r2': 0.6475719225437034, 'eval_runtime': 5.2012, 'eval_samples_per_second': 52.104, 'eval_steps_per_second': 3.268, 'epoch': 2.0}


 15%|█▌        | 237/1580 [04:04<20:53,  1.07it/s]  
 15%|█▌        | 237/1580 [04:10<20:53,  1.07it/s]

{'eval_loss': 33.66543960571289, 'eval_mse': 33.66543960571289, 'eval_mae': 4.572603225708008, 'eval_r2': 0.6701729524571287, 'eval_runtime': 5.1233, 'eval_samples_per_second': 52.896, 'eval_steps_per_second': 3.318, 'epoch': 3.0}


 20%|██        | 316/1580 [05:27<19:07,  1.10it/s]  
 20%|██        | 316/1580 [05:32<19:07,  1.10it/s]

{'eval_loss': 31.28281021118164, 'eval_mse': 31.282814025878906, 'eval_mae': 4.37124490737915, 'eval_r2': 0.6935160449995379, 'eval_runtime': 5.0496, 'eval_samples_per_second': 53.668, 'eval_steps_per_second': 3.367, 'epoch': 4.0}


 25%|██▌       | 395/1580 [06:48<17:46,  1.11it/s]  
 25%|██▌       | 395/1580 [06:53<17:46,  1.11it/s]

{'eval_loss': 31.999845504760742, 'eval_mse': 31.999849319458008, 'eval_mae': 4.433535099029541, 'eval_r2': 0.6864911068637716, 'eval_runtime': 5.0138, 'eval_samples_per_second': 54.051, 'eval_steps_per_second': 3.391, 'epoch': 5.0}


 30%|███       | 474/1580 [08:10<16:38,  1.11it/s]
 30%|███       | 474/1580 [08:15<16:38,  1.11it/s]

{'eval_loss': 32.1438102722168, 'eval_mse': 32.1438102722168, 'eval_mae': 4.419611930847168, 'eval_r2': 0.6850806532262907, 'eval_runtime': 5.0046, 'eval_samples_per_second': 54.15, 'eval_steps_per_second': 3.397, 'epoch': 6.0}


 32%|███▏      | 500/1580 [08:41<16:55,  1.06it/s]

{'loss': 20.4731, 'learning_rate': 1.3670886075949368e-05, 'epoch': 6.33}


 35%|███▌      | 553/1580 [09:31<15:39,  1.09it/s]
 35%|███▌      | 553/1580 [09:36<15:39,  1.09it/s]

{'eval_loss': 32.24626541137695, 'eval_mse': 32.24625778198242, 'eval_mae': 4.405122756958008, 'eval_r2': 0.6840769317416477, 'eval_runtime': 5.0275, 'eval_samples_per_second': 53.904, 'eval_steps_per_second': 3.381, 'epoch': 7.0}


 40%|████      | 632/1580 [10:52<14:13,  1.11it/s]
 40%|████      | 632/1580 [10:57<14:13,  1.11it/s]

{'eval_loss': 32.436988830566406, 'eval_mse': 32.436981201171875, 'eval_mae': 4.422018051147461, 'eval_r2': 0.6822083976583381, 'eval_runtime': 5.0309, 'eval_samples_per_second': 53.867, 'eval_steps_per_second': 3.379, 'epoch': 8.0}


 45%|████▌     | 711/1580 [12:13<13:07,  1.10it/s]
 45%|████▌     | 711/1580 [12:18<13:07,  1.10it/s]

{'eval_loss': 33.72550964355469, 'eval_mse': 33.72550964355469, 'eval_mae': 4.595918655395508, 'eval_r2': 0.6695844457822145, 'eval_runtime': 5.0207, 'eval_samples_per_second': 53.976, 'eval_steps_per_second': 3.386, 'epoch': 9.0}


 50%|█████     | 790/1580 [14:03<15:53,  1.21s/it]
 50%|█████     | 790/1580 [14:08<15:53,  1.21s/it]

{'eval_loss': 31.877750396728516, 'eval_mse': 31.87774658203125, 'eval_mae': 4.3968706130981445, 'eval_r2': 0.6876873394968106, 'eval_runtime': 5.4506, 'eval_samples_per_second': 49.72, 'eval_steps_per_second': 3.119, 'epoch': 10.0}


 55%|█████▌    | 869/1580 [15:52<14:51,  1.25s/it]  
 55%|█████▌    | 869/1580 [15:58<14:51,  1.25s/it]

{'eval_loss': 30.720796585083008, 'eval_mse': 30.720792770385742, 'eval_mae': 4.296839714050293, 'eval_r2': 0.6990222382388247, 'eval_runtime': 5.8321, 'eval_samples_per_second': 46.467, 'eval_steps_per_second': 2.915, 'epoch': 11.0}


 60%|██████    | 948/1580 [17:33<10:01,  1.05it/s]
 60%|██████    | 948/1580 [17:38<10:01,  1.05it/s]

{'eval_loss': 31.500308990478516, 'eval_mse': 31.500308990478516, 'eval_mae': 4.338078022003174, 'eval_r2': 0.6913851720316964, 'eval_runtime': 5.1701, 'eval_samples_per_second': 52.416, 'eval_steps_per_second': 3.288, 'epoch': 12.0}


 63%|██████▎   | 1000/1580 [18:43<12:37,  1.31s/it]

{'loss': 6.2246, 'learning_rate': 7.341772151898735e-06, 'epoch': 12.66}


 65%|██████▌   | 1027/1580 [19:20<16:58,  1.84s/it]
 65%|██████▌   | 1027/1580 [19:25<16:58,  1.84s/it]

{'eval_loss': 31.35832977294922, 'eval_mse': 31.35832977294922, 'eval_mae': 4.359687805175781, 'eval_r2': 0.6927761909637401, 'eval_runtime': 5.4988, 'eval_samples_per_second': 49.284, 'eval_steps_per_second': 3.092, 'epoch': 13.0}


 70%|███████   | 1106/1580 [20:52<07:36,  1.04it/s]
 70%|███████   | 1106/1580 [20:57<07:36,  1.04it/s]

{'eval_loss': 31.711963653564453, 'eval_mse': 31.71196746826172, 'eval_mae': 4.393347263336182, 'eval_r2': 0.6893115427242718, 'eval_runtime': 5.1435, 'eval_samples_per_second': 52.688, 'eval_steps_per_second': 3.305, 'epoch': 14.0}


 75%|███████▌  | 1185/1580 [22:18<06:15,  1.05it/s]
 75%|███████▌  | 1185/1580 [22:23<06:15,  1.05it/s]

{'eval_loss': 31.27242088317871, 'eval_mse': 31.272424697875977, 'eval_mae': 4.317919731140137, 'eval_r2': 0.6936178222674163, 'eval_runtime': 5.1156, 'eval_samples_per_second': 52.975, 'eval_steps_per_second': 3.323, 'epoch': 15.0}


 80%|████████  | 1264/1580 [23:43<05:01,  1.05it/s]
 80%|████████  | 1264/1580 [23:49<05:01,  1.05it/s]

{'eval_loss': 30.99439239501953, 'eval_mse': 30.99439239501953, 'eval_mae': 4.320461750030518, 'eval_r2': 0.6963417407649478, 'eval_runtime': 5.0857, 'eval_samples_per_second': 53.287, 'eval_steps_per_second': 3.343, 'epoch': 16.0}


 85%|████████▌ | 1343/1580 [25:21<04:31,  1.15s/it]
 85%|████████▌ | 1343/1580 [25:27<04:31,  1.15s/it]

{'eval_loss': 30.64968490600586, 'eval_mse': 30.64968490600586, 'eval_mae': 4.320408344268799, 'eval_r2': 0.6997189316016947, 'eval_runtime': 5.2359, 'eval_samples_per_second': 51.758, 'eval_steps_per_second': 3.247, 'epoch': 17.0}


 90%|█████████ | 1422/1580 [26:58<02:38,  1.00s/it]
 90%|█████████ | 1422/1580 [27:03<02:38,  1.00s/it]

{'eval_loss': 30.796072006225586, 'eval_mse': 30.79606819152832, 'eval_mae': 4.310082912445068, 'eval_r2': 0.6982847618896597, 'eval_runtime': 5.2909, 'eval_samples_per_second': 51.22, 'eval_steps_per_second': 3.213, 'epoch': 18.0}


 95%|█████████▍| 1500/1580 [28:27<01:19,  1.00it/s]

{'loss': 4.4225, 'learning_rate': 1.0126582278481013e-06, 'epoch': 18.99}


 95%|█████████▌| 1501/1580 [28:28<01:17,  1.02it/s]
 95%|█████████▌| 1501/1580 [28:33<01:17,  1.02it/s]

{'eval_loss': 30.88636589050293, 'eval_mse': 30.88636589050293, 'eval_mae': 4.31040620803833, 'eval_r2': 0.6974000921869361, 'eval_runtime': 5.1452, 'eval_samples_per_second': 52.671, 'eval_steps_per_second': 3.304, 'epoch': 19.0}


100%|██████████| 1580/1580 [30:02<00:00,  1.11s/it]
100%|██████████| 1580/1580 [30:08<00:00,  1.11s/it]

{'eval_loss': 31.108734130859375, 'eval_mse': 31.10873794555664, 'eval_mae': 4.342731952667236, 'eval_r2': 0.6952215010735697, 'eval_runtime': 5.3228, 'eval_samples_per_second': 50.913, 'eval_steps_per_second': 3.194, 'epoch': 20.0}


100%|██████████| 1580/1580 [30:09<00:00,  1.15s/it]


{'train_runtime': 1809.8704, 'train_samples_per_second': 13.946, 'train_steps_per_second': 0.873, 'train_loss': 10.058879919897151, 'epoch': 20.0}


100%|██████████| 17/17 [00:04<00:00,  3.49it/s]


In [8]:
import pickle
import pandas as pd
keys = eval_ret.keys()
d = {}
for k in keys:
    d[k]=[]
d['trait'] = []

traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']
for focused_trait in traits:
        with open(f'../models/fine-tuned-regression-{focused_trait}-training-text-350/eval_result.pkl', 'rb') as fp:
                eval_ret = pickle.load(fp)
                for k in eval_ret.keys():
                        d[k].append(eval_ret[k])
                d['trait'].append(focused_trait)
        

In [10]:
pd.DataFrame(d).to_csv("eval_result_text_350.csv")

In [11]:
# compare previous vs new model
new_training = pd.DataFrame(d)
old_training = pd.read_csv("eval_result.csv")

In [19]:
col = ['trait','eval_r2']
compare_df = old_training[col].merge(new_training[col], on="trait",suffixes=("_120","_350"))
compare_df['abs_chg'] = compare_df['eval_r2_350']-compare_df['eval_r2_120']
compare_df['pct_chg'] = (compare_df['eval_r2_350']-compare_df['eval_r2_120'])/compare_df['eval_r2_120']*100

In [21]:
compare_df.to_csv("improvement.csv")

## batch inference

In [45]:
from collections import defaultdict
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# using disilroberta as a quick baseline model
# https://huggingface.co/docs/transformers/tasks/sequence_classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)


def preprocess_for_inference_dataset(examples):
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    return examples


all_user_text_df = pd.read_parquet("liwc_dataset_user_max_425.parquet")
ds_inference = Dataset.from_dict(all_user_text_df.to_dict('list'))
tokenized_ds_inference = ds_inference.map(preprocess_for_inference_dataset, remove_columns=['post_count','word_count','user'])

FileNotFoundError: [Errno 2] No such file or directory: 'liwc_dataset_user_max_425.parquet'

In [36]:
# loading the model previously trained
model_paths = [
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-agreeableness-training-text-350/checkpoint-1343",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-conscientiousness-training-text-350/checkpoint-1580",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-extraversion-training-text-350/checkpoint-1343",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-neuroticism-training-text-350/checkpoint-711",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-openness-training-text-350/checkpoint-1343"]
d = {}

for model_path in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trait = model_path.split(sep="/")[-2].split(sep="-")[-4]
    # arguments for Trainer
    test_args = TrainingArguments(
        output_dir = model_path,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 50,   
        dataloader_drop_last = False    
    )

    # init trainer
    trainer = Trainer(
        model = model, 
        args = test_args, 
        compute_metrics = compute_metrics_for_regression
    )

    predictions = trainer.predict(tokenized_ds_inference)

    predictions_df = pd.DataFrame(predictions.predictions)
    d[trait]=predictions_df[0].tolist()

100%|██████████| 286/286 [04:40<00:00,  1.02it/s]
100%|██████████| 286/286 [04:47<00:00,  1.01s/it]
100%|██████████| 286/286 [04:38<00:00,  1.03it/s]
100%|██████████| 286/286 [05:03<00:00,  1.06s/it]
100%|██████████| 286/286 [04:41<00:00,  1.02it/s]


In [37]:
predictions_df = pd.DataFrame(d)

In [38]:
predictions_df

Unnamed: 0,agreeableness,conscientiousness,extraversion,neuroticism,openness
0,62.584175,53.016537,62.961075,33.943050,48.623798
1,52.349983,54.205620,53.372646,53.894497,53.370544
2,64.413727,60.303745,63.131458,32.397472,55.495106
3,55.095215,45.208984,56.746407,36.411385,46.324245
4,38.849571,48.048588,44.902611,51.545254,50.926304
...,...,...,...,...,...
14282,71.958252,60.066216,69.318344,31.210674,45.827385
14283,62.504547,57.361919,62.434406,38.509449,44.573627
14284,59.323158,59.692585,68.001259,41.594982,45.992832
14285,55.709034,44.886978,54.742641,43.495834,44.475941


In [39]:
all_user_text_df

Unnamed: 0,user,text,post_count,word_count
0,ariannyceleste,Raidens Mom 👶🏽🐶🐶🧿 Founder @girlfriendbox @them...,25,264
1,sarahstage,Boy Mom 👶🏻👦🏻 Wife 💃🏻 & Online Fitness Coach 💪🏼...,25,412
2,beauty_nurse_elizabeth,Aesthetic RN-BSN •Injection Artist 🌹 •Natural ...,25,383
3,carlyrbel,One of a kind ✨ • @carlybelx • shop @carlyclub...,25,142
4,biolayne,PhD Nutrition Science 💍@hollytbaxter @carbondi...,25,249
...,...,...,...,...
14282,lianev,Business inquiries Jennifer@thejgoagency.com T...,25,396
14283,kuz,Vino @lakers inquiries@kylekuzma.com @puma Ath...,25,281
14284,tanamongeau,BLACK. LIVES. MATTER. LINK IN BIO TO HELP; @th...,25,379
14285,lilyachty,no stylist; Itâ€™s us!; no stylist; It’s us!;;...,10,72


In [40]:
user_text_prediction_df = all_user_text_df.merge(predictions_df,left_index=True, right_index=True)

In [41]:
user_text_prediction_df.to_csv("all_predictions_word_350.csv")

In [43]:
# Convert the 0-100 numeric score into 1-10 categories.
traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']
for trait in traits:
    user_text_prediction_df[f'{trait}_prediction'] = pd.cut(user_text_prediction_df[trait], 10, labels=[n for n in range(1, 11)])
#pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

In [46]:
columns = ['user','conscientiousness_prediction', 'openness_prediction', 'neuroticism_prediction','agreeableness_prediction', 'extraversion_prediction']
user_text_prediction_df.rename(columns={
    "conscientiousness_categorized":"conscientiousness_prediction",
    "openness_categorized":"openness_prediction",
    "neuroticism_categorized":"neuroticism_prediction",
    "agreeableness_categorized":"agreeableness_prediction",
    "extraversion_categorized":"extraversion_prediction"},inplace=True)


In [67]:
import os 
import pandas as pd

files = os.listdir("../source_data")
files.sort()

df_women_event = pd.read_csv(os.path.join("../source_data", files[0]), sep="\t")
df_buy_insta_accounts_add_on = pd.read_csv(os.path.join("../source_data", files[1]), sep="\t")
df_buy_biz = pd.read_csv(os.path.join("../source_data", files[2]), sep="\t")
df_buy_insta_accounts = pd.read_csv(os.path.join("../source_data", files[3]), sep="\t")
df_women_event_mined = pd.read_csv(os.path.join("../source_data", files[4]), sep="\t")

In [52]:
user_text_prediction_df

Unnamed: 0,user,text,post_count,word_count,agreeableness,conscientiousness,extraversion,neuroticism,openness,conscientiousness_prediction,openness_prediction,neuroticism_prediction,agreeableness_prediction,extraversion_prediction
0,ariannyceleste,Raidens Mom 👶🏽🐶🐶🧿 Founder @girlfriendbox @them...,25,264,62.584175,53.016537,62.961075,33.943050,48.623798,9,8,5,9,8
1,sarahstage,Boy Mom 👶🏻👦🏻 Wife 💃🏻 & Online Fitness Coach 💪🏼...,25,412,52.349983,54.205620,53.372646,53.894497,53.370544,9,9,9,7,6
2,beauty_nurse_elizabeth,Aesthetic RN-BSN •Injection Artist 🌹 •Natural ...,25,383,64.413727,60.303745,63.131458,32.397472,55.495106,10,9,5,9,8
3,carlyrbel,One of a kind ✨ • @carlybelx • shop @carlyclub...,25,142,55.095215,45.208984,56.746407,36.411385,46.324245,8,7,6,7,7
4,biolayne,PhD Nutrition Science 💍@hollytbaxter @carbondi...,25,249,38.849571,48.048588,44.902611,51.545254,50.926304,8,8,9,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14282,lianev,Business inquiries Jennifer@thejgoagency.com T...,25,396,71.958252,60.066216,69.318344,31.210674,45.827385,10,7,4,10,9
14283,kuz,Vino @lakers inquiries@kylekuzma.com @puma Ath...,25,281,62.504547,57.361919,62.434406,38.509449,44.573627,10,7,6,9,8
14284,tanamongeau,BLACK. LIVES. MATTER. LINK IN BIO TO HELP; @th...,25,379,59.323158,59.692585,68.001259,41.594982,45.992832,10,7,7,8,9
14285,lilyachty,no stylist; Itâ€™s us!; no stylist; It’s us!;;...,10,72,55.709034,44.886978,54.742641,43.495834,44.475941,7,7,7,7,7


In [53]:
df_women_event = df_women_event.merge(df_women_event[columns], left_on="username", right_on="user",how="left")

In [54]:
df_buy_insta_accounts_add_on=df_buy_insta_accounts_add_on.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [55]:
df_buy_biz=df_buy_biz.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [56]:
df_buy_insta_accounts = df_buy_insta_accounts.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [57]:
df_women_event_mined = df_women_event_mined.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [58]:
df_women_event.to_csv("df_women_event_prediction_350.csv",index=False)
df_buy_insta_accounts_add_on.to_csv("df_buy_insta_accounts_add_on_prediction_350.csv",index=False)
df_buy_biz.to_csv("df_buy_biz_prediction_350.csv",index=False)
df_buy_insta_accounts.to_csv("df_buy_insta_accounts_prediction_350.csv",index=False)
df_women_event_mined.to_csv("df_women_event_mined_prediction_350.csv",index=False)

## Do inference for users without posts/captions

In [9]:
import pandas as pd
post_df = pd.read_parquet("../concat_feature.parquet")

In [14]:
import os

files = os.listdir("../source_data")
files.sort()

df_women_event = pd.read_csv(os.path.join("../source_data", files[0]), sep="\t")
df_buy_insta_accounts_add_on = pd.read_csv(os.path.join("../source_data", files[1]), sep="\t")
df_buy_biz = pd.read_csv(os.path.join("../source_data", files[2]), sep="\t")
df_buy_insta_accounts = pd.read_csv(os.path.join("../source_data", files[3]), sep="\t")
df_women_event_mined = pd.read_csv(os.path.join("../source_data", files[4]), sep="\t")

In [35]:
df_women_event.columns

Index(['username', 'media_count', 'full_name', 'following_count',
       'follower_count', 'first_name', 'external_url', 'engagement_percent',
       'email', 'contact_phone_number', 'cityName', 'category', 'biography',
       'address_street', 'MostRecentPostDate', 'AvgLikes', 'AvgComments'],
      dtype='object')

In [42]:
missing_users_df = pd.concat([df_women_event[~df_women_event.username.isin(user_text_prediction_df.user)][['username', 'biography']],
           df_buy_insta_accounts_add_on[~df_buy_insta_accounts_add_on.username.isin(user_text_prediction_df.user)][['username', 'biography']],
           df_buy_biz[~df_buy_biz.username.isin(user_text_prediction_df.user)][['username', 'biography']],
           df_buy_insta_accounts[~df_buy_insta_accounts.username.isin(user_text_prediction_df.user)][['username', 'biography']],
           df_women_event_mined[~df_women_event_mined.username.isin(user_text_prediction_df.user)][['username', 'biography']]
           ])
missing_users_df.rename(columns={'biography':'text'}, inplace=True)
missing_users_df = missing_users_df.reset_index()

In [53]:
missing_users_df = missing_users_df.dropna()

In [55]:
from collections import defaultdict
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# using disilroberta as a quick baseline model
# https://huggingface.co/docs/transformers/tasks/sequence_classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)


def preprocess_for_inference_dataset(examples):
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    return examples


ds_inference = Dataset.from_pandas(missing_users_df[['text']])
tokenized_ds_inference = ds_inference.map(preprocess_for_inference_dataset)

Map: 100%|██████████| 6636/6636 [00:01<00:00, 5717.30 examples/s]


In [60]:
# loading the model previously trained
model_paths = [
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-agreeableness-training-text-350/checkpoint-1343",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-conscientiousness-training-text-350/checkpoint-1580",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-extraversion-training-text-350/checkpoint-1343",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-neuroticism-training-text-350/checkpoint-711",
    "/Users/bohaocao/Codebase/uw-ocean/3.liwc_based_ft/models_350words/fine-tuned-regression-openness-training-text-350/checkpoint-1343"]
d = {}

for model_path in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trait = model_path.split(sep="/")[-2].split(sep="-")[-4]
    # arguments for Trainer
    test_args = TrainingArguments(
        output_dir = model_path,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 50,   
        dataloader_drop_last = False    
    )

    # init trainer
    trainer = Trainer(
        model = model, 
        args = test_args, 
        compute_metrics = compute_metrics_for_regression
    )

    predictions = trainer.predict(tokenized_ds_inference)

    predictions_df = pd.DataFrame(predictions.predictions)
    d[trait]=predictions_df[0].tolist()

100%|██████████| 133/133 [02:08<00:00,  1.03it/s]
100%|██████████| 133/133 [02:04<00:00,  1.07it/s]
100%|██████████| 133/133 [02:07<00:00,  1.05it/s]
100%|██████████| 133/133 [02:05<00:00,  1.06it/s]
100%|██████████| 133/133 [02:08<00:00,  1.04it/s]


In [61]:
predictions_df = pd.DataFrame(d)

In [62]:
predictions_df

Unnamed: 0,agreeableness,conscientiousness,extraversion,neuroticism,openness
0,54.637936,48.007416,55.397968,34.497192,46.734192
1,52.568741,49.551327,54.891148,30.783730,46.234470
2,52.902840,47.403782,55.209686,35.656410,50.825588
3,53.144043,44.334667,55.492092,46.626801,47.678799
4,51.645348,43.946667,52.536003,27.142527,44.948498
...,...,...,...,...,...
6631,54.589378,42.740208,52.708797,31.985081,42.446053
6632,54.159027,38.006592,53.951298,27.847933,42.237133
6633,52.914593,36.122673,45.780075,28.005756,35.301163
6634,55.039314,49.554672,53.998188,36.343853,45.902111


In [63]:
missing_users_prediction_df = missing_users_df.merge(predictions_df,left_index=True, right_index=True)

In [66]:
missing_users_prediction_df

Unnamed: 0,index,username,text,agreeableness,conscientiousness,extraversion,neuroticism,openness,conscientiousness_prediction,openness_prediction,neuroticism_prediction,agreeableness_prediction,extraversion_prediction
0,4,houdaschimmel,Founder @shimmypetsalon @shimmyschimmel 🇲🇦|🇺🇸,54.637936,48.007416,55.397968,34.497192,46.734192,8,8,6,8,8
1,11,brookemasonphoto,"High Profile: Photo, Branding, Strategy Expert...",52.568741,49.551327,54.891148,30.783730,46.234470,8,8,5,7,8
2,20,jollymollydrip,👽CEO @jollymolly.custom 👾Modern Art Deep Minde...,52.902840,47.403782,55.209686,35.656410,50.825588,8,9,6,7,8
3,24,cassydy,"“Yes, that’s really how it’s spelled!”⁣⁣⁣⁣ Wit...",53.144043,44.334667,55.492092,46.626801,47.678799,7,8,9,7,8
4,26,katealuz,Los Angeles 🥰📍 Modelo internacional Vidente Te...,51.645348,43.946667,52.536003,27.142527,44.948498,7,8,4,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6630,10068,alli.mariant,💍Wife 🏃‍♂️🏃‍♂️🏃‍♀️Mommy of 3 💪🔥Team FitFire 💎D...,53.732468,45.528194,56.044453,28.728277,44.744968,8,8,5,7,8
6631,10069,stephanie_stack,Design-Creative Director.,54.589378,42.740208,52.708797,31.985081,42.446053,7,7,5,8,7
6632,10070,_goldendelicious,EMPOWERED WOMEN EMPOWER WOMEN Mother•Health Co...,54.159027,38.006592,53.951298,27.847933,42.237133,6,7,4,7,8
6633,10074,_unidegar,|SC🌴| NOLA⚜️| ATL 🍑 🇵🇦 @alignedstarsagency @do...,52.914593,36.122673,45.780075,28.005756,35.301163,6,6,4,7,6


In [65]:
# Convert the 0-100 numeric score into 1-10 categories.
traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']
for trait in traits:
    missing_users_prediction_df[f'{trait}_prediction'] = pd.cut(missing_users_prediction_df[trait], 10, labels=[n for n in range(1, 11)])


In [107]:
df_women_event = pd.read_csv("liwc_results_350/df_women_event_prediction_350.csv")
df_buy_insta_accounts_add_on = pd.read_csv("liwc_results_350/df_buy_insta_accounts_add_on_prediction_350.csv")
df_buy_biz = pd.read_csv("liwc_results_350/df_buy_biz_prediction_350.csv")
df_buy_insta_accounts = pd.read_csv("liwc_results_350/df_buy_insta_accounts_prediction_350.csv")
df_women_event_mined = pd.read_csv("liwc_results_350/df_women_event_mined_prediction_350.csv")

In [82]:
missing_users_prediction_df.rename(columns={
    "conscientiousness_categorized":"conscientiousness_prediction",
    "openness_categorized":"openness_prediction",
    "neuroticism_categorized":"neuroticism_prediction",
    "agreeableness_categorized":"agreeableness_prediction",
    "extraversion_categorized":"extraversion_prediction"},inplace=True)

In [99]:
df_women_event_missing_user = df_women_event[df_women_event['conscientiousness_prediction'].isna()][columns_2].merge(missing_users_prediction_df[columns], left_on="username", right_on="username",how="left")

In [105]:
df_women_event = pd.concat([df_women_event[~df_women_event['conscientiousness_prediction'].isna()], df_women_event_missing_user]).drop_duplicates()

In [106]:
df_women_event.to_csv("liwc_results_350/df_women_event_prediction_350_with_no_post_users.csv")

In [116]:
columns = ['username','conscientiousness_prediction', 'openness_prediction', 'neuroticism_prediction','agreeableness_prediction', 'extraversion_prediction'] 
columns_2 = ['username', 'media_count', 'full_name', 'following_count',
       'follower_count', 'first_name', 'external_url', 'engagement_percent',
       'email', 'contact_phone_number', 'cityName', 'category', 'biography',
       'address_street', 'MostRecentPostDate', 'AvgLikes', 'AvgComments',
       'user']


def operate(df):
    columns_2 = list(set(df.columns) - set(columns))
    columns_2.append("username")
    df_missing_user = df[df['conscientiousness_prediction'].isna()][columns_2].merge(missing_users_prediction_df[columns], left_on="username", right_on="username",how="left")
    df = pd.concat([
        df[~df['conscientiousness_prediction'].isna()],
        df_missing_user
    ]).drop_duplicates()
    
    return df

In [110]:
df_women_event = operate(df_women_event)
df_women_event.to_csv("liwc_results_350/df_women_event_prediction_350_with_no_post_users.csv")

In [117]:
df_buy_insta_accounts_add_on = operate(df_buy_insta_accounts_add_on)
df_buy_insta_accounts_add_on.to_csv("liwc_results_350/df_buy_insta_accounts_add_on_prediction_350_with_no_post_users.csv")

df_buy_biz = operate(df_buy_biz)
df_buy_biz.to_csv("liwc_results_350/df_buy_biz_prediction_350_with_no_post_users.csv")

df_buy_insta_accounts = operate(df_buy_insta_accounts)
df_buy_insta_accounts.to_csv("liwc_results_350/df_buy_insta_accounts_prediction_350_with_no_post_users.csv")

df_buy_insta_accounts = operate(df_women_event_mined)
df_women_event_mined.to_csv("liwc_results_350/df_women_event_mined_prediction_350_with_no_post_users.csv")