In [8]:
from datasets import Dataset
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
import torch
import pickle

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
def preprocess(examples, focused_trait="conscientiousness"):
    label = examples[focused_trait]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    examples['label'] = float(label)
    return examples



def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)    
    
    return {"mse": mse, "mae": mae, "r2": r2}



class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [None]:
traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']

# using disilroberta as a quick baseline model
# https://huggingface.co/docs/transformers/tasks/sequence_classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)


for focused_trait in traits:
    print(f"start working on f{focused_trait}")

    
    training_feature_df = pd.read_parquet("liwc_training_dataset.parquet")
    ds = Dataset.from_dict(training_feature_df.to_dict('list'))
    tokenized_ds = ds.map(preprocess, fn_kwargs={"focused_trait": focused_trait}, remove_columns=['post_count','word_count','user'])
    train_test = tokenized_ds.train_test_split(test_size=0.3, seed=42)
    test_eval = train_test['test'].train_test_split(test_size=0.5, seed=42)


    LEARNING_RATE = 2e-5
    MAX_LENGTH = 256
    BATCH_SIZE = 16
    EPOCHS = 20

    training_args = TrainingArguments(
        output_dir=f"../models/fine-tuned-regression-{focused_trait}",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        metric_for_best_model="r2",
        load_best_model_at_end=True,
        weight_decay=0.01,
    )


    trainer = RegressionTrainer(
        model=model,
        args=training_args,
        train_dataset=train_test["train"],
        eval_dataset=test_eval["train"],
        compute_metrics=compute_metrics_for_regression,
    )

    trainer.train()

    trainer.eval_dataset=test_eval["test"]
    eval_ret = trainer.evaluate()

    # save dictionary to person_data.pkl file
    with open(f'../models/fine-tuned-regression-{focused_trait}/eval_result.pkl', 'wb') as fp:
        pickle.dump(eval_ret, fp)

In [20]:
import pickle
import pandas as pd
keys = eval_ret.keys()
d = {}
for k in keys:
    d[k]=[]
d['trait'] = []

traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']
for focused_trait in traits:
        with open(f'../models/fine-tuned-regression-{focused_trait}/eval_result.pkl', 'rb') as fp:
                eval_ret = pickle.load(fp)
                for k in eval_ret.keys():
                        d[k].append(eval_ret[k])
                d['trait'].append(focused_trait)
        

In [22]:
pd.DataFrame(d).to_csv("eval_result.csv")

In [3]:
from collections import defaultdict
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# using disilroberta as a quick baseline model
# https://huggingface.co/docs/transformers/tasks/sequence_classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)


def preprocess_for_inference_dataset(examples):
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    return examples


all_user_text_df = pd.read_parquet("liwc_dataset_all_16_word_count_per_user_max_120.parquet")
ds_inference = Dataset.from_dict(all_user_text_df.to_dict('list'))
tokenized_ds_inference = ds_inference.map(preprocess_for_inference_dataset, remove_columns=['post_count','word_count','user'])

Map: 100%|██████████| 14287/14287 [00:04<00:00, 2875.65 examples/s]


In [61]:
# loading the model previously trained
model_paths = [
    "/Users/bohaocao/Codebase/uw-ocean/models/fine-tuned-regression-agreeableness/checkpoint-1300",
    "/Users/bohaocao/Codebase/uw-ocean/models/fine-tuned-regression-conscientiousness/checkpoint-1400",
    "/Users/bohaocao/Codebase/uw-ocean/models/fine-tuned-regression-extraversion/checkpoint-1200",
    "/Users/bohaocao/Codebase/uw-ocean/models/fine-tuned-regression-neuroticism/checkpoint-1600",
    "/Users/bohaocao/Codebase/uw-ocean/models/fine-tuned-regression-openness/checkpoint-1000"]
d = {}

for model_path in model_paths[:1]:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trait = model_path.split(sep="/")[-2].split(sep="-")[-1]
    # arguments for Trainer
    test_args = TrainingArguments(
        output_dir = model_path,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 50,   
        dataloader_drop_last = False    
    )

    # init trainer
    trainer = Trainer(
        model = model, 
        args = test_args, 
        compute_metrics = compute_metrics_for_regression
    )

    predictions = trainer.predict(tokenized_ds_inference)

    predictions_df = pd.DataFrame(predictions.predictions)
    d[trait]=predictions_df[0].tolist()

100%|██████████| 286/286 [04:26<00:00,  1.07it/s]


In [62]:
predictions_df = pd.DataFrame(d)

In [63]:
predictions_df

Unnamed: 0,agreeableness
0,60.239491
1,65.638634
2,56.532459
3,50.052540
4,38.488125
...,...
14282,62.034092
14283,53.700184
14284,59.968418
14285,55.035839


In [64]:
user_text_prediction_df = user_text_prediction_df.merge(predictions_df,left_index=True, right_index=True)

In [59]:
user_text_prediction_df = all_user_text_df.merge(all_predictions_df,left_index=True, right_index=True)

In [66]:
user_text_prediction_df.to_csv("all_predictions.csv")

In [74]:
# Convert the 0-100 numeric score into 1-10 categories.
traits = ['conscientiousness', 'openness', 'neuroticism','agreeableness', 'extraversion']
for trait in traits:
    user_text_prediction_df[f'{trait}_prediction'] = pd.cut(user_text_prediction_df[trait], 10, labels=[n for n in range(1, 11)])
#pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

In [83]:
user_text_prediction_df

Unnamed: 0,user,text,post_count,word_count,conscientiousness,extraversion,neuroticism,openness,agreeableness,conscientiousness_prediction,openness_prediction,neuroticism_prediction,agreeableness_prediction,extraversion_prediction
0,ariannyceleste,Raidens Mom 👶🏽🐶🐶🧿 Founder @girlfriendbox @them...,25,105,57.999210,53.876961,37.427265,44.819927,60.239491,10,6,5,8,6
1,sarahstage,Boy Mom 👶🏻👦🏻 Wife 💃🏻 & Online Fitness Coach 💪🏼...,25,81,54.965519,65.441742,34.057461,47.052925,65.638634,9,6,4,9,8
2,beauty_nurse_elizabeth,Aesthetic RN-BSN •Injection Artist 🌹 •Natural ...,25,54,50.392796,55.129974,30.115822,51.914841,56.532459,8,8,3,7,6
3,carlyrbel,One of a kind ✨ • @carlybelx • shop @carlyclub...,25,104,45.541168,47.300446,39.236385,41.314716,50.052540,7,4,5,6,4
4,biolayne,PhD Nutrition Science 💍@hollytbaxter @carbondi...,25,31,38.028465,46.601849,42.697262,43.083668,38.488125,4,5,6,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14282,lianev,Business inquiries Jennifer@thejgoagency.com T...,25,92,38.893402,61.137104,28.089752,45.107208,62.034092,5,6,3,8,7
14283,kuz,Vino @lakers inquiries@kylekuzma.com @puma Ath...,25,95,48.400639,54.234875,36.810524,43.153072,53.700184,7,5,5,6,6
14284,tanamongeau,BLACK. LIVES. MATTER. LINK IN BIO TO HELP; @th...,25,101,53.021297,61.984756,37.032635,45.033176,59.968418,9,6,5,8,7
14285,lilyachty,no stylist; Itâ€™s us!; no stylist; It’s us!;;...,10,73,39.469841,52.406597,29.893835,37.300476,55.035839,5,3,3,7,5


In [101]:
import os 
import pandas as pd

files = os.listdir("../data")
files.sort()

df_women_event = pd.read_csv(os.path.join("../data", files[0]), sep="\t")
df_buy_insta_accounts_add_on = pd.read_csv(os.path.join("../data", files[1]), sep="\t")
df_buy_biz = pd.read_csv(os.path.join("../data", files[2]), sep="\t")
df_buy_insta_accounts = pd.read_csv(os.path.join("../data", files[3]), sep="\t")
df_women_event_mined = pd.read_csv(os.path.join("../data", files[4]), sep="\t")

In [82]:
columns = ['user','conscientiousness_prediction', 'openness_prediction', 'neuroticism_prediction','agreeableness_prediction', 'extraversion_prediction']
user_text_prediction_df.rename(columns={
    "conscientiousness_categorized":"conscientiousness_prediction",
    "openness_categorized":"openness_prediction",
    "neuroticism_categorized":"neuroticism_prediction",
    "agreeableness_categorized":"agreeableness_prediction",
    "extraversion_categorized":"extraversion_prediction"},inplace=True)


TypeError: DataFrame.merge() missing 1 required positional argument: 'right'

In [102]:
user_text_prediction_df.drop_duplicates(inplace=True)

In [103]:
df_women_event = df_women_event.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [104]:
df_buy_insta_accounts_add_on=df_buy_insta_accounts_add_on.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [106]:
df_buy_biz=df_buy_biz.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [107]:
df_buy_biz

Unnamed: 0,username,Openness,Conscientiousness,Extroversion,Agreeableness,Neuroticism,Type,media_count,full_name,following_count,...,address_street,MostRecentPostDate,AvgLikes,AvgComments,user,conscientiousness_prediction,openness_prediction,neuroticism_prediction,agreeableness_prediction,extraversion_prediction
0,goodeatzco,Low,High,High,Vague,Vague,Blog,1693,#GoodEatzCo | Andy,145,...,,2022-01-04 20:52:16,5407.11,41.17,goodeatzco,4,3,3,6,3
1,lexxalynn,,,,,,,241,Alexa Ditchburn,999,...,,,,,,,,,,
2,a1republic,High,High,High,High,Low,Business,30,A1 REPUBLIC ™,613,...,,2021-03-30 19:31:57,5859.94,73.22,a1republic,9,6,3,7,8
3,keychainsocial,High,High,High,High,Low,Business,214,"Keychain Social, Inc.",0,...,,2020-01-22 9:56:45,19273.17,,keychainsocial,7,8,6,6,6
4,thegiftshoppe.nft,Vague,Low,Low,Vague,Vague,Business,9,the gift shoppe,5,...,,2021-02-16 23:17:40,1451.00,55.89,thegiftshoppe.nft,3,2,3,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,irose.social,High,High,High,Low,Low,Business,338,IROSE,422,...,,2022-01-03 22:26:32,24.33,7.67,irose.social,9,6,3,7,8
140,nattcity,High,Low,High,Low,Low,Personal,1129,nathalie,686,...,,2022-01-03 22:13:06,535.39,51.17,nattcity,10,7,4,8,8
141,jondennill,High,High,High,High,Low,Personal,124,Jonathan Dennill,733,...,,2021-12-26 18:30:05,47.17,2.94,,,,,,
142,igmodels_co,Low,High,Vague,Vague,Vague,Blog,458,IG Models Worldwide ®,158,...,,2021-09-26 13:33:30,324.56,1.72,,,,,,


In [108]:
df_buy_insta_accounts = df_buy_insta_accounts.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [109]:
df_women_event_mined = df_women_event_mined.merge(user_text_prediction_df[columns], left_on="username", right_on="user",how="left")

In [111]:
df_women_event.to_csv("df_women_event_prediction.csv",index=False)
df_buy_insta_accounts_add_on.to_csv("df_buy_insta_accounts_add_on_prediction.csv",index=False)
df_buy_biz.to_csv("df_buy_biz_prediction.csv",index=False)
df_buy_insta_accounts.to_csv("df_buy_insta_accounts_prediction.csv",index=False)
df_women_event_mined.to_csv("df_women_event_mined_prediction.csv",index=False)
