In [3]:
import fasttext as ft
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [4]:
df = pd.read_csv('../data/extrovert_introvert.csv', names=['author_id', 'post', 'extrovert'], header=0)
df.head(10)

Unnamed: 0,author_id,post,extrovert
0,t2_2hrxxs28,"I have a question, if you have no doctor, how'...",0
1,t2_2hrxxs28,butt to have Covid + your cycle. I'm not sure ...,0
2,t2_2hrxxs28,through different doctors. My situation sucks ...,0
3,t2_4pxpgwz,i thought it was about the pebbleyeet guy the ...,0
4,t2_4pxpgwz,…i always end up voting wrong even as crewmate...,0
5,t2_4pxpgwz,made me feel a lot better. ooh yikes half of t...,0
6,t2_4pxpgwz,"your mouth, you’d have a panic attack whenever...",0
7,t2_4pxpgwz,i didnt read the top half bc it was cropped of...,0
8,t2_4pxpgwz,"too hot? can’t do much, either strip nude or R...",0
9,t2_4pxpgwz,"otherwise, though, the “needing” masturbation ...",0


In [5]:
train, test = train_test_split(df, test_size=0.2)

In [6]:
with open('../data/test_fasttext/train.txt', 'w+', encoding='utf-8') as f:
    for index, row in train.iterrows():
        label = '__label__' + str(row['extrovert'])
        post = row['post']
        author_id = row['author_id']
        f.write(f'{label} {post} {author_id}\n')
    f.close()
    
with open('../data/test_fasttext/test.txt', 'w+', encoding='utf-8') as f:
    for index, row in test.iterrows():
        label = '__label__' + str(row['extrovert'])
        post = row['post']
        author_id = row['author_id']
        f.write(f'{label} {post} {author_id}\n')
    f.close()

In [7]:
train = False
# Running this takes 1-2 minutes, so im not adding model.bin to Git

if train:
    model = ft.train_supervised('../data/test_fasttext/train.txt',
                                epoch=10,
                                lr=0.1,
                                wordNgrams=2)
    model.save_model('../data/test_fasttext/model.bin')
else:
    model = ft.load_model('../data/test_fasttext/model.bin')

In [8]:
test_labels = []
predicted_labels = []

with open('../data/test_fasttext/test.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(' ')
        test_labels.append(int(parts[0].replace('__label__', '')))
        post_text = ' '.join(parts[1:-1])
        author_id = parts[-1]
        prediction = model.predict(f'{post_text} {author_id}')[0][0].replace('__label__', '')
        predicted_labels.append(int(prediction))

In [9]:
# out = precision_recall_fscore_support(test_labels, predicted_labels)

precision, recall, fscore, support = precision_recall_fscore_support(test_labels, predicted_labels)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.8098434  0.81707317]
recall: [0.98558616 0.21765024]
fscore: [0.88911363 0.34373664]
support: [6244 1847]


The values are too high to be true, gonna try something else!

### Trying some other papers
[from this lovely page](https://neptune.ai/blog/vectorization-techniques-in-nlp-guide)

In [10]:
df = pd.read_csv('../data/cleaned_extrovert.csv', engine='pyarrow')
train, test = train_test_split(df, train_size=0.8)

In [12]:
all_texts = train['post'].tolist()
all_labels = train['label'].tolist()
prep_datapoints = []
for i in range(len(all_texts)):
    sample = '__label__' + str(all_labels[i]) + ' '+ all_texts[i]
    prep_datapoints.append(sample)

In [13]:
with open('../data/test_fasttext/train_fasttext.txt', 'w', encoding='utf-8') as f:
    for datapoint in prep_datapoints:
        f.write(datapoint)
        f.write('n')
    f.close()

In [14]:
model = ft.train_supervised('../data/test_fasttext/train_fasttext.txt')

In [20]:
model.predict('i love dogs! and expressing my opinions online tot total strangers!')

(('__label__0',), array([1.00001001]))

In [25]:
model.predict(all_texts[-1])

(('__label__0',), array([1.00001001]))