In [22]:
import pandas as pd
import pathlib
import sys
from sklearn.model_selection import train_test_split
import numpy as np

# Project root = parent of the current working directory. Kept as a plain
# string because later cells build file paths from it by concatenation.
# Both the root and its src/ folder are made importable.
path = str(pathlib.Path().resolve().parent)
sys.path.extend([path, f"{path}/src/"])

from src.fasttext_model import (
    process_csv,
    train_model,
    get_model_accuracy,
    predict_model
)

In [11]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the merged tweet dataset; the first CSV column becomes the row index.
data = pd.read_csv(f"{path}/data/final_merged_dataset.csv", index_col=0)

In [4]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_df, val_df = train_test_split(
    data,
    test_size=0.1,
    random_state=1,
)

In [5]:
# Label balance of the validation split: for each label, the non-null count
# of every remaining column.
val_df.groupby(by='label').count()

Unnamed: 0_level_0,tweet
label,Unnamed: 1_level_1
0,1461
1,1740


In [6]:
# Pass each split through process_csv, targeting the project's data folder.
# The returned values are used below as the file paths for training and
# evaluation.
train_file_path = process_csv(train_df, 'train', f"{path}/data")
val_file_path = process_csv(val_df, 'val', f"{path}/data")

The available training parameters are documented here: https://fasttext.cc/docs/en/python-module.html#train_supervised-parameters

In [7]:
# Fit the model on the exported training file (lr=0.1, 20 epochs).
model = train_model(
    train_file_path,
    lr=0.1,
    epoch=20,
)

In [8]:
# Score the model on both splits and report the accuracies as percentages.
train_acc = get_model_accuracy(model, train_file_path)
val_acc = get_model_accuracy(model, val_file_path)
print(
    f"Training accuracy : {100*train_acc:.2f} %",
    f"Validation accuracy : {100*val_acc:.2f} %",
    sep="\n",
)

Training accuracy : 99.99 %
Validation accuracy : 89.25 %


Testing the model with hand-made examples :

In [13]:
# Hand-written probe: lab-origin claim.
predict_model(
    model,
    ['Covid-19 was created in a lab in China'],
)

[0]

In [44]:
# Hand-written probe: neutral case-count report.
predict_model(
    model,
    ['Covid-19 : 10000 new cases today'],
)

[1]

In [45]:
# Hand-written probe: a statement attributed to a public figure.
predict_model(
    model,
    ['Donald Trump said : "Covid-19 has caused many deaths so far"'],
)

[0]

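Interesting! The quoted sentence reads as a plain factual statement, yet the model assigns it label 0 — the same label it gave the lab-origin claim above.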

In [16]:
# Hand-written probe: 5G-chip claim.
predict_model(
    model,
    ['Bill Gates has just launched his 5G covid chips program'],
)

[0]

In [24]:
# Pull the validation tweets and their labels out as plain lists, then run
# the model over every validation tweet in one call.
val_text, val_labels = (
    list(val_df[column].values) for column in ('tweet', 'label')
)
val_pred = predict_model(model, val_text)

In [37]:
# Element-wise difference between true and predicted labels; any non-zero
# entry marks a misclassified validation example.
diff = np.subtract(np.array(val_labels), np.array(val_pred))
# Positions (within the validation set) where prediction and label disagree.
idxs = np.nonzero(diff != 0)[0]

First ten misclassified validation examples:

In [41]:
# Show at most the first ten misclassified validation tweets.
# Slicing `idxs` (rather than indexing it with range(10)) avoids an
# IndexError when fewer than ten examples are misclassified.
for k, idx in enumerate(idxs[:10]):
    # `idx` is a positional index into val_df, hence .iloc.
    print(f"Tweet {k+1} \n: {val_df.iloc[idx]['tweet']}")

Tweet 1 
: CORONAVIRUS BY THE NUMBERS IN MICHIGAN(*as of 4pm May 1):
77 NEW DEATHS LINKED TO COVID-19.
977 NEWLY CONFIRMED CASES STATEWIDE.
42,356-TOTAL CONFIRMED  CASES.
3,866 TOTAL COVID-19 DEATHS. https://t.co/vboEDKtqY8
Tweet 2 
: The consumption of pig causes COVID-19 entry into Indonesia.
Tweet 3 
: #ZuckOff #TakeItFromADoctor and a video with false claims on mask wearing and #hydroxychloroquine. https://t.co/0WH13B5iqT https://t.co/04eETBikNd
Tweet 4 
: Seriously??? This was 3 yrs ago and we all have seen this already. And V said his fave hyung is jin. So dont you ever spread rumors about the relationship of jin and tae..  Soo stop spreading news like as if it was new. Screw you koreaboo https://t.co/2YL2mHsRdD
Tweet 5 
: @geoallison Today's official figures.

A recent BMJ review used a False Positive rate of 5%.

Public Health England came up with slightly under 2%.

A German study found false positive rates rose from 1.4% to 2.2%-7.6% when a Hcov was present (alpha and betacor