In [1]:
import pandas as pd
import pathlib
import sys
from sklearn.model_selection import train_test_split
import numpy as np

# Parent directory of the current working directory, kept as a plain string
# because later cells build file paths with `path + '/...'`.
# Assumed to be the project root (it is expected to contain `src/` and `data/`).
path = str(pathlib.Path().resolve().parent)

# Make the project root and its `src/` folder importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
for entry in (path, path+'/src/'):
    if entry not in sys.path:
        sys.path.append(entry)

from src.fasttext_model import (
    process_csv,
    train_model,
    get_model_accuracy,
    predict_model,
    load_model_from_path
)
from src.scraper import get_today_tweets

In [2]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution so edits to the project's `src` modules
# are picked up without restarting the kernel.
# NOTE(review): this cell runs after the imports in the first cell — confirm
# that ordering is intended.
%load_ext autoreload
%autoreload 2

In [3]:
# Read the merged dataset; the first CSV column becomes the DataFrame index.
dataset_csv = path + '/data/final_merged_dataset.csv'
data = pd.read_csv(dataset_csv, index_col=0)

In [4]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train_df, val_df = train_test_split(
    data,
    test_size=0.1,
    random_state=1,
)

In [5]:
# Count the non-null entries of every column per label in the validation split.
label_groups = val_df.groupby('label')
label_groups.count()

Unnamed: 0_level_0,tweet
label,Unnamed: 1_level_1
0,1461
1,1740


In [6]:
# Run both splits through process_csv, targeting the project's data folder.
# The returned values are used below as the training / validation file paths
# (presumably fastText-formatted text files — confirm in src.fasttext_model).
data_dir = path + '/data'
train_file_path = process_csv(train_df, 'train', data_dir)
val_file_path = process_csv(val_df, 'val', data_dir)

See the fastText documentation for the available `train_supervised` parameters: https://fasttext.cc/docs/en/python-module.html#train_supervised-parameters

In [9]:
# Hyper-parameters forwarded by keyword to train_model.
fasttext_params = {
    "lr": 0.1,
    "epoch": 10,
    "wordNgrams": 3,
}
model = train_model(train_file_path, **fasttext_params)

In [12]:
# Score the model on the training file first, then on the validation file.
train_acc, val_acc = (
    get_model_accuracy(model, split_path)
    for split_path in (train_file_path, val_file_path)
)
# Report both accuracies as percentages with two decimals, one per line.
print(
    f"Training accuracy : {100*train_acc:.2f} %",
    f"Validation accuracy : {100*val_acc:.2f} %",
    sep="\n",
)

Training accuracy : 100.00 %
Validation accuracy : 89.85 %


In [9]:
# Make sure the output directory exists before saving: on a fresh checkout the
# `models/` folder may be missing, and the save call is not expected to create
# it (NOTE(review): confirm against the fastText `save_model` behaviour).
models_dir = pathlib.Path(path) / "models"
models_dir.mkdir(parents=True, exist_ok=True)

# Persist the trained model so it can be reloaded without retraining.
model.save_model(path + "/models/fasttext.bin")

In [11]:
# Reload the saved model from disk; `model` now refers to the loaded copy.
model_file = path + "/models/fasttext.bin"
model = load_model_from_path(model_file)



Testing the model with hand-made examples :

In [11]:
# Hand-written example: a claim about the origin of the virus.
lab_origin_claim = 'Covid-19 was created in a lab in China'
predict_model(model, [lab_origin_claim])

[0]

In [12]:
# Hand-written example: a plain case-count report.
case_count_report = ['Covid-19 : 10000 new cases today']
predict_model(model, case_count_report)

[1]

In [40]:
# Hand-written example: a factual-sounding statement attributed to a public figure.
quoted_statement = 'Donald Trump said : "Covid-19 has caused many deaths so far"'
predict_model(model, [quoted_statement])

[0]

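Interesting! This neutral, factual-sounding statement gets the same prediction (0) as the lab-origin claim above.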

In [41]:
# Hand-written example: a 5G / microchip conspiracy claim.
chip_claim = ['Bill Gates has just launched his 5G covid chips program']
predict_model(model, chip_claim)

[0]

In [14]:
# Fetch up to 10 of today's tweets for the "COVID19" query and classify each one.
today_tweets = get_today_tweets(
    "COVID19",
    max_results=10
    )
for tweet_data in today_tweets:
    # Each item is indexed by 'created_at' and 'text'. The record and its text
    # get separate names so the loop variable is not rebound from a record to
    # a string halfway through the body.
    print(f"Created at : {tweet_data['created_at']}\n")
    tweet = tweet_data['text']
    print(f"Tweet : {tweet}")
    # predict_model is called with a list, so the single tweet is wrapped in one.
    print(f"Prediction : {predict_model(model,[tweet])}")

Created at : 2022-03-24T07:25:40.000Z

Tweet : The truth is: Fort Detrick is the place where the COVID-19 originated.
 #COVID19
#COVID-19
#COVID
#Omicron
https://t.co/yevZ6Osm6j
Prediction : [1]
Created at : 2022-03-24T07:25:24.000Z

Tweet : The truth is: Fort Detrick is the place where the COVID-19 originated.
  #COVID19
#COVID-19
#COVID
#Omicron
https://t.co/hK4skRx6OL
Prediction : [1]
Created at : 2022-03-24T07:25:16.000Z

Tweet : Thursday is finally here. Not long until the weekend! Take care and stay safe. 😷🐼
#WearAMask #coronavirus #covid19 #ThursdayThoughts #PandemicPanda https://t.co/hyiMbiUsiD
Prediction : [0]
Created at : 2022-03-24T07:25:15.000Z

Tweet : Beyond a #covid19 strategy, enjoying the outdoors is a prescription that works, naturally! https://t.co/gZvv0XhGqw
Prediction : [0]
Created at : 2022-03-24T07:25:11.000Z

Tweet : One person with active, untreated #TB can spread the disease to 10-15 people in one year. As resources are diverted to fight #COVID19, more of thes

In [16]:
# Pull the tweets and their labels out of the validation frame as plain lists,
# then run the model over every validation tweet in a single call.
tweet_column = val_df['tweet']
label_column = val_df['label']
val_text = list(tweet_column.values)
val_labels = list(label_column.values)
val_pred = predict_model(model, val_text)

In [17]:
# Element-wise difference between true labels and predictions; every position
# where it is non-zero is a misclassified validation example.
diff = np.subtract(np.array(val_labels), np.array(val_pred))
idxs = np.nonzero(diff != 0)[0]

Misclassified examples :

In [19]:
# Show at most the first 10 misclassified validation tweets with their true
# label and the model's prediction. Slicing `idxs` (rather than indexing it
# over a fixed range(10)) avoids an IndexError when the model makes fewer than
# 10 mistakes.
for k, idx in enumerate(idxs[:10]):
    # `idx` is a positional index into the validation split, hence `.iloc`.
    print(f"Tweet {k+1} \n: {val_df.iloc[idx]['tweet']}")
    print(f"Original label : {val_labels[idx]}")
    print(f"Model prediction : {val_pred[idx]}")
    print("---------------------------------------------")

Tweet 1 
: CORONAVIRUS BY THE NUMBERS IN MICHIGAN(*as of 4pm May 1):
77 NEW DEATHS LINKED TO COVID-19.
977 NEWLY CONFIRMED CASES STATEWIDE.
42,356-TOTAL CONFIRMED  CASES.
3,866 TOTAL COVID-19 DEATHS. https://t.co/vboEDKtqY8
Original label : 1
Model prediction : 0
---------------------------------------------
Tweet 2 
: The consumption of pig causes COVID-19 entry into Indonesia.
Original label : 0
Model prediction : 1
---------------------------------------------
Tweet 3 
: #ZuckOff #TakeItFromADoctor and a video with false claims on mask wearing and #hydroxychloroquine. https://t.co/0WH13B5iqT https://t.co/04eETBikNd
Original label : 1
Model prediction : 0
---------------------------------------------
Tweet 4 
: Seriously??? This was 3 yrs ago and we all have seen this already. And V said his fave hyung is jin. So dont you ever spread rumors about the relationship of jin and tae..  Soo stop spreading news like as if it was new. Screw you koreaboo https://t.co/2YL2mHsRdD
Original label