In [1]:
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

In [2]:
df = pd.read_csv('corona_fake.csv')
df.head()

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake


In [3]:
df.shape

(1164, 4)

* we have 1164 rows and 4 columns

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1164 entries, 0 to 1163
Data columns (total 4 columns):
title     1082 non-null object
text      1154 non-null object
source    1144 non-null object
label     1159 non-null object
dtypes: object(4)
memory usage: 36.5+ KB


In [5]:
df.isnull().sum()

title     82
text      10
source    20
label      5
dtype: int64

In [6]:
df.nunique()

title      973
text      1114
source     269
label        3
dtype: int64

In [7]:
df.label.unique()

array(['Fake', nan, 'TRUE', 'fake'], dtype=object)

* we see that FAKE is written in different forms.

In [8]:
# replace other forms of fake to one unique form 
df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'
df.loc[df['label'] == 'fake', ['label']] = 'FAKE'

In [9]:
df.text.fillna(df.title, inplace=True)

In [10]:
df = df.sample(frac=1).reset_index(drop=True)
df.title.fillna('missing', inplace=True)
df.source.fillna('missing', inplace=True)

In [11]:
df['title_text'] = df['title'] + ' ' + df['text']

In [12]:
df.dropna(inplace=True)

In [13]:
df.label.unique()

array(['TRUE', 'FAKE'], dtype=object)

In [14]:
df.head()

Unnamed: 0,title,text,source,label,title_text
0,What Is Coronavirus?,COVID-19 is the disease caused by the new coro...,https://www.hopkinsmedicine.org/,TRUE,What Is Coronavirus? COVID-19 is the disease c...
1,COVID-19 Coronavirus: A False Pandemic? Who is...,The hype and disinformation campaign about the...,https://www.mondialisation.ca/,FAKE,COVID-19 Coronavirus: A False Pandemic? Who is...
2,Fauci knew about HCQ in 2005 -- nobody needed ...,"Dr. Anthony Fauci, whose “expert” advice to Pr...",https://onenewsnow.com/,FAKE,Fauci knew about HCQ in 2005 -- nobody needed ...
3,"What is cryptic transmission, and what is its ...",Cryptic transmission is the term that is used ...,https://www.globalhealthnow.org/,TRUE,"What is cryptic transmission, and what is its ..."
4,This review outlines Traditional Chinese Medic...,Since the outbreak of 2019 novel coronavirus i...,https://web.archive.org/,FAKE,This review outlines Traditional Chinese Medic...


In [15]:
df['label'].value_counts()

TRUE    584
FAKE    575
Name: label, dtype: int64

In [16]:
df['title_text'][50]

'COVID-19 and the CIA’s Biological Warfare on Cuba Maybe it was a plan that went horribly wrong, something they could no longer control. Was the Corona virus or COVID-19 spread intentionally? What if this virus was used against China as a weapon of choice to destabilize China’s economy and push back against China’s growing influence? We don’t know for sure, but it is possible. Investigations are ongoing. Nothing has not been confirmed.But what has been confirmed is what history has taught us given the facts on how the use of biological warfare for various purposes, against many peoples and nations has been happening for some time. One of the most well-known incidents of biological warfare occurred in 1763, the British Empire had planned and successfully managed to spread smallpox virus to the Native Americans during the Pontiac Rebellion in Pennsylvania. Chief Pontiac of the Ottawa launched an attack on Fort Detroit, a British military base. Other nations joined the rebellion including

In [17]:
def preprocessor(text):
    
    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower()

    return text

In [18]:
df['title_text'] = df['title_text'].apply(preprocessor)

In [19]:
df['title_text'][50]

'covid19 and the cias biological warfare on cuba maybe it was a plan that went horribly wrong something they could no longer control was the corona virus or covid19 spread intentionally what if this virus was used against china as a weapon of choice to destabilize chinas economy and push back against chinas growing influence we dont know for sure but it is possible investigations are ongoing nothing has not been confirmedbut what has been confirmed is what history has taught us given the facts on how the use of biological warfare for various purposes against many peoples and nations has been happening for some time one of the most wellknown incidents of biological warfare occurred in 1763 the british empire had planned and successfully managed to spread smallpox virus to the native americans during the pontiac rebellion in pennsylvania chief pontiac of the ottawa launched an attack on fort detroit a british military base other nations joined the rebellion including the senecas the huro

In [20]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [21]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)
X = tfidf.fit_transform(df['title_text'])
y = df.label.values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)

lr = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.1s remaining:   43.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.1s finished


In [23]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = lr.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

Accuracy: 0.9172413793103448
              precision    recall  f1-score   support

        FAKE       0.92      0.92      0.92       286
        TRUE       0.92      0.92      0.92       294

    accuracy                           0.92       580
   macro avg       0.92      0.92      0.92       580
weighted avg       0.92      0.92      0.92       580

