In [52]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import re

In [56]:
### Data cleaning

# Import data, add target classification, merge dataset, add label
fake = pd.read_csv("data/fake.csv", usecols= ["text"])
true = pd.read_csv("data/true.csv", usecols= ["text"])
fake['target'] = 0
true['target'] = 1

# Merge fake and true articles into one dataset 
data = true.append(fake).sample(frac=1).reset_index().drop(columns=['index'])

Example of the dataset `data` consisting of both labels(1 is True, 0 is Fake):

In [57]:
data

Unnamed: 0,text,target
0,Will the new policy for face-to-face interview...,0
1,The Trump administration is working as hard as...,0
2,WASHINGTON (Reuters) - The chairman of the U.S...,1
3,How unfunny! Does this not tell you something ...,0
4,"Meanwhile, as President Trump continues to mee...",0
...,...,...
44893,"During an interview, Steve Bannon who is the ...",0
44894,PHNOM PENH (Reuters) - Cambodia has canceled t...,1
44895,SEATTLE (Reuters) - Boeing Co and about 90 oth...,1
44896,ROME (Reuters) - Sicilian police on Wednesday ...,1


In [41]:
# Split the data into two parts: training data (7/10) and other data (3/10)
train_text, val_test_text = train_test_split(data, random_state=1234, test_size=0.3, stratify=data['target'])

# Split other data into two parts: validation data (1/3 * 3/10 = 1/10) and testing data (2/3 * 3/10 = 2/10)
val_text, test_text = train_test_split(val_test_text, random_state=1234, test_size=0.6, stratify=val_test_text['target'])

Example of the `train_text` data frame:

In [42]:
train_text

Unnamed: 0,text,target
22062,WASHINGTON (Reuters) - U.S. Treasury Secretary...,1
29655,THIS IS EXPECTED FROM THIS LEFTY COLLEGE Will ...,0
15949,A laptop computer containing floor plans for T...,0
21126,WASHINGTON (Reuters) - U.S. Defense Secretary ...,1
12484,WASHINGTON (Reuters) - U.S.-backed militias fi...,1
...,...,...
11053,,0
2929,Women in Saudi Arabia were just given the righ...,0
40340,NEW YORK (Reuters) - The top contenders in bo...,1
33532,A very pregnant Kerry Washington appeared on R...,0


For our baseline model, we will be using the `TF-IDF` Vectorizer to pre-process articles and then apply Logistic Classifier.

- **fit_transform()** method learns vocabulary and `IDF` used for both training & test data. Returns document-term matrix with calculated `TF-IDF` values.

- **transform()** method uses the vocabulary and document frequencies (df) learned by **fit_transform()**. Returns document-term matrix with calculated `TF-IDF` values.

In [43]:
# Note, ngrams = 1, which is the default value if not specified in TfidfVectorizer. 
text_transformer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=150000)

# vectorize train and test data. Produce TF-IDF for train data
X_train_text = text_transformer.fit_transform(train_text['text'])
X_val_text = text_transformer.transform(val_text['text'])
X_test_text = text_transformer.transform(test_text['text'])

Below is the example of the stop words used in TfidfVectorizer that will be filtered out from our observations (i.e. articles), both 'training' and 'test':

In [44]:
text_transformer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [71]:
feature_names = text_transformer.get_feature_names_out()

In [45]:
print('The number of observations (articles) in  the test data: ', X_train_text.shape[0])
print('The number of features (tokens) in  the test data: ', X_train_text.shape[1])

The number of observations (articles) in  the test data:  31428
The number of features (tokens) in  the test data:  105287


Example of `TF-IDF` matrix, **val_text**, for the validation dataset:

In [46]:
X_val_text.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.02196155, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

We will be using the `Logistic Classifier` as our baseline model for training:

In [47]:
logit = LogisticRegression(penalty = 'l2', C = 1, solver= 'sag', multi_class = 'multinomial')
logit.fit(X_train_text, train_text['target'])

LogisticRegression(C=1, multi_class='multinomial', solver='sag')

Now that we have trained our model, we will apply it to predict labels (true/false) for articles in the test data and calculate the accuracy score:

In [48]:
test_predicted_label = logit.predict(X_test_text)
score = accuracy_score(test_text['target'], test_predicted_label)

print('the accuracy score on the testing data is: ', score)

the accuracy score on the testing data is:  0.987750556792873


**Future steps:**

- Continue cleaning data with the use of Regex and other packages (digits, punctation, 'Router', '21st century')

- Further analysis of data

- Re-run model after data is cleaned

- Discover options to improve the model