In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [4]:
### Data cleaning

# Load only the article body ("text" column) from each source file and
# attach the class label: 0 = fake article, 1 = true article.
fake = pd.read_csv("data/fake.csv", usecols=["text"])
true = pd.read_csv("data/true.csv", usecols=["text"])
fake['target'] = 0
true['target'] = 1

# Merge fake and true articles into one dataset and shuffle the rows.
# - pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
#   and removed in pandas 2.0.
# - random_state pins the shuffle (same seed as the train/test splits below);
#   without it every run produces a different row order and therefore
#   different splits, even though the splits themselves are seeded.
# - reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index.
data = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)

Example of the dataset `data`, which contains articles with both labels (1 is true, 0 is fake):

In [5]:
# Preview the merged, shuffled dataset (one row per article: text + target label).
data

Unnamed: 0,text,target
0,WASHINGTON (Reuters) - U.S. President Donald T...,1
1,WASHINGTON (Reuters) - House of Representative...,1
2,Donald Trump sent a racist elf to make the ann...,0
3,Russiagate. First it was Russian hacking our e...,0
4,Former NBC anchor Matt Lauer claims he s plann...,0
...,...,...
44893,It s hard to be famous for being a victim in...,0
44894,GENEVA (Reuters) - The United States wants to ...,1
44895,"Manufactured protests Hillary, George Soros, B...",0
44896,"NRA board member, draft-dodger, pedophile, and...",0


In [21]:
# Split the data into two parts: training data (7/10) and other data (3/10).
# Stratifying on the label keeps the fake/true ratio the same in both parts.
train_text, val_test_text = train_test_split(
    data, random_state=1234, test_size=0.3, stratify=data['target']
)

# Split other data into two parts: validation data (1/3 * 3/10 = 1/10) and testing data (2/3 * 3/10 = 2/10).
# test_size must be 2/3 for these proportions; the previous value of 0.6
# actually produced 12% validation / 18% testing.
val_text, test_text = train_test_split(
    val_test_text, random_state=1234, test_size=2/3, stratify=val_test_text['target']
)

Example of the `train_text` data frame:

In [18]:
# Preview the training split; the index values are the row positions from `data`.
train_text

Unnamed: 0,text,target
22197,"DENMARK, South Carolina (Reuters) - Betty Odom...",1
29505,Bee stings can have an impact ranging from mil...,0
15794,Wow! We REALLY do have a serious problem with ...,0
21271,(Reuters) - The Puerto Rico senate has approve...,1
12654,WASHINGTON (Reuters) - Less than 24 hours afte...,1
...,...,...
10909,Donald Trump is so funny! He was speaking at a...,0
2917,Democrat Senator and fake Indian Elizabeth War...,0
40404,SEOUL (Reuters) - South Korean President Moon ...,1
33477,Having lived through an assassination attempt ...,0


For our baseline model, we will use the `TF-IDF` vectorizer to pre-process the articles and then apply a logistic regression classifier.

- **fit_transform()** method learns the vocabulary and `IDF` weights from the training data (these are then reused for the validation and test data). Returns the document-term matrix with calculated `TF-IDF` values for the training data.

- **transform()** method uses the vocabulary and document frequencies (df) learned by **fit_transform()**. Returns document-term matrix with calculated `TF-IDF` values.

In [22]:
# Build the TF-IDF vectorizer. ngram_range is not specified, so the default
# (ngrams = 1, i.e. single tokens) applies. English stop words are removed,
# text is lower-cased, and the vocabulary is capped at 150000 tokens.
text_transformer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    max_features=150000,
)

# Learn the vocabulary and IDF weights from the training articles only, then
# reuse that fitted vectorizer to encode the validation and test articles.
X_train_text = text_transformer.fit_transform(train_text['text'])
X_val_text, X_test_text = (
    text_transformer.transform(split['text']) for split in (val_text, test_text)
)

Below is an example of the stop words used by TfidfVectorizer; they are filtered out of our observations (i.e. articles) in the training, validation, and test data alike:

In [19]:
# Show the stop-word set the vectorizer was configured with (stop_words='english').
text_transformer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [18]:
# Report the shape of the *training* TF-IDF matrix: one row per article,
# one column per vocabulary token. (The messages previously said "test data"
# although the values come from X_train_text.)
n_articles, n_tokens = X_train_text.shape
print('The number of observations (articles) in the training data: ', n_articles)
print('The number of features (tokens) in the training data: ', n_tokens)

The number of observations (articles) in  the test data:  31428
The number of features (tokens) in  the test data:  105216


Example of the `TF-IDF` matrix, **X_val_text**, for the validation dataset:

In [10]:
# Display the validation TF-IDF matrix in dense form.
# X_val_text is the sparse matrix produced by the vectorizer; val_text is the
# raw DataFrame split and has no .todense() method.
# Note: todense() materialises every cell, so it is memory-hungry on large matrices.
X_val_text.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.00347043, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.07081219, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

We will be using the `Logistic Classifier` as our baseline model for training:

In [11]:
# Baseline classifier: L2-penalised logistic regression (C = 1) trained with
# the SAG solver on the training TF-IDF features.
logit = LogisticRegression(
    penalty='l2',
    C=1,
    solver='sag',
    multi_class='multinomial',
)

# Fit on the training features and labels; the fitted estimator is displayed
# as the cell output.
logit.fit(X_train_text, train_text['target'])

LogisticRegression(C=1, multi_class='multinomial', solver='sag')

Now that we have trained our model, we will apply it to predict labels (true/fake) for the articles in the test data and calculate the accuracy score:

In [23]:
# Predict a label for every article in the test split, then measure the
# fraction of predictions that match the known labels.
test_predicted_label = logit.predict(X_test_text)
score = accuracy_score(
    y_true=test_text['target'],
    y_pred=test_predicted_label,
)

print('the accuracy score on the testing data is: ', score)