In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
#import spacy
import re

ModuleNotFoundError: No module named 'spacy'

In [177]:
### Data cleaning

# Load only the article body from each source file and attach the class label
# (0 = fake, 1 = true).
fake = pd.read_csv("data/fake.csv", usecols=["text"])
true = pd.read_csv("data/true.csv", usecols=["text"])
fake['target'] = 0
true['target'] = 1

# Remove the newspaper source (Reuters) from the "true" articles before merging.
# The pattern is anchored at the start of the text (\A) and greedily removes
# everything through "(Reuters)"; `.` does not match newlines, so only the
# first line of an article can be affected.
true['text'] = true['text'].replace(r'\A.*\(Reuters\)', '', regex=True)

# Merge fake and true articles into one dataset and shuffle the rows.
# - pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0.
# - random_state pins the shuffle so a fresh "Restart & Run All" reproduces it.
# - reset_index(drop=True) discards the pre-shuffle index in a single step.
data = (
    pd.concat([true, fake], ignore_index=True)
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)

Example of the dataset `data`, which contains both labels (1 is true, 0 is fake):

In [178]:
# Preview the merged, shuffled dataset (columns: text, target).
data

Unnamed: 0,text,target
0,Robert Parry Consortium NewsIf there were any...,0
1,While Bernie Sanders is nowhere near ready to ...,0
2,- Life has stopped in its tracks in Myanmar s...,1
3,"On Tuesday night, Ted Cruz dropped out of the ...",0
4,- Britain is very close to reaching an agreem...,1
...,...,...
44893,"If you ve spent much time on social media, you...",0
44894,- The number of asylum seekers walking across...,1
44895,- Turkey s talks with the United States over ...,1
44896,Tune in to the Alternate Current Radio Network...,0


In [179]:
# Split the data into two parts: training data (70%) and held-out data (30%),
# stratified on the label so both parts keep the same fake/true ratio.
train_text, val_test_text = train_test_split(
    data,
    test_size=0.3,
    stratify=data['target'],
    random_state=1234,
)

# Split the held-out data into validation and testing data. test_size=0.6 sends
# 60% of the held-out rows to testing, so relative to the full dataset:
#   validation = 0.4 * 0.3 = 12%
#   testing    = 0.6 * 0.3 = 18%
val_text, test_text = train_test_split(
    val_test_text,
    test_size=0.6,
    stratify=val_test_text['target'],
    random_state=1234,
)

Example of the `train_text` data frame:

In [180]:
# Preview the training split; rows keep the index labels they had in `data`.
train_text

Unnamed: 0,text,target
22256,- When more than 7.4 million homes and busine...,1
29536,"Yes, he did say that bad grammar and all! Can ...",0
15851,Rachel Maddow decided to scrap a segment and g...,0
21257,- U.S.-backed Syrian militias will not let go...,1
12532,- Support for the creation of an independent ...,1
...,...,...
10984,"It s just cleaner that way You know, keeping a...",0
2934,Why would the Vatican invite an aggressive lef...,0
40321,- Polish Prime Minister Beata Szydlo said on ...,1
33504,Washington was rocked by yet another staff sha...,0


For our baseline model, we will use the `TF-IDF` vectorizer to pre-process the articles and then apply a logistic regression classifier.

- **fit_transform()** method learns the vocabulary and `IDF` weights from the training data; these are then reused for the validation and test data. Returns the document-term matrix with calculated `TF-IDF` values.

- **transform()** method uses the vocabulary and document frequencies (df) learned by **fit_transform()**. Returns document-term matrix with calculated `TF-IDF` values.

In [181]:
# Unigram TF-IDF features: ngram_range is not set, so the TfidfVectorizer
# default of (1, 1) applies. English stop words are dropped and the vocabulary
# is capped at 1000 terms.
text_transformer = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training articles only and
# produce the training TF-IDF matrix in the same pass.
X_train_text = text_transformer.fit_transform(train_text['text'])

# Reuse the fitted vocabulary/IDF to vectorize the validation and test articles.
X_val_text, X_test_text = (
    text_transformer.transform(split['text'])
    for split in (val_text, test_text)
)

Below is an example of the stop words used in TfidfVectorizer that will be filtered out from our observations (i.e. articles), both 'training' and 'test':

In [182]:
# Show the stop-word list the vectorizer uses (configured with stop_words='english').
text_transformer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [183]:
# Terms in the fitted vocabulary, as reported by the vectorizer
# (the output below is truncated).
feature_names = text_transformer.get_feature_names_out()
feature_names

array(['000', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '20', '2008', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '21st', '21wire', '22', '24', '25', '28', '30', '50',
       'able', 'abortion', 'absolutely', 'access', 'according', 'account',
       'accused', 'act', 'action', 'actions', 'actually', 'added',
       'adding', 'address', 'administration', 'adviser', 'afghanistan',
       'african', 'agencies', 'agency', 'agenda', 'agents', 'ago',
       'agreed', 'agreement', 'ahead', 'aid', 'air', 'al', 'allegations',
       'alleged', 'allies', 'allow', 'allowed', 'ambassador', 'amendment',
       'america', 'american', 'americans', 'announced', 'answer', 'anti',
       'apparently', 'appeared', 'appears', 'april', 'arabia', 'area',
       'areas', 'armed', 'army', 'arrested', 'article', 'ask', 'asked',
       'asking', 'assault', 'attack', 'attacks', 'attempt', 'attention',
       'attorney', 'august', 'authorities', 'authority', 'awa

In [184]:
# Report the shape of the training TF-IDF matrix: rows are articles, columns
# are vocabulary terms. The messages name the training data because the
# numbers come from X_train_text.
print('The number of observations (articles) in  the training data: ', X_train_text.shape[0])
print('The number of features (tokens) in  the training data: ', X_train_text.shape[1])

The number of observations (articles) in  the test data:  31428
The number of features (tokens) in  the test data:  1000


Example of the `TF-IDF` matrix, **X_val_text**, for the validation dataset:

In [185]:
# Convert the validation TF-IDF matrix to a dense matrix purely for display.
X_val_text.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.02782728, 0.03085852, 0.        , ..., 0.        , 0.        ,
         0.08138153],
        [0.        , 0.        , 0.        , ..., 0.        , 0.07077367,
         0.        ]])

We will be using the `Logistic Classifier` as our baseline model for training:

In [186]:
# Baseline classifier: L2-regularised logistic regression fitted on the
# training TF-IDF matrix.
# 'sag' is a stochastic solver (it shuffles the data), so random_state is
# pinned to make the fitted coefficients reproducible across runs.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# it is kept so the model matches the original run -- revisit when upgrading.
logit = LogisticRegression(
    penalty='l2',
    C=1,
    solver='sag',
    multi_class='multinomial',
    random_state=1234,
)
logit.fit(X_train_text, train_text['target'])

LogisticRegression(C=1, multi_class='multinomial', solver='sag')

Now that we have trained our model, we will apply it to predict labels (true/fake) for articles in the training and validation data and calculate the accuracy scores:

In [187]:
# Accuracy on the data the model was fitted on (an optimistic reference point).
train_predicted_label = logit.predict(X_train_text)
train_accuracy_score = accuracy_score(train_text['target'], train_predicted_label)

# Accuracy on the held-out validation split.
# The result is stored as `val_accuracy_score`: assigning it to `accuracy_score`
# would overwrite the imported sklearn function and make this cell fail with
# "object is not callable" the next time it is run.
predicted_label = logit.predict(X_val_text)
val_accuracy_score = accuracy_score(val_text['target'], predicted_label)

print('the accuracy score on the training data is: ', train_accuracy_score)
print('the accuracy score on the validation data is: ', val_accuracy_score)

the accuracy score on the training data is:  0.9772814051164567
the accuracy score on the validation data is:  0.9699331848552338


**Future steps:**

- Continue cleaning data with the use of Regex and other packages (digits, punctuation, 'Reuters', '21st century')

- Further analysis of data

- Re-run model after data is cleaned

- Discover options to improve the model

Notes from meeting with Cole:

try different models with different numbers of features 

1. spacy for stemming/punctuation 
2. remove article sources 
3. try different # of features 

parsed list of words

analysis: size of trainings 
distribution of words 