In [1]:
## Based on NLP series by WomenWhoCode: 
#  - https://github.com/WomenWhoCode/WWCodeDataScience/blob/master/Intro_to_NLP/5_NLP_SupervisedTextClassification.ipynb
## Uses data on kaggale at:
#  - https://www.kaggle.com/c/fake-news/data
## Based on Biplav's Spring 2021 couse's sample code
#  - https://github.com/biplav-s/course-nl/blob/8f0bb9e50db6706595e6d5ca38c39d31e9bfc77b/l9-ml-review/Classification%20-%20Fake%20news.ipynb

# Data Retrieval and Preparation

In [2]:
# import libs
import pandas as pd

In [3]:
# Load data
train = pd.read_csv('https://raw.githubusercontent.com/biplav-s/course-nl/8f0bb9e50db6706595e6d5ca38c39d31e9bfc77b/common-data/Kaggle-fake-news-train.csv', header=0, lineterminator='\n')

nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 20800 rows and 5 columns in the training set.


In [4]:
# Seeing a data sample
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# Clean of white spaces
train = train.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [6]:
# Print statistics
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [7]:
# Distribution of positive and negative labels
train["label"].value_counts()

# Notice that it is quite balanced

1    10413
0    10387
Name: label, dtype: int64

In [8]:
# Removing empty rows from csv 
train.dropna(axis=0,inplace=True)
nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 18285 rows and 5 columns in the training set.


In [9]:
# Distribution of positive and negative labels after removing rows with empty data
train["label"].value_counts()

0    10361
1     7924
Name: label, dtype: int64

# Building a classification model

In [10]:
# Add a column with content from title, author and text. 
# We will check if the classification on cotent that incluses just 'title' is better than with 'all content', or not.

In [11]:
train['all_content'] = train['title'] + ' '+ train['author'] + train['text']

In [12]:
# Check new column
train.head()

Unnamed: 0,id,title,author,text,label,all_content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


## Text processing

In [13]:
# Import for tokenization. We can use any library - nltk or spacy - for example.
from nltk.tokenize import word_tokenize

In [14]:
# Tokenize
train['title_tokenize'] = train['title'].apply(word_tokenize)
train['all_content_tokenize'] = train['all_content'].apply(word_tokenize)

train.head()

Unnamed: 0,id,title,author,text,label,all_content,title_tokenize,all_content_tokenize
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,"[House, Dem, Aide, :, We, Didn, ’, t, Even, Se...","[House, Dem, Aide, :, We, Didn, ’, t, Even, Se..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...","[FLYNN, :, Hillary, Clinton, ,, Big, Woman, on...","[FLYNN, :, Hillary, Clinton, ,, Big, Woman, on..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...,"[Why, the, Truth, Might, Get, You, Fired]","[Why, the, Truth, Might, Get, You, Fired, Cons..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...,"[15, Civilians, Killed, In, Single, US, Airstr...","[15, Civilians, Killed, In, Single, US, Airstr..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...,"[Iranian, woman, jailed, for, fictional, unpub...","[Iranian, woman, jailed, for, fictional, unpub..."


In [15]:
# We can extract the label column from train dataframe to be the target 'y' variable
targets = train['label'].values

In [16]:
# Import for representation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [17]:
# Initialize the model
vectorizer = TfidfVectorizer()

In [18]:
# Get the tf idf representation for title fields
X = vectorizer.fit_transform(train['title'].values)

In [19]:
#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(X, targets, random_state=0)

## Now learning classifier

In [20]:
# Import for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [21]:
# Try titles on random forest
RandomFC= RandomForestClassifier(n_estimators=5)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(n_estimators=5)

In [22]:
# Print accuracy
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)

Accuracy of randomforest classifier on training set: 0.99
Accuracy of randomforest classifier on test set: 0.91
[[2303  280]
 [ 121 1868]]


In [23]:
# Also try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=100000.0)

In [24]:
# Print accuracy
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.93
[[2376  207]
 [  98 1891]]


## Now train using all content

In [25]:
# Get the tf idf representation for title fields
X = vectorizer.fit_transform(train['all_content'].values)

In [26]:
from sklearn.model_selection import train_test_split

#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(X, targets, random_state=0)

In [27]:
# Do random forest
RandomFC= RandomForestClassifier(n_estimators=5)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(n_estimators=5)

In [28]:
# Print accuracy
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)

Accuracy of randomforest classifier on training set: 0.99
Accuracy of randomforest classifier on test set: 0.83
[[2265  318]
 [ 480 1509]]


In [29]:
# Also try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=100000.0)

In [30]:

# Print stats
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.98
[[2528   55]
 [  50 1939]]


## With all_content, Random forest lost performance while logistic regression improved