## Supervised Classification Illustration

In [1]:
## Based on NLP series by WomenWhoCode: 
#  - https://github.com/WomenWhoCode/WWCodeDataScience/blob/master/Intro_to_NLP/5_NLP_SupervisedTextClassification.ipynb
## Uses data on kaggale at:
#  - https://www.kaggle.com/c/fake-news/data

In [2]:
# imports for file loading
import pandas as pd

# For Training Data

In [3]:
# Load data
train = pd.read_csv('../common-data/Kaggle-fake-news-train.csv', header=0, lineterminator='\n')

nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 20800 rows and 5 columns in the training set.


In [4]:
# Clean of white spaces
train = train.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
# Distribution of positive and negative labels
train["label"].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [6]:
# Removing empty rows from csv 
train.dropna(axis=0,inplace=True)
nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 18285 rows and 5 columns in the training set.


In [7]:
# Distribution of positive and negative labels
train["label"].value_counts()

0    10361
1     7924
Name: label, dtype: int64

In [8]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# Build a supervised model

In [9]:
# Add a column with content from title, authot and text. 
# We will see classification with title is better that with total content

In [10]:
train['total'] = train['title'] + ' '+ train['author'] + train['text']

In [11]:
# Check
train.head()

Unnamed: 0,id,title,author,text,label,total
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [12]:
# Import for tokenization 
from nltk.tokenize import word_tokenize

In [13]:
# Tokenize
train['title_tokenize'] = train['title'].apply(word_tokenize)
train['total_tokenize'] = train['total'].apply(word_tokenize)

train.head()

Unnamed: 0,id,title,author,text,label,total,title_tokenize,total_tokenize
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,"[House, Dem, Aide, :, We, Didn, ’, t, Even, Se...","[House, Dem, Aide, :, We, Didn, ’, t, Even, Se..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...","[FLYNN, :, Hillary, Clinton, ,, Big, Woman, on...","[FLYNN, :, Hillary, Clinton, ,, Big, Woman, on..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...,"[Why, the, Truth, Might, Get, You, Fired]","[Why, the, Truth, Might, Get, You, Fired, Cons..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...,"[15, Civilians, Killed, In, Single, US, Airstr...","[15, Civilians, Killed, In, Single, US, Airstr..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...,"[Iranian, woman, jailed, for, fictional, unpub...","[Iranian, woman, jailed, for, fictional, unpub..."


In [14]:
# We can extract the label column from train dataframe to be the target 'y' variable
targets = train['label'].values

In [15]:
# We will have vector representation before we can do classification
# Do imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [16]:
# We will consider 1- and 2- gram
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))

### Now train using the title text only 

In [17]:
# fit training data to the count vectorizer
train_counts = count_vectorizer.fit_transform(train['title'].values)

#fit the ngrams count to the tfidf transformers
train_tfidf = transformer.fit_transform(train_counts)


In [18]:
from sklearn.model_selection import train_test_split

#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, targets, random_state=0)

In [19]:
# Import for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [20]:
# Try titles on random forest
RandomFC= RandomForestClassifier(n_estimators=5)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
# Print accuracy
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)

Accuracy of randomforest classifier on training set: 0.99
Accuracy of randomforest classifier on test set: 0.90
[[2290  293]
 [ 142 1847]]


In [22]:
# Also try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Print accuracy
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.93
[[2328  255]
 [  46 1943]]


### Now train using the total content

In [24]:
# fit training data to the count vectorizer
train_counts = count_vectorizer.fit_transform(train['total'].values)

#fit the ngrams count to the tfidf transformers
train_tfidf = transformer.fit_transform(train_counts)

In [26]:
from sklearn.model_selection import train_test_split

#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, targets, random_state=0)

In [27]:
# Do random forest
RandomFC= RandomForestClassifier(n_estimators=5)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)

Accuracy of randomforest classifier on training set: 0.98
Accuracy of randomforest classifier on test set: 0.82
[[2295  288]
 [ 534 1455]]


In [29]:
# We will try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)


LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Print stats
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.98
[[2546   37]
 [  58 1931]]


### Learning on title lead to better result than full content 