In [4]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
%matplotlib inline
import seaborn as sns

import numpy as np
import pandas as pd

import os
import re
import nltk


In [5]:
train = pd.read_csv('./fake-news/train.csv')
test = pd.read_csv('./fake-news/test.csv')

In [6]:
print(train.shape,test.shape)

(20800, 5) (5200, 4)


In [7]:
print(train.isnull().sum())
print()
print()
print(test.isnull().sum())

id           0
title      558
author    1957
text        39
label        0
dtype: int64


id          0
title     122
author    503
text        7
dtype: int64


In [8]:
test = test.fillna(' ') #replace the missing(NULL) values with a blank space
train = train.fillna(' ') 
test['total'] = test['title']+' '+test['author']+test['text']    #merge all the columns into a single column
train['total'] = train['title']+' '+train['author']+train['text']

## Creating WordCloud Visuals

In [36]:
real_words = ''
fake_words = ''
stopwords = set(STOPWORDS)

#iterate through the csv file
for val in train[train['label']==1].total:
    #split the value
    tokens=val.split()
    
    #convert each token into lowercase
    for i in range (len(tokens)):
        tokens[i] = tokens[i].lower()
        
    real_words += " ".join(tokens)+" "
    
for val in train[train['label']==0].total:
    
    #split value
    tokens = val.split()
    
    #convert each token into lowercase
    for i in range(len(tokens)):
        
        tokens[i] = tokens[i].lower()
        
    fake_words += " ".join(tokens)+" "
     

In [37]:
wordcloud = WordCloud(width = 800, height = 800,
                     background_color = 'white',
                     stopwords = stopwords,
                     min_font_size =10).generate(real_words)

#plot the wordcloud image
plt.figure(figsize=(8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

NameError: name 'B' is not defined

## Cleaning and Pre-Processing

### 1. Regex


In [9]:
#remove punctuations from the string
s = "!</> hello please$$ </>^s!!!u%%bs&&%$cri@@@be^^^&&!& </>*to@# the&&\ cha@@@n##%^^&nel!@# %%$"


In [10]:
#defining a regular expression to remove punctuation from the given string
s = re.sub(r'[^\w\s]', '',s) #if any substring does not contain word(w) or space(s) then replace it with empty string

In [11]:
#string with punctuation removed
print(s)

 hello please subscribe to the channel 


### 2. Tokenisation

In [12]:
#download nltk data(required for tokenisation)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aryaman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
nltk.word_tokenize("Hello how are you") #tokenise the given string

['Hello', 'how', 'are', 'you']

### 3. StopWords

In [14]:
#nltk library that contains stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
print(stop_words) #print list of stopwords 

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aryaman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
sentence = "Covid-19 pandemic has impacted many countries and what it did to the economy is very stressful"

In [16]:
words = nltk.word_tokenize(sentence)
words = [w for w in words if w not in stop_words]

In [17]:
print(words) #only those words which hold meaning and can be used to mine important information 

['Covid-19', 'pandemic', 'impacted', 'many', 'countries', 'economy', 'stressful']


### 4. Lemmatization

In [18]:
#reduces a given word to its dictionary form(E.g studying is converted to its dictionary form: study)
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
     
input_str = "been had done languages cities mice"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aryaman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Aryaman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
#tokenise the sentence
input_str=nltk.word_tokenize(input_str)

#lemmatize each word
for word in input_str:
    print(lemmatizer.lemmatize(word))

been
had
done
language
city
mouse


### Application 

In [20]:
lemmatizer = WordNetLemmatizer()
for index,row in train.iterrows():
    filter_sentence = ''
    
    sentence = row['total']
    sentence = re.sub(r'[^\w\s]', '',sentence) #cleaning the sentence
    
    words = nltk.word_tokenize(sentence) #tokenization
    words = [w for w in words if not w in stop_words] #stopwords removal
    
    for word in words:
        filter_sentence = filter_sentence+' '+str(lemmatizer.lemmatize(word)).lower()
        
    train.loc[index,'total'] = filter_sentence    

In [21]:
train.head()

Unnamed: 0,id,title,author,text,label,total
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide we didnt even see comeys lette...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,flynn hillary clinton big woman campus breitb...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,why truth might get you fired consortiumnewsc...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 civilians killed in single us airstrike ha...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed fictional unpublished st...


In [22]:
train = train[['total','label']]

## Applying NLP Techniques

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
X_train = train['total']
Y_train = train['label']

## Bag-of-words/CountVectorizer

In [25]:
corpus = {
    
    'This is a document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    
    
    
}

vectorizer = CountVectorizer() #create a vector object
X = vectorizer.fit_transform(corpus) #convert the features to vector form by analysing each word
print(vectorizer.get_feature_names()) #print the feature name

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [26]:
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 0 0 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 2 0 1 0 1 1 0 1]]


## TF-iDF Vectorizer

In [27]:
#TF-IDF is a measure of originality of a word by comparing the number of times a word appears in a doc with the number of docs the word appears in
#max_features sends maximum number of features to be sent as an argument to the function
def vectorize_text(features, max_features):
    
    vectorizer = TfidfVectorizer( stop_words='english',
                                 decode_error='strict' ,
                                 analyzer='word' ,
                                 ngram_range=(1, 2), #individual tokens and two words will be considered same
                                 max_features=max_features
    
                                )
    feature_vec=vectorizer.fit_transform(features)
    return feature_vec.toarray()


In [28]:
tfidf_features = vectorize_text(['hello how are you doing','hi i am doing fine'],30)

In [29]:
#shows the weight of different words in the text according to their importance
#normalises on the basis of frequency also
tfidf_features

array([[0.44943642, 0.        , 0.        , 0.6316672 , 0.6316672 ,
        0.        , 0.        ],
       [0.33517574, 0.47107781, 0.47107781, 0.        , 0.        ,
        0.47107781, 0.47107781]])

## Application

In [30]:
#Feature Extraction using count vectorization and tfidf
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)


In [31]:
#sparse matrix to be converterd into normal array
tf_idf_matrix.toarray().shape

(20800, 220387)

## Confusion Matrix

In [37]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion Matrix',
                          cmap=plt.cm.Blues):
    
    plt.imshow(cm,interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks=np.arange(len(classes))
    plt.xticks(tick_marks,classes,rotation=45)
    plt.yticks(tick_marks,classes)
    
    if normalize:
        
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized Confusion Matrix")
    else:
        print('Confusion Matrix, without normalization')
        
    thresh = cm.max() / 2.
    for i,j in iter.product(range(cm.shape[0]), range(cm.shape[1])):
        
        plt.text(j,i, cm[i,j],
                horizontalalignment="center",
                color = "White" if cm[i,j]>thresh else "black")
        
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

## Modelling

In [41]:
test_counts=count_vectorizer.transform(test['total'].values)
test_tfidf = tfidf.transform(test_counts)

#split in samples
from sklearn.model_selection import train_test_split
X_train,X_test,y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)

## Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5) # C is the regularization parameter
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print('Accuracy of Lasso classifier on training set: {:.2f}'.format(logreg.score(X_train,y_train)))
print('Accuracy of Lasso classifier on test data set: {:.2f}'.format(logreg.score(X_test,y_test)))
from sklearn.naive_bayes import MultinomialNB
cm = confusion_matrix(y_test, pred)
cm


Accuracy of Lasso classifier on training set: 1.00
Accuracy of Lasso classifier on test data set: 0.98


array([[2493,   71],
       [  44, 2592]], dtype=int64)

## MultinomialNB

In [48]:
from sklearn.naive_bayes import MultinomialNB

NB= MultinomialNB()
NB.fit(X_train,y_train)
pred = NB.predict(X_test)
print('Accuracy of NB classifier on training set: {:.2f}'.format(NB.score(X_train,y_train)))
print('Accuracy of NB classifier on test set: {:.2f}'.format(NB.score(X_test,y_test)))

cm = confusion_matrix(y_test,pred)
cm

Accuracy of NB classifier on training set: 0.88
Accuracy of NB classifier on test set: 0.83


array([[2558,    6],
       [ 853, 1783]], dtype=int64)