# Import libraries

In [17]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# copy file from another directory and rename it

In [2]:
current_directory = !pwd
os.chdir('/home/amir/Desktop/learning/NLP/NLP')
!cp fake_or_real_news.csv /home/amir/Desktop/github/NLP/
os.chdir(current_directory[0])
os.rename('fake_or_real_news.csv', 'labeld_news.csv')

In [3]:
df = pd.read_csv('labeld_news.csv')

# shape of data
print('Entire data shape: ', df.shape)

# response variabel
y = df.label

# split data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y)

# train / test shape
print('train qty: ', len(y_train))
print('test qty: ', len(y_test))

Entire data shape:  (6335, 4)
train qty:  4751
test qty:  1584


## we use 2 ways to vectorize data
##### CountVectorizer
##### TfidfVectorizer

#### CountVectorizer

In [4]:
# CountVectorizer
# CountVectorizer = token accurence in this observation 
# see https://github.com/amirsaleem1990/NLP/blob/master/CountVectorizer%20(sklearn).png

# CountVectorizer object
count_vectorizer = CountVectorizer(stop_words="english")
# <stop_words= "english"> English language k stop words ko drop kar do

# transform the train data
count_train = count_vectorizer.fit_transform(X_train,y_train)

# transform the test data
count_test = count_vectorizer.transform(X_test)

# hwo many features in our model?
print('we have {} features'.format(len(count_vectorizer.vocabulary_)))

print('First 10 Features in our model: \n', count_vectorizer.get_feature_names()[:10])
# in the count_train, and count_test we have no column, you can 
# identify each feature by: 
# count_vectorizer.vocabulary_
# this command return a dictnory, where  each key is actual feature, and each value is a index of column.

we have 59816 features
First 10 Features in our model: 
 ['00', '000', '0000', '000000031', '00000031', '0001', '0001pt', '0002', '000billion', '000ft']


#### TfidfVectorizer

In [5]:
# TfidfVectorizer
# how its work: https://github.com/amirsaleem1990/NLP/blob/master/sklearn%20TFIDF.png
# how its work: https://github.com/amirsaleem1990/NLP/blob/master/TF-IDF.png

# tfidf_vectorizer object
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5)
# <max_df = 0.5> koi word agar 50% yar is sy zyada observations me repeat ho raha ho to us work ko drop kar do,
# ye bohot comon word h.
# <stop_words= "english"> English language k stop words ko drop kar do

# transform the train data
tfidf_train = tfidf_vectorizer.fit_transform(X_train,y_train)

# transform the test data
tfidf_test = tfidf_vectorizer.transform(X_test)

# hwo many features in our model?
print('we have {} features\n'.format(len(tfidf_vectorizer.vocabulary_)))

print('First 10 Features in our model: \n', tfidf_vectorizer.get_feature_names()[:10])

# in the tfidf_train, and tfidf_test we have no column, you can 
# identify each feature by: 
# count_vectorizer.vocabulary_
# this command return a dictnory, where  each key is actual feature, and each value is a index of column.

we have 59812 features

First 10 Features in our model: 
 ['00', '000', '0000', '000000031', '00000031', '0001', '0001pt', '0002', '000billion', '000ft']


In [16]:
# is Features of CountVectorizer and Features of TfidfVectorizer are the same, or there are deffrence?

count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
tfidf_df = pd.DataFrame(tfidf_train.A,columns=tfidf_vectorizer.get_feature_names())

print('These Features just in one dataframe:')
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))

These Features just in one dataframe:
{'new', 'said', 'people', 'time'}
False


## Applying Naive Bayes to Classify Fake News on Count Vectorizer

In [18]:
# Multinomial Naive Bayes classifier object
nb_classifier = MultinomialNB()

# training data
nb_classifier.fit(count_train, y_train)

# predect on test data
pred = nb_classifier.predict(count_test)

# Calculate the accuracy
accuracy = metrics.accuracy_score(y_test,pred)
print(accuracy)

# Calculate the confusion matrix
cm = metrics.confusion_matrix(y_test,pred,labels=['FAKE','REAL'])
print(cm)

0.8939393939393939
[[658 120]
 [ 48 758]]


# Applying Naive Bayes to Classify Fake News on Tf-Idf Vectorizer

In [19]:
# Multinomial Naive Bayes classifier object
nb_classifier = MultinomialNB()

# training data
nb_classifier.fit(tfidf_train,y_train)

# predect on test data
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy
accuracy = metrics.accuracy_score(y_test, pred)
print(accuracy)

# Calculate the confusion matrix
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE','REAL'])
print(cm)

0.8522727272727273
[[557 221]
 [ 13 793]]


# effect of alpha on accuracy

In [21]:
import numpy as np
# diffrent aplha's
alphas = np.arange(0,1,0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train,y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test,pred)
    return score

# train over diffrent alpha values, and print the scores
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print("\n")

Alpha:  0.0


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  0.8882575757575758


Alpha:  0.1
Score:  0.9027777777777778


Alpha:  0.2
Score:  0.898989898989899


Alpha:  0.30000000000000004
Score:  0.8907828282828283


Alpha:  0.4
Score:  0.8813131313131313


Alpha:  0.5
Score:  0.875


Alpha:  0.6000000000000001
Score:  0.8667929292929293


Alpha:  0.7000000000000001
Score:  0.8611111111111112


Alpha:  0.8
Score:  0.8592171717171717


Alpha:  0.9
Score:  0.8554292929292929




# Investigating Fake News Classifier Model

In [22]:
class_labels = nb_classifier.classes_

# Extract the features
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

FAKE [(-11.388621924420084, '0000'), (-11.388621924420084, '0001'), (-11.388621924420084, '0001pt'), (-11.388621924420084, '0002'), (-11.388621924420084, '000billion'), (-11.388621924420084, '000km'), (-11.388621924420084, '004s'), (-11.388621924420084, '006s'), (-11.388621924420084, '007'), (-11.388621924420084, '007s'), (-11.388621924420084, '008s'), (-11.388621924420084, '0099'), (-11.388621924420084, '00am'), (-11.388621924420084, '00p'), (-11.388621924420084, '00pm'), (-11.388621924420084, '013c2812c9'), (-11.388621924420084, '014'), (-11.388621924420084, '018'), (-11.388621924420084, '01am'), (-11.388621924420084, '020')]
REAL [(-7.730884958182809, 'presidential'), (-7.70946186677282, 'rubio'), (-7.700150938819911, 'democratic'), (-7.6945129012222635, 'states'), (-7.688808900222656, 'gop'), (-7.679324740060702, 'bush'), (-7.61355756825691, 'voters'), (-7.596668564101243, 'republicans'), (-7.584789476246648, 'house'), (-7.511875829951274, 'percent'), (-7.396256259160873, 'party'),