In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf


In [2]:
#Read the data
df=pd.read_csv('news.csv')
#Get shape and head
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
#DataFlair - Get the labels
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [4]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

In [5]:
tfidf_vectorizer = TfidfVectorizer(stop_words=['english', 'the'], max_df=0.7)

#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [6]:
#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.05%


In [7]:
#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[591,  47],
       [ 41, 588]], dtype=int64)

## Now lets use deep learning and see the differnece

In [8]:
## data preprocessing

In [31]:
df = pd.read_csv('news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [32]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [33]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [34]:
## title and text will be two inputs for our model and output will bw bibary 1 for fake and 0 for real

In [35]:
def label_to_binary(element):
    if element == 'FAKE':
        return 1
    else :
        return 0
    
df['label']  = df['label'].map(label_to_binary)

In [36]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [37]:
X_train, X_test, y_train, y_test = train_test_split(df['title'] + ' ' + df['text'], df['label'], test_size = 0.2, random_state = 7)

In [None]:
## now we need to use two tokenizer to convert title and text so that we can feed it to our model
df['title'] + ' ' + df

In [19]:
# using keras tokenizer

tokenizer = tf.keras.preprocessing.text.Tokenizer()

tokenizer.fit_on_texts(X_train.values)


tokenizer_config  = tokenizer.get_config()


In [20]:
tokenizer_config.keys()

dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])

In [21]:
import json
word_index = json.loads(tokenizer_config['word_index'])
index_word = json.loads(tokenizer_config['index_word'])


In [22]:
index_word

{'1': 'the',
 '2': 'to',
 '3': 'of',
 '4': 'and',
 '5': 'a',
 '6': 'in',
 '7': 'that',
 '8': 'is',
 '9': 'for',
 '10': 'on',
 '11': 'as',
 '12': 'it',
 '13': 'with',
 '14': 'he',
 '15': 'was',
 '16': 'are',
 '17': 'be',
 '18': 'by',
 '19': 'this',
 '20': 'have',
 '21': 'not',
 '22': '”',
 '23': 'his',
 '24': 'has',
 '25': 'at',
 '26': 'said',
 '27': 'but',
 '28': 'from',
 '29': 'trump',
 '30': 'they',
 '31': 'an',
 '32': 'who',
 '33': 'i',
 '34': 'will',
 '35': 'we',
 '36': 'clinton',
 '37': 'or',
 '38': 'about',
 '39': 'you',
 '40': 'their',
 '41': 'more',
 '42': 'would',
 '43': 'all',
 '44': 'one',
 '45': 'been',
 '46': 'her',
 '47': 'people',
 '48': 'if',
 '49': 'what',
 '50': 'were',
 '51': 'she',
 '52': 'had',
 '53': 'which',
 '54': 'out',
 '55': 'so',
 '56': 'new',
 '57': 'when',
 '58': 'up',
 '59': 'there',
 '60': 'state',
 '61': 'no',
 '62': 'than',
 '63': 'can',
 '64': 'president',
 '65': '—',
 '66': 'our',
 '67': 'also',
 '68': 'us',
 '69': 'other',
 '70': 'campaign',
 '71': 

In [23]:
## now lets convert texts into sequences

X_train_seq = tokenizer.texts_to_sequences(X_train.values)


X_test_seq = tokenizer.texts_to_sequences(X_test.values)



In [24]:
## now lets pad the sequences

In [25]:
X_train_seq_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen = 500)

X_test_seq_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen = 500)


In [26]:
input_dim_emb= len(word_index)+1
input_dim_emb

88927

In [27]:
## now we are ready to create our model

In [28]:
def get_model(input_dim_emb):
    
    
    inputs = tf.keras.layers.Input(shape = (None, ))

    x = tf.keras.layers.Embedding(input_dim_emb, output_dim=10, mask_zero=True)(inputs)
    x = tf.keras.layers.Masking()(x)
    x = tf.keras.layers.Dropout(0.4)(x)

    x = tf.keras.layers.LSTM(32, return_sequences=True)(x)
    x = tf.keras.layers.Dropout(0.4)(x)

    x = tf.keras.layers.LSTM(16)(x)
    x = tf.keras.layers.Dropout(0.4)(x)

    x = tf.keras.layers.Dense(16, activation = 'relu')(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(8, activation = 'relu')(x)
    x = tf.keras.layers.Dropout(0.4)(x)

    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    
    model = tf.keras.models.Model(inputs  =  inputs, outputs = outputs)
    model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [29]:
model = get_model(input_dim_emb)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 10)          889270    
_________________________________________________________________
masking (Masking)            (None, None, 10)          0         
_________________________________________________________________
dropout (Dropout)            (None, None, 10)          0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 32)          5504      
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                3136  

In [48]:
history = model.fit(X_train_seq_padded, y_train.values , epochs = 10, batch_size = 32, validation_data = (X_test_seq_padded, y_test.values))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
y_pred = model.predict(X_test_seq_padded)

In [50]:
y_pred = np.round(y_pred)

In [51]:
print(confusion_matrix(y_test.values, y_pred))

[[551  78]
 [ 73 565]]


In [82]:
x_val = tokenizer.texts_to_sequences(['''India was downgraded to "partly free" for the first time since 1997 in an annual ranking of democracies by the U.S.-government funded research group Freedom House, which cited worsening civil rights under Prime Minister Narendra Modi.

The world's largest democracy slipped in rankings this year because of continuing discrimination against its Muslim citizens and increased harassment of government critics and journalists, according to the "Freedom in the World' report released by the Washington-based organization.

The report cited "a multi-year pattern in which the Hindu nationalist government and its allies have presided over rising violence and discriminatory policies." It listed several events in 2020 like religious riots in Delhi, use of sedition laws against critics and hardships endured by migrant workers after PM Modi announced a sudden lockdown to control the coronavirus pandemic.

India was among 73 nations downgraded for declines in political rights and civil liberties, affecting three-fourths of the world's population. The report, which ranks 210 nations, found that states designated "Not Free" have reached the highest since 2006. Those affected included not just authoritarian states like China, Belarus, and Venezuela, but also troubled democracies like the U.S. and India.

India's status change means that less than 20% of the world's people now live in a "free" country -- the smallest proportion since 1995, the report said. The changes in India since PM Modi took charge in 2014 "form part of a broader shift in the international balance between democracy and authoritarianism, with authoritarians generally enjoying impunity for their abuses and seizing new opportunities to consolidate power or crush dissent," the report said.'''])

In [83]:
x_val

[[1500,
  15,
  34344,
  2,
  4744,
  334,
  9,
  1,
  99,
  78,
  167,
  4471,
  6,
  31,
  2195,
  3454,
  3,
  8320,
  18,
  1,
  92,
  93,
  107,
  2519,
  649,
  249,
  653,
  111,
  53,
  2045,
  12543,
  659,
  311,
  191,
  1022,
  663,
  10732,
  7874,
  1,
  4366,
  1311,
  987,
  7187,
  6,
  12410,
  19,
  110,
  91,
  3,
  2582,
  2493,
  103,
  77,
  858,
  754,
  4,
  1306,
  5844,
  3,
  107,
  1472,
  4,
  1618,
  178,
  2,
  1,
  653,
  6,
  1,
  272,
  561,
  18,
  1,
  195,
  383,
  913,
  1,
  272,
  2045,
  5,
  4006,
  110,
  3168,
  6,
  53,
  1,
  5152,
  5238,
  107,
  4,
  77,
  885,
  20,
  9858,
  72,
  1668,
  551,
  4,
  13892,
  645,
  12,
  3861,
  353,
  862,
  6,
  4624,
  76,
  735,
  4308,
  6,
  9449,
  209,
  3,
  25127,
  797,
  103,
  1472,
  4,
  17833,
  7794,
  18,
  4810,
  759,
  71,
  2830,
  7874,
  656,
  5,
  4746,
  37240,
  2,
  320,
  1,
  10363,
  1500,
  15,
  221,
  5764,
  822,
  34344,
  9,
  9790,
  6,
  104,
  311,
  4,
  659,

In [84]:
x_val = tf.keras.preprocessing.sequence.pad_sequences(x_val, maxlen=500)

In [85]:
model.predict(x_val)

array([[3.964668e-05]], dtype=float32)