In [1]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/misinformation-fake-news-text-dataset-79k/EXTRA_RussianPropagandaSubset.csv
/kaggle/input/misinformation-fake-news-text-dataset-79k/DataSet_Misinfo_TRUE.csv
/kaggle/input/misinformation-fake-news-text-dataset-79k/DataSet_Misinfo_FAKE.csv


In [2]:
# All three CSVs live in the same dataset folder; keeping the folder in one
# constant means the three paths cannot drift apart.
DATA_DIR = '/kaggle/input/misinformation-fake-news-text-dataset-79k'

df_fake = pd.read_csv(os.path.join(DATA_DIR, 'DataSet_Misinfo_FAKE.csv'))
df_true = pd.read_csv(os.path.join(DATA_DIR, 'DataSet_Misinfo_TRUE.csv'))
# NOTE(review): dfr is loaded here but not referenced again in the visible cells.
dfr = pd.read_csv(os.path.join(DATA_DIR, 'EXTRA_RussianPropagandaSubset.csv'))

# Data Cleaning

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import spacy
from textblob import TextBlob
import seaborn as sns

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Drop repeated fake-news articles, comparing on the article text only.
# The frame still carries the 'Unnamed: 0' column at this point (it is dropped
# in the next cell); it looks like a saved row index - TODO confirm - and would
# make every row count as unique if it took part in the comparison.
df_fake = df_fake.drop_duplicates(subset=['text'])

In [5]:
# Tag each source with its class (1 = true news, 0 = fake news) and discard the
# 'Unnamed: 0' column read from the CSVs. errors='ignore' keeps this cell
# re-runnable once the column has already been removed.
df_true = df_true.drop(columns='Unnamed: 0', errors='ignore').assign(label=1)
df_fake = df_fake.drop(columns='Unnamed: 0', errors='ignore').assign(label=0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two inputs overlap, so a label-based lookup such as
# df.loc[0] would return more than one row.
df = pd.concat([df_true, df_fake], ignore_index=True)

In [6]:
# Preview the first rows of the combined, labelled frame.
df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,1
1,Transgender people will be allowed for the fir...,1
2,The special counsel investigation of links bet...,1
3,Trump campaign adviser George Papadopoulos tol...,1
4,President Donald Trump called on the U.S. Post...,1


In [7]:
# Text normalisation: strip punctuation, lowercase, remove English stopwords
# and tokenize. After this cell df['text'] holds a list of tokens per row.

# Raw-string pattern: '\s' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
stop_words = set(stopwords.words('english'))


def _clean_and_tokenize(text):
    """Return the list of tokens for one article string.

    Steps, in order: delete every character that is not a letter, digit or
    whitespace; lowercase; drop words found in ``stop_words``; tokenize the
    remaining text with ``nltk.word_tokenize``.
    """
    lowered = non_alnum.sub('', text).lower()
    kept = ' '.join(word for word in lowered.split() if word not in stop_words)
    # NOTE(review): only the 'stopwords' corpus is downloaded earlier in this
    # notebook; word_tokenize also needs NLTK's Punkt tokenizer data - confirm
    # it is available in the runtime environment.
    return nltk.word_tokenize(kept)


# Rows without text cannot be processed, so they are dropped first. dropna +
# assign build a new frame in a single pass instead of mutating df in place
# four times.
df = df.dropna(subset=['text']).assign(
    text=lambda frame: frame['text'].apply(_clean_and_tokenize)
)

In [None]:
# Stem every token, join the tokens back into one space-separated string and
# strip digits. After this cell df['text'] is a plain string per row again.
stemmer = PorterStemmer()


def _stem_and_join(tokens):
    """Return the Porter-stemmed tokens joined by spaces, with digit runs removed."""
    joined = ' '.join(stemmer.stem(token) for token in tokens)
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\d+', '', joined)


df['text'] = df['text'].apply(_stem_and_join)

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline once; info() calls it on each text to
# obtain the document's named entities.
extraction = spacy.load('en_core_web_sm')

def info(text):
    """Extract the first country, organization and person mentioned in ``text``.

    The text is run through the module-level spaCy pipeline ``extraction``.
    For each of the entity labels GPE, ORG and PERSON the first entity with a
    non-empty text is kept; later entities of the same label are ignored.

    Parameters
    ----------
    text : object
        The document to analyse. Anything that is not a ``str`` is not passed
        to the pipeline.

    Returns
    -------
    pandas.Series
        Index ``'Country'``, ``'Organization'``, ``'Person'``; each value is
        the entity text, or ``""`` when no such entity was found or ``text``
        is not a string.
    """
    column_for_label = {"GPE": 'Country', "ORG": 'Organization', "PERSON": 'Person'}
    first_seen = {'Country': "", 'Organization': "", 'Person': ""}

    if isinstance(text, str):
        for ent in extraction(text).ents:
            column = column_for_label.get(ent.label_)
            # Keep only the first non-empty hit per column.
            if column is not None and not first_seen[column]:
                first_seen[column] = ent.text

    return pd.Series(first_seen)

# Run info() on every article and store its three results as new columns.
# NOTE(review): by this point df['text'] has already been lowercased, stripped
# of punctuation and stemmed by the cleaning cells above, so the NER model does
# not see the original casing or wording - consider extracting entities from
# the raw text instead. This is also one pipeline call per row, which may be
# slow on the full dataset.
df[['Country', 'Organization', 'Person']] = df['text'].apply(info)

In [None]:
# Preview the frame again, now including the entity columns.
df.head()

In [None]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Parameters
    ----------
    text : object
        The document to measure.

    Returns
    -------
    int
        ``len(text.split())`` for strings, ``0`` for anything that is not a
        ``str`` (for example a missing value).

    Notes
    -----
    The count is not offset: an earlier version subtracted 2, which
    under-reported every document and produced negative counts for texts with
    fewer than two words.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())
# Store the result of count_words() for each article in a 'Count' column.
df['Count'] = df['text'].map(count_words)

In [None]:
# Count the rows for every (label, Person, Organization) combination and keep
# the 15 most frequent ones.
top_15_counts = (
    df.groupby(['label', 'Person', 'Organization'])
    .size()
    .sort_values(ascending=False)
    .head(15)
)
# One-column frame ('counts') that keeps the three grouping keys as its index.
top_15_df = top_15_counts.to_frame('counts')

In [None]:
import matplotlib.pyplot as plt
# Share of each of the 15 most frequent (label, Person, Organization) groups.
fig, ax = plt.subplots()
ax.pie(top_15_df['counts'], labels=top_15_df.index, autopct='%1.1f%%')
ax.set_title('Top 15 Occurrences of label, Person and Organization')
plt.show()

In [None]:
# Count the rows for every (Country, Organization, Person) combination.
# The variables are called top_10_*, but head(15) keeps 15 rows.
top_10_counts = (
    df.groupby(['Country', 'Organization', 'Person'])
    .size()
    .sort_values(ascending=False)
    .head(15)
)
# One-column frame ('counts') that keeps the three grouping keys as its index.
top_10_df = top_10_counts.to_frame('counts')


In [None]:
import matplotlib.pyplot as plt
# Share of each of the most frequent (Country, Organization, Person) groups.
fig, ax = plt.subplots()
ax.pie(top_10_df['counts'], labels=top_10_df.index, autopct='%1.1f%%')
ax.set_title('Top 15 Occurrences of Country, Organization and Person')
plt.show()

In [None]:
# Show the counts table itself (15 rows, despite the top_10 name).
top_10_df

In [None]:
# Separate the target from the rest: x keeps every column except 'label',
# y is the 'label' column.
x = df.drop(columns=['label'])
y = df['label']

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
# Vocabulary size: passed to one_hot() when the corpus is encoded and to the
# Embedding layer as its input dimension.
voc_size=5000

In [None]:
# Fresh frame with a 0..n-1 index; the previous index is kept as an 'index'
# column. reset_index() returns a new object, so x itself is left untouched.
messages = x.reset_index()

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Build the stopword set once. Calling stopwords.words('english') inside the
# per-word filter would re-read the list for every single word and scan it
# linearly each time.
english_stopwords = frozenset(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')


def _preprocess(text):
    """Return ``text`` reduced to lowercase, stemmed, non-stopword words.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and split on whitespace, stopwords are removed and the
    remaining words are Porter-stemmed and joined with single spaces.
    """
    words = non_letters.sub(' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in english_stopwords)


# One cleaned string per row of messages, in row order. No per-row progress
# print: it would emit one output line per article.
corpus = [_preprocess(text) for text in messages['text']]

In [None]:
# Map every word of every document to an integer index in [1, voc_size).
# hashing_trick(..., hash_function='md5') is the stable counterpart of
# one_hot(): one_hot() relies on Python's built-in hash(), which is salted per
# process for strings, so the same word would get a different index after a
# kernel restart and a saved model could not be reused.
from tensorflow.keras.preprocessing.text import hashing_trick

onehot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]
# Preview a few documents only; echoing the whole list would print every article.
onehot_repr[:3]

In [None]:
# Bring every encoded document to exactly sent_length indices: shorter ones are
# zero-padded at the front, longer ones are cut at the front (pad_sequences'
# default truncating mode, spelled out here).
# NOTE(review): with sent_length = 20 only 20 tokens of each article reach the
# model - confirm this is intended for full-length news texts.
sent_length = 20
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
    truncating='pre',
)
print(embedded_docs)

In [None]:
# Show the padded index sequence of the first document.
embedded_docs[0]

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
# These two must match the values used when the corpus was encoded (one_hot)
# and padded (pad_sequences).
voc_size = 5000
sent_length = 20
# Dimensionality of each word's embedding vector.
embedding_vector_features = 40

# Baseline classifier: embedding -> LSTM(100) -> single sigmoid unit for the
# binary label.
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

In [None]:
import numpy as np
# NumPy copies of the padded sequences (features) and of the labels (target).
X_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Train for 10 epochs in batches of 64.
# NOTE(review): the held-out test split doubles as validation data here, so the
# per-epoch validation metrics are not independent of the final evaluation.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

In [None]:
from tensorflow.keras.layers import Dropout
## Creating model
# Regularised variant: embedding -> Dropout(0.3) -> LSTM(100) -> Dropout(0.3)
# -> single sigmoid unit.
embedding_vector_features = 40
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Rebinding `model` discards the network trained earlier, so the new one has to
# be trained here; otherwise the prediction cell would score a randomly
# initialised model.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

In [None]:
# Sequential.predict_classes() was removed from Keras (TensorFlow 2.6).
# Threshold the sigmoid probabilities at 0.5 instead, which is what it did for
# a single-unit sigmoid output. ravel() flattens the (n, 1) result to (n,) so it
# lines up with y_test in the metric calls.
y_pred = (model.predict(X_test) > 0.5).astype('int32').ravel()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of true labels against predicted labels on the held-out split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)