In [None]:
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
from collections import defaultdict
import re
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from bs4 import BeautifulSoup

import sys
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import metrics 

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, GlobalMaxPooling1D, LSTM
from keras.models import Model, Sequential

# Text-related hyperparameters shared by the preprocessing and model code below
MAX_SEQUENCE_LENGTH = 1000  # maxlen for pad_sequences and input_length of the Embedding layers
MAX_NB_WORDS = 20000  # num_words given to the Keras Tokenizer
EMBEDDING_DIM = 300  # number of columns of the embedding matrix / Embedding output_dim
VALIDATION_SPLIT = 0.3  # test_size given to train_test_split
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort; it is set
# after the keras/tensorflow imports above - confirm it still takes effect at this point.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
import nltk 
# Fetch the stop-word corpus that stopwords.words('english') reads below
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words as a set (the cleaning helpers in this notebook build their own list instead)
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
# NOTE(review): star import - it is not visible here which extra names this brings into scope
from nltk.stem.porter import *
# Shared stemmer and lemmatizer instances
word_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Helper functions for lexical clean-up of every text row in the data

# Typographic apostrophes (U+2019, U+02BC) that are treated like a plain "'".
_TYPOGRAPHIC_APOSTROPHES = re.compile("[\u2019\u02bc]")

# (pattern, replacement) pairs, applied top to bottom. The two irregular forms
# must run before the generic "n't" rule, otherwise "won't" would become "wo not".
_CONTRACTION_RULES = [
    # specific
    (re.compile(r"won't", re.IGNORECASE), "will not"),
    (re.compile(r"can't", re.IGNORECASE), "can not"),
    # general
    (re.compile(r"n't", re.IGNORECASE), " not"),
    (re.compile(r"'re", re.IGNORECASE), " are"),
    (re.compile(r"'s", re.IGNORECASE), " is"),
    (re.compile(r"'d", re.IGNORECASE), " would"),
    (re.compile(r"'ll", re.IGNORECASE), " will"),
    (re.compile(r"'t", re.IGNORECASE), " not"),
    (re.compile(r"'ve", re.IGNORECASE), " have"),
    (re.compile(r"'m", re.IGNORECASE), " am"),
]


def decontracted(string):
    """Expand English contractions in a piece of text.

    Args:
        string: the text to expand; any non-string value is converted with str().

    Returns:
        The text with contractions replaced by their long form, e.g.
        "won't" -> "will not", "they're" -> "they are". Every "'s" becomes
        " is", so possessives are rewritten as well.

    Matching is case-insensitive and typographic apostrophes are accepted, so
    "Won't" and "it\u2019s" are expanded just like their plain lower-case forms.
    """
    string = _TYPOGRAPHIC_APOSTROPHES.sub("'", str(string))
    for pattern, replacement in _CONTRACTION_RULES:
        string = pattern.sub(replacement, string)
    return string

def clean_str(string):
    """
    Clean every entry of a pandas Series of raw text.

    Steps, in order: lower-case, expand contractions with decontracted(),
    strip unwanted characters, strip digits, drop English stop words.

    Args:
        string: pandas Series of text. Missing values are treated as empty text.

    Returns:
        A Series of the same length holding the cleaned, space-joined tokens.
    """
    # A set gives constant-time membership tests in the stop-word filter below
    stop = set(stopwords.words('english'))

    # Without this, a missing cell survives .str.lower() as NaN and is then
    # stringified by decontracted(), injecting the bogus token "nan" into the text.
    string = string.fillna("").astype(str).str.lower()

    string = string.apply(decontracted)

    # remove @mentions, every character that is not an ASCII letter/digit/space/tab,
    # scheme://... URLs, a leading "rt", and "http" plus the one character after it
    string = string.apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))

    # remove numbers
    string = string.apply(lambda elem: re.sub(r"\d+", "", elem))

    # remove stopwords (split() already yields whitespace-free tokens)
    string = string.apply(lambda x: ' '.join(word for word in x.split() if word not in stop))

    return string


def clean_string(string):
    """Tokenize a single string after basic normalisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace (and every underscore) is removed, and the remaining
    whitespace-separated tokens that are not English stop words are returned
    as a list.
    """
    stop = stopwords.words('english')
    normalized = re.sub(r'([^\w\s]|_)', '', string.lower())

    text = []
    for token in normalized.split():
        if token in stop:
            continue
        text.append(token.strip())

    return text

In [None]:
# Turn the raw dataset into padded train/test samples that are ready for training
def textpreproc(data, columns):
    """Clean, split, tokenize and pad the text of a dataset.

    Args:
        data: DataFrame holding the text and label columns.
        columns: three column names - columns[0] and columns[1] are text
            columns that are cleaned and joined with a space, columns[2] is
            the label column.

    Returns:
        (X_train, X_test, y_train, y_test, tokenizer): padded integer sequences
        for both splits, the matching labels, and the fitted Tokenizer.
    """
    texts = clean_str(data[columns[0]]) + " " + clean_str(data[columns[1]])
    labels = data[columns[2]]

    # Split the cleaned strings first, so the tokenizer vocabulary is learnt
    # from the training rows only and nothing from the held-out rows leaks in.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, labels, random_state=42, test_size=VALIDATION_SPLIT, shuffle=True)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts_train)

    # Padding input sequences
    X_train = pad_sequences(tokenizer.texts_to_sequences(texts_train), maxlen=MAX_SEQUENCE_LENGTH)
    X_test = pad_sequences(tokenizer.texts_to_sequences(texts_test), maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of texts tensor:', X_train.shape, X_test.shape)
    print('Shape of label tensor:', y_train.shape, y_test.shape)

    return X_train, X_test, y_train, y_test, tokenizer

In [None]:
# Load pretrained word vectors and build the word -> vector lookup
def get_embeddings(path, embeddings_index, limit=500000):
    """Fill a dict with word vectors read from a binary word2vec file.

    Args:
        path: location of the word2vec file in binary format.
        embeddings_index: dict that is updated in place with word -> vector.
        limit: maximum number of vectors to read from the file (passed to
            KeyedVectors.load_word2vec_format); defaults to the 500000 that
            was previously hard-coded.

    Returns:
        The same embeddings_index object, now holding float32 vectors.
    """
    wv_from_bin = KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
    # copy every (word, vector) pair of the loaded model into the lookup
    for word, vector in zip(wv_from_bin.index_to_key, wv_from_bin.vectors):
        embeddings_index[word] = np.asarray(vector, dtype='float32')

    return embeddings_index

In [None]:
# Build the word -> vector lookup from the pretrained file, starting from an empty dict
embeddings_index = get_embeddings('GoogleNews-vectors-negative300.bin', {})
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding matrix that feeds the input layer of the networks
def embedd_matrix(embeddings_index, tokenizer):
    """Create the (vocabulary + 1) x EMBEDDING_DIM weight matrix.

    Args:
        embeddings_index: mapping word -> pretrained vector.
        tokenizer: fitted tokenizer whose word_index maps word -> row number.

    Returns:
        A numpy array in which the row of each known word holds its pretrained
        vector and the row of each unknown word holds a random normal vector
        (mean 0, standard deviation sqrt(0.25)). Rows that no word maps to
        stay zero.
    """
    word_to_row = tokenizer.word_index
    # one extra row - presumably because tokenizer indices start at 1; confirm
    matrix = np.zeros((len(word_to_row) + 1, EMBEDDING_DIM))

    for token, row in word_to_row.items():
        if token in embeddings_index:
            matrix[row] = embeddings_index[token]
        else:
            matrix[row] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

    return matrix


In [None]:
# First model: a convolutional neural network
def cnn_net(embedding_matrix):
    """Build and compile the CNN binary classifier.

    Args:
        embedding_matrix: pretrained weights for the frozen embedding layer;
            its first dimension is used as the vocabulary size.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    # Non-trainable embedding layer initialised with the pretrained vectors
    frozen_embedding = Embedding(
        embedding_matrix.shape[0],
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )

    model = Sequential([
        frozen_embedding,
        Dropout(0.2),
        Conv1D(filters=128, kernel_size=4, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.2),
        Dense(units=250, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Model based on a stacked LSTM network
def lstm_net(embedding_matrix):
    """Build and compile the two-layer LSTM binary classifier.

    Args:
        embedding_matrix: pretrained weights for the frozen embedding layer;
            its first dimension is used as the vocabulary size.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    stack = (
        # Non-trainable embedding layer initialised with the pretrained vectors
        Embedding(
            embedding_matrix.shape[0],
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False,
        ),
        # the first LSTM returns the full sequence so the second one can consume it
        LSTM(units=128, return_sequences=True),
        Dropout(0.2),
        LSTM(units=64),
        Dropout(0.1),
        Dense(units=32, activation='relu'),
        Dense(1, activation='sigmoid'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def cnn_lstm(embedding_matrix):
    """Build and compile the hybrid Conv1D + LSTM binary classifier.

    Args:
        embedding_matrix: pretrained weights for the frozen embedding layer;
            its first dimension is used as the vocabulary size.

    Returns:
        A compiled Sequential model (adam, binary cross-entropy, accuracy).
    """
    model = Sequential()

    # Non-trainable embedding layer initialised with the pretrained vectors
    model.add(Embedding(embedding_matrix.shape[0], output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Conv1D(32, 4, activation='relu', padding='same'))
    model.add(LSTM(32, return_sequences=True))
    model.add(Dropout(0.1))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(16, 8, activation="relu", padding='same'))
    model.add(LSTM(64, return_sequences=True))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.1))
    model.add(Conv1D(16, 8, activation="relu", padding='same'))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))

    # A loss is required for training, and the accuracy metric is what the
    # notebook's checkpoints and plots read; same settings as cnn_net / lstm_net.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model
    

In [None]:
# Evaluate the models on the ISOT Fake News dataset
news_true = pd.read_csv("True_ISOT.csv")
news_fake = pd.read_csv("Fake_ISOT.csv")

# Label column: 1 for rows of the real-news file, 0 for rows of the fake-news file
news_true.insert(4, 'is_true', 1)
news_fake.insert(4, 'is_true', 0)

# ignore_index gives the combined frame one unique index; without it both halves
# keep their own 0..n labels and label-based lookups hit two rows at once.
news1 = pd.concat([news_true, news_fake], ignore_index=True)

X_train1, X_test1, y_train1, y_test1, tokenizer1 = textpreproc(news1, ['title', 'text', 'is_true'])
embed_matrix1 = embedd_matrix(embeddings_index, tokenizer1)

In [None]:
 # Load the COVID news spreadsheet into a DataFrame.
 # NOTE(review): the code in this cell starts with one leading space; run as a plain
 # Python script that is an "unexpected indent" - confirm the notebook front end
 # tolerates it, or drop the space.
 covid = pd.read_excel('COVID_news.xlsx')

In [None]:
# Average number of whitespace-separated words per article in the COVID data
covid["text"].map(lambda article: len(article.split())).mean()

In [None]:

# Dimensions of the COVID news frame
covid.shape

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich table
# display instead of the truncated plain-text dump that print() produces.
news1.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of articles per subject in the combined ISOT frame.
# The frame built when the dataset was loaded is reused: re-reading the CSVs here
# would replace news1 with a copy that no longer has the is_true label column.
fig, ax = plt.subplots(figsize=[11,6])  # plt.subplots returns (figure, axes), in that order
sns.countplot(x="subject", data=news1, ax=ax);

In [None]:
# Class balance of the ISOT data: bar chart of counts next to a pie chart of shares.
# The labelled frame is rebuilt from the two source frames so that this cell does not
# depend on whether news1 currently carries the is_true column.
labelled_news = pd.concat(
    [news_true.assign(is_true=1), news_fake.assign(is_true=0)],
    ignore_index=True,
)
class_counts = labelled_news["is_true"].value_counts()

fig , ax = plt.subplots(1,2,figsize=(19,5))
g1 = sns.countplot(x="is_true", data=labelled_news, ax=ax[0])
g1.set_title("Count of real and fake data")
g1.set_ylabel("Count")
g1.set_xlabel("Target")
# draw the pie on the second axes explicitly instead of relying on the "current" axes
g2 = ax[1].pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%', colors=['SkyBlue','PeachPuff'])
plt.show()

In [None]:
# Shape of the padded ISOT training tensor returned by textpreproc
print(X_train1.shape)

In [None]:
from keras.callbacks import ModelCheckpoint

# Training settings for the CNN on the ISOT data
batch_size, epochs = 256, 10
filepath = "weights_cnn1.best.hdf5"

model_cnn1 = cnn_net(embed_matrix1)

# Write the weights to disk whenever validation accuracy reaches a new maximum
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]

history_cnn1 = model_cnn1.fit(
    X_train1,
    y_train1,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test1, y_test1),
    callbacks=callbacks_list,
)

In [None]:
# Build a fresh, untrained model and print its layer summary.
# NOTE(review): the variable is called `lstm` but it is built by cnn_net - confirm
# whether lstm_net(embed_matrix1) or a CNN-specific name was intended.
lstm = cnn_net(embed_matrix1)

lstm.summary()

In [None]:
# Training curves of the CNN on the ISOT data: accuracy on top, loss below
fig, axs = plt.subplots(2, figsize=(8,10))

# (history key, panel title, legend position) for each of the two panels
panels = [
    ('accuracy', 'CNN-based model accuracy on 1st Dataset', 'lower right'),
    ('loss', 'CNN-based model loss', 'upper right'),
]

for axis, (metric, title, legend_loc) in zip(axs, panels):
    axis.plot(history_cnn1.history[metric])
    axis.plot(history_cnn1.history['val_' + metric])
    axis.set_title(title)
    axis.set_ylabel(metric)
    axis.set_xlabel('epoch')
    axis.legend(['train', 'test'], loc=legend_loc)

plt.show()

In [None]:
from keras.callbacks import ModelCheckpoint


# Train the LSTM on the ISOT data
model_lstm1 = lstm_net(embed_matrix1)

batch_size = 256
epochs = 10

# Write the weights to disk whenever validation accuracy reaches a new maximum
filepath="weights_lstm1.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# The callbacks must be handed to fit(); without them the checkpoint above is never written
history_lstm1 = model_lstm1.fit(X_train1, y_train1, batch_size = batch_size ,
                              validation_data = (X_test1,y_test1) , epochs = epochs, callbacks=callbacks_list)

In [None]:
import matplotlib.pyplot as plt

# Training curves of the LSTM on the ISOT data: accuracy on top, loss below
fig, axs = plt.subplots(2, figsize=(8,10))
acc_ax, loss_ax = axs
lstm_curves = history_lstm1.history

acc_ax.plot(lstm_curves['accuracy'])
acc_ax.plot(lstm_curves['val_accuracy'])
acc_ax.set(title='LSTM-based model accuracy on 1st Dataset', ylabel='accuracy', xlabel='epoch')
acc_ax.legend(['train', 'test'], loc='lower right')

# summarize history for loss
loss_ax.plot(lstm_curves['loss'])
loss_ax.plot(lstm_curves['val_loss'])
loss_ax.set(title='LSTM-based model loss', ylabel='loss', xlabel='epoch')
loss_ax.legend(['train', 'test'], loc='upper right')

plt.show()

# TODO: load the best checkpoint, then calculate accuracy

In [None]:
from keras.callbacks import ModelCheckpoint

# Train the CNN on the COVID news data
covid = pd.read_excel('COVID_news.xlsx')

# COVID-specific names keep the ISOT tokenizer, embedding matrix and training
# history (tokenizer1, embed_matrix1, history_cnn1) from being overwritten.
X_train,X_test,y_train,y_test, tokenizer_covid = textpreproc(covid, ['title', 'text', 'label'])

embed_matrix_covid = embedd_matrix(embeddings_index, tokenizer_covid)
cnn_net_covid = cnn_net(embed_matrix_covid)

batch_size = 10
epochs = 10

# Write the weights to disk whenever validation accuracy reaches a new maximum
filepath="weights_cnn2.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history_cnn_covid = cnn_net_covid.fit(X_train, y_train, batch_size = batch_size ,
                              validation_data = (X_test,y_test) , epochs = epochs, callbacks=callbacks_list)

In [None]:
from keras.callbacks import ModelCheckpoint

# Train the LSTM on the COVID news data
covid = pd.read_excel('COVID_news.xlsx')

# COVID-specific names keep the ISOT tokenizer and embedding matrix intact, and the
# LSTM history is no longer stored under a name that says "cnn".
X_train,X_test,y_train,y_test, tokenizer_covid = textpreproc(covid, ['title', 'text', 'label'])

embed_matrix_covid = embedd_matrix(embeddings_index, tokenizer_covid)
lstm_net_covid = lstm_net(embed_matrix_covid)

batch_size = 15
epochs = 10

# Write the weights to disk whenever validation accuracy reaches a new maximum
filepath="weights_lstm2.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history_lstm_covid = lstm_net_covid.fit(X_train, y_train, batch_size = batch_size ,
                              validation_data = (X_test,y_test) , epochs = epochs, callbacks=callbacks_list)

In [None]:
# Reload the best checkpoint of each COVID model and score it on the held-out split.
# (`model` was never defined, and evaluate() needs the data to score.)
for net_name, net, best_weights in [
    ('CNN', cnn_net_covid, "weights_cnn2.best.hdf5"),
    ('LSTM', lstm_net_covid, "weights_lstm2.best.hdf5"),
]:
    net.load_weights(best_weights)
    # both models are compiled with metrics=['accuracy'], so evaluate returns [loss, accuracy]
    test_loss, test_accuracy = net.evaluate(X_test, y_test, verbose=0)
    print('%s on COVID test split: loss=%.4f, accuracy=%.4f' % (net_name, test_loss, test_accuracy))