In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
def generator_network(latent_dim):
    input_layer = Input(shape=(latent_dim,))
    hidden_layer = Dense(128)(input_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    hidden_layer = Dense(256)(hidden_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    hidden_layer = Dense(512)(hidden_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    output_layer = Dense(1000, activation='tanh')(hidden_layer)
    model = Model(inputs=[input_layer], outputs=[output_layer])
    return model

In [3]:
def discriminator_network(input_dim):
    input_layer = Input(shape=(input_dim,))
    hidden_layer = Dense(512)(input_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    hidden_layer = Dropout(0.3)(hidden_layer)
    hidden_layer = Dense(256)(hidden_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    hidden_layer = Dropout(0.3)(hidden_layer)
    hidden_layer = Dense(128)(hidden_layer)
    hidden_layer = LeakyReLU(alpha=0.2)(hidden_layer)
    output_layer = Dense(1, activation='sigmoid')(hidden_layer)
    model = Model(inputs=[input_layer], outputs=[output_layer])
    return model

In [4]:
def gan_network(generator, discriminator):
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam())
    discriminator.trainable = False
    input_layer = Input(shape=(latent_dim,))
    generated_data = generator(input_layer)
    output_layer = discriminator(generated_data)
    model = Model(inputs=[input_layer], outputs=[output_layer])
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0002, beta_1=0.5))
    return model

In [1]:
num_epochs = 500
batch_size = 32
latent_dim = 100
input_dim = 1000

# Build the GAN model
generator = generator_network(latent_dim)
discriminator = discriminator_network(input_dim)
gan = gan_network(generator, discriminator)

NameError: name 'generator_network' is not defined

In [6]:
import re
import string

def clean_text(text):
    if isinstance(text, str) or isinstance(text, bytes):
        # Remove any URLs
        text = re.sub(r'http\S+', '', text)
        
        # Remove any punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Convert the text to lowercase
        text = text.lower()
        return text
    
    return ""

In [7]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the CSV file
data = pd.read_csv('WELFake_Dataset.csv')

# Remove any unnecessary columns
data = data[['text', 'label']]

# Remove any rows with empty text
data.dropna(subset=['text'], inplace=True)

# Clean the text data
data['text'] = data['text'].apply(clean_text)

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data)
train_seq = tokenizer.texts_to_sequences(train_data)
test_seq = tokenizer.texts_to_sequences(test_data)

# Pad the sequences
max_len = 1000
train_seq_pad = pad_sequences(train_seq, maxlen=max_len)
test_seq_pad = pad_sequences(test_seq, maxlen=max_len)

# Flatten the training sequence array
real_news = train_seq_pad.flatten()

In [8]:
for epoch in range(num_epochs):
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    generated_data = generator.predict(noise)
    idx = np.random.randint(10, size=32)
    real_data = train_seq_pad[idx]
    x = np.concatenate((real_data, generated_data))
    y = np.zeros(2*batch_size)
    y[:batch_size] = 1
    discriminator.compile(loss='binary_crossentropy', optimizer=Adam())
    discriminator_loss = discriminator.train_on_batch(x, y)

    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    y = np.ones(batch_size)
    gan_loss = gan.train_on_batch(noise, y)

    if epoch % 100 == 0:
        print('Epoch:', epoch, 'Discriminator Loss:', discriminator_loss, 'GAN Loss:', gan_loss)


Epoch: 0 Discriminator Loss: 117.54248046875 GAN Loss: 0.699992299079895
Epoch: 100 Discriminator Loss: 109.30059051513672 GAN Loss: 0.0025857272557914257
Epoch: 200 Discriminator Loss: 122.4171142578125 GAN Loss: 0.00027920957654714584
Epoch: 300 Discriminator Loss: 141.5188751220703 GAN Loss: 0.00023650802904739976


: 

: 

In [3]:
import os.path
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem   import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

In [4]:
path = os.path.join('WELFake_Dataset.csv')
df = pd.read_csv(path, index_col=0)

In [5]:
def preprocess_text(text):
    text = ''.join([c for c in text if c not in string.punctuation and c not in string.digits])
    tokens = word_tokenize(text, 'english')
    lemmatiser = WordNetLemmatizer()
    lemmatized = [lemmatiser.lemmatize(word) for word in tokens]
    sw = stopwords.words('english')
    stopped = [word for word in lemmatized if word.lower() not in sw]
    return stopped
preprocess_text(df.loc[0, 'title'])

['LAW',
 'ENFORCEMENT',
 'HIGH',
 'ALERT',
 'Following',
 'Threats',
 'Cops',
 'Whites',
 'BlackLivesMatter',
 'FYF',
 'Terrorists',
 'VIDEO']

In [6]:
from sklearn.model_selection import train_test_split

X = df['title']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
X_train.shape[0], X_test.shape[0]

(57707, 14427)

In [7]:
y_train.value_counts(normalize=True)

1    0.513508
0    0.486492
Name: label, dtype: float64

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
X_train = X_train.fillna('')
bow_transformer = CountVectorizer(analyzer=preprocess_text).fit(X_train)
text_bow_train = bow_transformer.transform(X_train)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

model = RandomForestClassifier(n_estimators=300)

scores = cross_validate(model, text_bow_train, y_train, scoring=['f1', 'accuracy'], cv=5, n_jobs=-1)
pd.DataFrame(scores).describe()

In [None]:
model.fit(text_bow_train, y_train)

In [None]:
from sklearn.metrics import classification_report

X_test = X_test.fillna('')
text_bow_test = bow_transformer.transform(X_test)
print(text_bow_test)
y_pred = model.predict(text_bow_test)
print(classification_report(y_test, y_pred))