<a href="https://colab.research.google.com/github/amartyadey04/Jupyter-Notebook-prac/blob/main/ProjectWork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Fake News Detection Model Trial**

In [None]:
# ==========================================
# OPCNN Fake News Detection - ISOT Tiny Demo
# ==========================================

# --- 1. Setup ---
!pip install -q tensorflow scikit-learn pandas nltk tqdm

import os, re, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# Reproducibility: seed every RNG the pipeline touches (Python, NumPy, TensorFlow).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- 2. Load ISOT dataset (tiny subset for demo) ---
# Draw 100 rows per class with a fixed seed; label 0 = fake, 1 = real.
fake = pd.read_csv("Fake.csv").sample(100, random_state=SEED)  # take 100 fake
real = pd.read_csv("True.csv").sample(100, random_state=SEED)  # take 100 true
fake["label"] = 0
real["label"] = 1
df = pd.concat([fake, real]).reset_index(drop=True)

# The model input is the headline followed by the article body.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")
df = df[["content", "label"]].dropna()

print("Dataset shape:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and thrown away (only
# the last expression of a cell is echoed), so render the preview explicitly.
display(df.head())

# --- 3. Preprocessing (simple) ---
def clean_text(text):
    """Lower-case ``text`` and blank out URLs and punctuation.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The lower-cased string in which every ``http...`` token and every
        character that is not a letter, digit or whitespace has been replaced
        by a single space. Runs of spaces are not collapsed.
    """
    text = str(text).lower()
    # In a raw string a single backslash is already the regex escape. The
    # previous doubled form (`\\S`, `\\s`) matched a literal backslash instead,
    # so URLs were never removed and backslashes were kept.
    text = re.sub(r"http\S+", " ", text)        # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # remove punctuation
    return text

df["clean"] = df["content"].apply(clean_text)

# --- 4. Tokenize & Pad ---
VOCAB_SIZE = 20000
MAXLEN = 32
EMBEDDING_DIM = 50  # embedding width for the demo (200d in paper)

y = df["label"].values

# Choose the train/test rows *before* building the vocabulary. Splitting row
# positions with the same test_size / stratify / random_state is intended to
# select the same rows as splitting the padded matrix directly.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=y, random_state=SEED
)

# Fit the vocabulary on training text only, so words that occur solely in the
# held-out rows map to <OOV> instead of leaking into the model's vocabulary.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean"].iloc[train_idx])

# Encode every document, then pad/truncate at the end to exactly MAXLEN ids.
seqs = tokenizer.texts_to_sequences(df["clean"])
X = pad_sequences(seqs, maxlen=MAXLEN, padding="post", truncating="post")

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# --- 5. Build OPCNN model (simplified) ---
def build_opcnn(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, maxlen=MAXLEN,
                filters=64, kernel_size=3, pool_size=2, dropout_rate=0.5):
    """Build and compile the simplified OPCNN binary text classifier.

    Layer stack: Embedding -> Dropout -> Conv1D(relu) -> MaxPooling1D ->
    Flatten -> Dense(1, sigmoid). The embedding is randomly initialised and
    trained with the rest of the network.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of every (padded) input sequence.
        filters: Number of Conv1D filters.
        kernel_size: Conv1D window width.
        pool_size: MaxPooling1D window width.
        dropout_rate: Dropout rate applied to the embedded sequence.

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam with
        default settings, accuracy metric).
    """
    model = Sequential()
    # Declare the input shape with an explicit Input layer. Keras 3 deprecates
    # (and ignores) Embedding's `input_length`, which leaves the model unbuilt
    # until the first batch; with an Input layer `summary()` shows real shapes.
    model.add(tf.keras.Input(shape=(maxlen,)))
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(Dropout(dropout_rate))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation="relu"))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
    return model

# Instantiate the demo network with its default hyper-parameters and show it.
model = build_opcnn()
model.summary()

# --- 6. Train ---
# 20% of the training rows are held back by Keras for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=16,
    verbose=1,
)

# --- 7. Evaluate ---
# Sigmoid outputs are flattened to 1-D and thresholded at 0.5 to get class ids.
y_pred_prob = model.predict(X_test).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

for _label, _score_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{_label}:", _score_fn(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Dataset shape: (200, 2)




Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 53ms/step - accuracy: 0.5035 - loss: 0.6946 - val_accuracy: 0.4062 - val_loss: 0.6976
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.5896 - loss: 0.6743 - val_accuracy: 0.3750 - val_loss: 0.6984
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6577 - loss: 0.6526 - val_accuracy: 0.3750 - val_loss: 0.6969
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.7324 - loss: 0.6356 - val_accuracy: 0.4688 - val_loss: 0.6920
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8959 - loss: 0.6068 - val_accuracy: 0.4688 - val_loss: 0.6821
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Accuracy: 0.575
Precision: 1.0
Recall: 0.15
F1: 0.2608695652173913

Classification Report:
               precision    recall  f1-score   

# **Fake** **News** **Detection** **Model** **Creation**

In [None]:
!pip install -q tensorflow scikit-learn pandas nltk hyperopt tqdm
import os, re, json, random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# reproducibility (best effort): one shared seed for Python, NumPy and TensorFlow
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)



# **Data Loading**


In [None]:
# Put your dataset files in the working directory.
# Paper used ISOT, FakeNewsNet, Kaggle dataset, FA-KES5 — pick one or adapt paths.

def load_isot(fake_path='Fake.csv', real_path='True.csv'):
    """Load the two ISOT CSV files into one labelled DataFrame.

    Args:
        fake_path: CSV file with the fake articles (labelled 0).
        real_path: CSV file with the real articles (labelled 1).

    Returns:
        DataFrame with exactly two columns, ``content`` and ``label``, and a
        fresh 0..n-1 index. ``content`` is ``title + ' ' + text`` when both
        columns exist, otherwise the ``text`` column, otherwise an existing
        ``content`` column. Rows whose content is missing are dropped.

    Raises:
        ValueError: If the files provide none of the usable text columns.
    """
    fake = pd.read_csv(fake_path)   # expected columns: title, text, ...
    real = pd.read_csv(real_path)
    fake['label'] = 0
    real['label'] = 1
    df = pd.concat([fake, real], ignore_index=True)
    # prefer using 'title' + 'text' concatenated if present
    if 'title' in df.columns and 'text' in df.columns:
        df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
    elif 'text' in df.columns:
        df['content'] = df['text']
    elif 'content' not in df.columns:
        # Previously this case produced an all-null column that dropna() then
        # emptied, so the caller silently received a zero-row frame.
        raise ValueError(
            "expected 'title' and 'text', or 'text', or 'content' columns; "
            f"found {sorted(c for c in df.columns if c != 'label')}"
        )
    df = df[['content', 'label']].dropna().reset_index(drop=True)
    return df

# Example usage: read both ISOT files from the working directory.
df = load_isot(fake_path='Fake.csv', real_path='True.csv')
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (44898, 2)


Unnamed: 0,content,label
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0


# **Text Preprocessing**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources before anything below reads them.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# English stop-word set and the stemmer shared by clean_text().
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Pre-compiled patterns: URLs, HTML-style tags, and anything that is not an
# ASCII letter, digit or whitespace.
url_re = re.compile(r'https?://\S+|www\.\S+')
html_re = re.compile(r'<.*?>')
nonalpha_re = re.compile(r'[^a-zA-Z0-9\s]')

def clean_text(text, do_stemming=True):
    """Normalise one document into a space-joined string of tokens.

    The text is lower-cased; HTML-style tags, URLs and non-alphanumeric
    characters are replaced by spaces; the result is tokenised; stop words and
    single-character tokens are dropped; the survivors are optionally stemmed.

    Args:
        text: Document to clean. Anything that is not a ``str`` yields ``''``.
        do_stemming: When true, each kept token is passed through ``stemmer``.

    Returns:
        The kept tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    normalised = text.lower()
    # Order matters: tags and URLs must go before punctuation is stripped,
    # otherwise their delimiters would already be gone.
    for pattern in (html_re, url_re, nonalpha_re):
        normalised = pattern.sub(' ', normalised)

    kept = []
    for token in word_tokenize(normalised):
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(stemmer.stem(token) if do_stemming else token)
    return ' '.join(kept)

# Apply to dataframe: store the normalised text in a new `clean` column.
df['clean'] = df['content'].map(clean_text)
df['clean']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,clean
0,donald trump send embarrass new year eve messa...
1,drunk brag trump staffer start russian collus ...
2,sheriff david clark becom internet joke threat...
3,trump obsess even obama name code websit imag ...
4,pope franci call donald trump christma speech ...
...,...
44893,fulli commit nato back new approach afghanista...
44894,lexisnexi withdrew two product chines market l...
44895,minsk cultur hub becom author minsk reuter sha...
44896,vatican upbeat possibl pope franci visit russi...


# **Tokenization**

In [None]:
VOCAB_SIZE = 20000   # paper used 20000 input-dim for embedding.
MAXLEN = 32          # paper used input_length 32.
EMBEDDING_DIM = 200  # paper used glove.6B 200d.

y = df['label'].values

# stratified 80/20 split as used in paper.
# The rows are chosen *before* the vocabulary is built. Splitting row positions
# with the same test_size / stratify / random_state is intended to select the
# same rows as splitting the padded matrix directly.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=SEED, stratify=y
)

# Fit the vocabulary on training text only, so words that occur solely in the
# held-out rows map to <OOV> instead of leaking into the model's vocabulary.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean'].iloc[train_idx].tolist())

# Encode every document, then pad/truncate at the end to exactly MAXLEN ids.
sequences = tokenizer.texts_to_sequences(df['clean'].tolist())
X = pad_sequences(sequences, maxlen=MAXLEN, padding='post', truncating='post')

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


# **Build Embedding Matrix**

In [None]:
# download glove.6B.200d.txt manually and place path here, or use Colab to wget it.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
GLOVE_PATH = 'glove.6B.200d.txt'  # put the file in working dir

def load_glove(glove_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace; the first field is the word
    and the remaining fields are parsed as its float32 coefficients.

    Args:
        glove_path: Path to a whitespace-separated GloVe ``.txt`` file (UTF-8).

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient field is not a valid float.
    """
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line in tqdm(f, desc='loading glove'):
            parts = line.split()
            if len(parts) < 2:
                # Blank line (e.g. a trailing newline) or a word with no
                # coefficients: nothing to store, and parts[0] would raise
                # IndexError on the blank case.
                continue
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings_index

emb_index = load_glove(GLOVE_PATH)

# Row i of the matrix holds the pretrained vector of the word whose tokenizer
# id is i. Words without a pretrained vector, and every id at or beyond
# num_words, are skipped, so their rows stay all-zero.
word_index = tokenizer.word_index
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < num_words and word in emb_index:
        embedding_matrix[i] = emb_index[word]


--2025-08-26 17:26:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-08-26 17:26:29--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-08-26 17:26:30--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

loading glove: 400000it [00:26, 15248.88it/s]


# **Building OPCNN Model Factory**

In [None]:
def build_opcnn(vocab_size=num_words, embedding_dim=EMBEDDING_DIM, maxlen=MAXLEN,
                embedding_matrix=embedding_matrix, filters=128, kernel_size=3, pool_size=2, dropout_rate=0.5):
    """Build and compile the OPCNN binary classifier on frozen pretrained embeddings.

    Layer stack: Embedding (initialised from ``embedding_matrix``, not
    trainable) -> Dropout -> Conv1D(relu) -> MaxPooling1D -> Flatten ->
    Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of every (padded) input sequence.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            as the embedding weights.
        filters: Number of Conv1D filters.
        kernel_size: Conv1D window width.
        pool_size: MaxPooling1D window width.
        dropout_rate: Dropout rate applied to the embedded sequence.

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam with
        default settings, accuracy metric).

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a readable message instead of a weight-shape error from
    # inside Keras when the table and the declared sizes disagree.
    if tuple(np.shape(embedding_matrix)) != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {tuple(np.shape(embedding_matrix))}, "
            f"expected ({vocab_size}, {embedding_dim})"
        )

    model = Sequential()
    # Declare the input shape with an explicit Input layer. Keras 3 deprecates
    # (and ignores) Embedding's `input_length`, which leaves the model unbuilt
    # until the first batch; with an Input layer `summary()` shows real shapes.
    model.add(tf.keras.Input(shape=(maxlen,)))
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        trainable=False))  # paper used pretrained glove; you can choose trainable=True to fine-tune
    model.add(Dropout(dropout_rate))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # paper used sigmoid + Adam.
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

# quick test instantiate: build the network once with its default
# hyper-parameters and print the layer summary as a smoke check.
model = build_opcnn()
model.summary()




In [None]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# search space (example)
# Keys are the names objective() reads from `params`. The `hp.choice` entries
# come back from fmin() as indexes into their option lists, so they must be
# mapped back to the listed values before being used.
space = {
    'filters': hp.choice('filters', [64, 128, 256]),
    'kernel': hp.choice('kernel', [2,3,4,5]),
    'pool': hp.choice('pool', [2,3]),
    'dropout': hp.uniform('dropout', 0.1, 0.9),    # paper used dropout search range 0.1-0.9.
    'batch_size': hp.choice('batch_size', [32,64]),
    'epochs': hp.choice('epochs', [5,10,15])      # keep small for demo; increase for real runs
}

def objective(params):
    """Hyperopt objective: train one candidate model and report its best validation loss.

    Args:
        params: One sample from ``space``; the keys ``filters``, ``kernel``,
            ``pool``, ``dropout``, ``epochs`` and ``batch_size`` are read.

    Returns:
        ``{'loss': <lowest val_loss seen during the run>, 'status': STATUS_OK}``.
    """
    # Drop state left over from the previous trial before building a new model.
    tf.keras.backend.clear_session()

    candidate = build_opcnn(
        filters=params['filters'],
        kernel_size=params['kernel'],
        pool_size=params['pool'],
        dropout_rate=params['dropout'],
    )
    stopper = EarlyStopping(monitor='val_loss', patience=3,
                            restore_best_weights=True, verbose=0)
    run = candidate.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        callbacks=[stopper],
        verbose=0,
    )
    # we minimize validation loss
    return {'loss': min(run.history['val_loss']), 'status': STATUS_OK}

# Run 20 TPE-suggested evaluations of objective(); `trials` collects the
# results and `rstate` seeds the search from SEED.
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials, rstate=np.random.default_rng(SEED))
print('best:', best)

100%|██████████| 20/20 [39:38<00:00, 118.92s/trial, best loss: 0.0051206243224442005]
best: {'batch_size': np.int64(0), 'dropout': np.float64(0.31465980296843005), 'epochs': np.int64(0), 'filters': np.int64(1), 'kernel': np.int64(0), 'pool': np.int64(0)}


# **Train Final Model**

In [None]:
# fmin() reports each hp.choice parameter as an index into its option list.
# Resolve the indexes through the search space itself with space_eval(), rather
# than through hand-copied option lists that silently go stale whenever `space`
# is edited.
from hyperopt import space_eval

best_params = space_eval(space, best)

best_filters = best_params['filters']
best_kernel = best_params['kernel']
best_pool = best_params['pool']
best_dropout = float(best_params['dropout'])
best_batch = best_params['batch_size']
best_epochs = best_params['epochs']

# Rebuild the network with the selected hyper-parameters and train it, holding
# back 10% of the training rows for early stopping on validation loss.
final_model = build_opcnn(
    filters=best_filters,
    kernel_size=best_kernel,
    pool_size=best_pool,
    dropout_rate=best_dropout,
)

es = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
final_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=best_epochs,
    batch_size=best_batch,
    callbacks=[es],
    verbose=1,
)

# evaluate on test: flatten the sigmoid outputs and threshold them at 0.5
y_prob = final_model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

for _label, _score_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(_label, _score_fn(y_test, y_pred))
print(classification_report(y_test, y_pred))


Epoch 1/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.9636 - loss: 0.1034 - val_accuracy: 0.9986 - val_loss: 0.0072
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14ms/step - accuracy: 0.9979 - loss: 0.0084 - val_accuracy: 0.9989 - val_loss: 0.0055
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 16ms/step - accuracy: 0.9990 - loss: 0.0037 - val_accuracy: 0.9989 - val_loss: 0.0056
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17ms/step - accuracy: 0.9993 - loss: 0.0022 - val_accuracy: 0.9986 - val_loss: 0.0071
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 18ms/step - accuracy: 0.9997 - loss: 0.0011 - val_accuracy: 0.9986 - val_loss: 0.0068
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy 0.9991091314031181
Precision 0.9995327102803738
Recall 0.9985994397759104
F1 0.999065857076

In [None]:
# Persist the trained network and the fitted tokenizer (as JSON text) to the
# working directory.
final_model.save('opcnn_fake_model.h5')
with open('tokenizer.json', 'w', encoding='utf-8') as fh:
    serialized = tokenizer.to_json()
    fh.write(serialized)


