In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
vocab_size = 10000

2025-03-25 15:46:15.798511: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/bishwayansaha99/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bishwayansaha99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("/home/bishwayansaha99/langchain/docs/evaluation.csv", sep=";")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1
1,1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1
2,2,Oregon Cop Convicted Of Shattering Biker’s Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0


In [3]:
print(f"Dataset dimensions {df.shape}")

# to check if there is any null value present
print(f"\nNo. of null values in each column\n {df.isnull().sum()}")

# to check if the dataset is balanced
print(f"\nBalance of the dataset w.r.t no of fake or rela news \n{df['label'].value_counts()}")

Dataset dimensions (8117, 4)

No. of null values in each column
 Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

Balance of the dataset w.r.t no of fake or rela news 
label
1    4314
0    3803
Name: count, dtype: int64


In [4]:
# Choosing dependent and independent features
X = df.drop(labels=["Unnamed: 0", "label"], axis = 1)
y = df['label']

print(f"Dimension of independent and dependent features: {X.shape} & {y.shape}")

Dimension of independent and dependent features: (8117, 2) & (8117,)


In [5]:
corpus = []
lemmatizer = WordNetLemmatizer()
X_copy = X.copy()
for msg in X_copy['title']:
    review = re.sub('^[a-zA-Z]', ' ', msg)
    review = review.lower().strip()
    review = [lemmatizer.lemmatize(word) for word in review.split() if word not in set(stopwords.words("english"))]
    corpus.append(" ".join(review))


In [6]:
corpus_one_hot = [one_hot(words, vocab_size) for words in corpus]

print(f"<{corpus[2]}> --- and its one hot representation is: \n{corpus_one_hot[2]}")

<oregon cop convicted shattering biker’s collarbone kick forgot dashcam (video)> --- and its one hot representation is: 
[2310, 360, 8323, 8918, 4802, 6665, 8602, 6485, 7060, 8802]


In [7]:
max_sentence_len = 0
for words in corpus:
    max_sentence_len = max(max_sentence_len, len(words.split(" ")))

embedded_docs = pad_sequences(sequences=corpus_one_hot, padding="post", maxlen=max_sentence_len)
print(embedded_docs)
print(embedded_docs.shape)

[[5135 5086 1644 ...    0    0    0]
 [6482  811 5771 ...    0    0    0]
 [2310  360 8323 ...    0    0    0]
 ...
 [2122 3996 6228 ...    0    0    0]
 [4806 6758 7243 ...    0    0    0]
 [2350 9351 8022 ...    0    0    0]]
(8117, 28)


In [8]:
# Creating model
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_features ))
model.add(LSTM(100))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

In [9]:
#Converting dependent and independent variables into array to be fed in the model
print(embedded_docs.shape)
print(y.shape)

X = np.array(embedded_docs)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

(8117, 28)
(8117,)


In [10]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=54)

Epoch 1/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.6732 - loss: 0.5568 - val_accuracy: 0.9085 - val_loss: 0.2518
Epoch 2/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.9557 - loss: 0.1345 - val_accuracy: 0.9105 - val_loss: 0.2293
Epoch 3/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9868 - loss: 0.0517 - val_accuracy: 0.8875 - val_loss: 0.3147
Epoch 4/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.9927 - loss: 0.0288 - val_accuracy: 0.8879 - val_loss: 0.3208
Epoch 5/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9960 - loss: 0.0197 - val_accuracy: 0.8785 - val_loss: 0.5167
Epoch 6/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.9975 - loss: 0.0125 - val_accuracy: 0.8838 - val_loss: 0.4930
Epoch 7/10
[1m106/106

<keras.src.callbacks.history.History at 0x7f7e61f569f0>

In [12]:
print(tf.config.list_physical_devices('GPU'))
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

[]
Default GPU Device: 


In [13]:
y_pred = model.predict(X_test)
y_pred = np.where(y_pred < 0.5, 0, 1)
print(classification_report(y_pred, y_test))

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      1156
           1       0.90      0.90      0.90      1280

    accuracy                           0.89      2436
   macro avg       0.89      0.89      0.89      2436
weighted avg       0.89      0.89      0.89      2436



In [None]:
!cat /usr/local/cuda/version.txt

cat: /usr/local/cuda/version.txt: No such file or directory


In [None]:
tf.__version__

NameError: name 'tf' is not defined

In [14]:
!nvcc

/bin/bash: line 1: nvcc: command not found
