In [None]:
#import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import string
from nltk.tokenize import RegexpTokenizer # regexp tokenizers use to split words from text
from nltk.stem.snowball import SnowballStemmer # stemmes words

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=01f98e43caaa03eca2001ae9595577c140f96b307d0333f001899c2fda864f32
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import fasttext

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_fake = pd.read_csv('/content/drive/MyDrive/ISOT/Fake.csv')
data_true = pd.read_csv('/content/drive/MyDrive/ISOT/True.csv')

In [None]:
data_fake["class"] = 0
data_true["class"] = 1

In [None]:
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

In [None]:
data_merge = pd.concat([data_fake, data_true], axis = 0)
data_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [None]:
data = data_merge.drop(['title', 'subject', 'date'], axis = 1)

In [None]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [None]:
data = data.sample(frac = 1)

In [None]:
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

In [None]:
data.head()

Unnamed: 0,text,class
0,"Just a day before the Iowa caucuses, a former ...",0
1,Judicial Watch has been the one organization t...,0
2,WASHINGTON (Reuters) - If U.S. President Donal...,1
3,"If there s one takeaway from Thursday, it s th...",0
4,((This Sept. 12 story corrects name of sacked...,1


In [None]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]','', text) #remove text enclosed in square brackets, including the brackets themselves
    text = re.sub("\\W", " ", text) #replaces all non-word characters (e.g., punctuation, special characters, symbols) with spaces
    text = re.sub('https?://\S+|www\.\S+', '', text) # removes URLs from the text by matching and removing both HTTP/HTTPS URLs and "www" URLs
    text = re.sub('<.*?>+', '', text) #remove HTML tags and their contents
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #remove all punctuation characters
    text = re.sub('\n', '', text) #removes newline characters, which are typically used to represent line breaks or paragraphs in text
    text = re.sub('\w*\d\w', '', text) #removes words containing numbers or alphanumeric patterns
    return text

In [None]:
data['text'] = data['text'].apply(wordopt)

In [None]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')#to getting alpha only
stemmer = SnowballStemmer("english")
data['text'] = data.text.map(lambda t: tokenizer.tokenize(t))
data['text'] = data.text.map(lambda l: [stemmer.stem(word) for word in l])
data['text'] = data.text.map(lambda l: ' '.join(l))

In [None]:
data['class_label'] = "__label__" + data['class'].astype(str)
data.head()

Unnamed: 0,text,class,class_label
0,just a day befor the iowa caucus a former fiel...,0,__label__0
1,judici watch has been the one organ that s bee...,0,__label__0
2,washington reuter if u s presid donald trump t...,1,__label__1
3,if there s one takeaway from thursday it s tha...,0,__label__0
4,this sept stori correct name of sack minist in...,1,__label__1


In [None]:
data['class_text'] = data['class_label'] + " " + data['text']
data.head()

Unnamed: 0,text,class,class_label,class_text
0,just a day befor the iowa caucus a former fiel...,0,__label__0,__label__0 just a day befor the iowa caucus a ...
1,judici watch has been the one organ that s bee...,0,__label__0,__label__0 judici watch has been the one organ...
2,washington reuter if u s presid donald trump t...,1,__label__1,__label__1 washington reuter if u s presid don...
3,if there s one takeaway from thursday it s tha...,0,__label__0,__label__0 if there s one takeaway from thursd...
4,this sept stori correct name of sack minist in...,1,__label__1,__label__1 this sept stori correct name of sac...


In [None]:
random_seed = 42

In [None]:
train, test = train_test_split(data, test_size = 0.25, random_state=random_seed)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.25, random_state=random_seed)

In [None]:
train.to_csv("news.train", columns=["class_text"], index=False, header=False)
test.to_csv("news.test", columns=["class_text"], index=False, header=False)

In [None]:
# model = fasttext.train_supervised(input="news.train")
# model.test("news.test")
model = fasttext.train_supervised(input="news.train")
model.test("news.test")

(11225, 0.9899331848552338, 0.9899331848552338)

In [None]:
model.save_model("news_model.bin")

In [None]:
model.predict('Donald Trump spent a good portion of his day at his golf club, marking the 84th day he s done so since taking the oath of office. It must have been a bad game because just after that, Trump lashed out at FBI Deputy Director Andrew McCabe on Twitter following a report saying McCabe plans to retire in a few months. The report follows McCabe s testimony in front of congressional committees this week, as well as mounting criticism from Republicans regarding the Russia probe.So, naturally, Trump attacked McCabe with a lie. How can FBI Deputy Director Andrew McCabe, the man in charge, along with leakin  James Comey, of the Phony Hillary Clinton investigation (including her 33,000 illegally deleted emails) be given $700,000 for wife s campaign by Clinton Puppets during investigation?  Trump tweeted.How can FBI Deputy Director Andrew McCabe, the man in charge, along with leakin  James Comey, of the Phony Hillary Clinton investigation (including her 33,000 illegally deleted emails) be given $700,000 for wife s campaign by Clinton Puppets during investigation?  Donald J. Trump (@realDonaldTrump) December 23, 2017He didn t stop there.FBI Deputy Director Andrew McCabe is racing the clock to retire with full benefits. 90 days to go?!!!  Donald J. Trump (@realDonaldTrump) December 23, 2017Wow,  FBI lawyer James Baker reassigned,  according to @FoxNews.  Donald J. Trump (@realDonaldTrump) December 23, 2017With all of the Intel at Trump s disposal, he s getting his information from Fox News. McCabe spent most of his career in the fight against terrorism and now he s being attacked by the so-called president. Trump has been fact-checked before on his claim of his wife receiving $700,000 for her campaign.Politifact noted in late July that Trump s  tweet about Andrew McCabe is a significant distortion of the facts. And the implication that McCabe got Clinton off as a political favor doesn t make much sense when we look at the evidence. His July tweet was rated  mostly false.  But Trump repeats these lies because he knows his supporters will believe them without bothering to Google. It s still a lie, though.Photo by Zach Gibson   Pool/Getty Images.')

(('__label__0',), array([0.99989188]))

In [None]:
# Extract vocabulary and embeddings
vocabulary = model.get_words()
word_embeddings = np.array([model.get_word_vector(word) for word in vocabulary])

In [None]:
# Create the embedding matrix
embedding_dim = model.get_dimension()  # Dimension of FastText embeddings
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))
for i, word in enumerate(vocabulary):
    embedding_matrix[i] = model.get_word_vector(word)

In [None]:
# Save the embedding matrix to a file
np.save('embedding_matrix.npy', embedding_matrix)  #file is in the NumPy array format as .npy

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
from keras.layers import Dropout, Flatten


In [None]:
text_column = data['text']

# Concatenate all the text data into a single string
all_text = ' '.join(text_column)

# Split the text into words and count them
word_count = len(all_text.split())
print(word_count)

18406124


In [None]:
max_sequence_length = 0
for text in text_column:
    tokens = tokenizer.tokenize(text)
    sequence_length = len(tokens)
    max_sequence_length = max(max_sequence_length, sequence_length)

In [None]:
print(max_sequence_length)

0


In [None]:
vocab_size = len(vocabulary)
sequence_length = max_sequence_length

NameError: ignored

In [None]:
embedding_matrix.shape

(75152, 100)

In [None]:
print(type(x_train))

<class 'pandas.core.series.Series'>


In [None]:
# Convert Pandas Series to NumPy arrays
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

In [None]:
# Check the data types and shapes to verify they are suitable
print("x_train type:", type(x_train))
print("x_train shape:", x_train.shape)
print("y_train type:", type(y_train))
print("y_train shape:", y_train.shape)
print("x_test type:", type(x_test))
print("x_test shape:", x_test.shape)
print("y_test type:", type(y_test))
print("y_test shape:", y_test.shape)

x_train type: <class 'numpy.ndarray'>
x_train shape: (33673,)
y_train type: <class 'numpy.ndarray'>
y_train shape: (33673,)
x_test type: <class 'numpy.ndarray'>
x_test shape: (11225,)
y_test type: <class 'numpy.ndarray'>
y_test shape: (11225,)


In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)  # Assuming x_train is a list of text strings
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad sequences
x_train_padded = pad_sequences(x_train_sequences, maxlen=sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=sequence_length)

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Concatenate, GlobalMaxPooling1D, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Input layer
input_layer = Input(shape=(sequence_length,))

# Embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)

# Convolutional layers
conv1d = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
conv1d_1 = Conv1D(filters=128, kernel_size=4, activation='relu')(embedding)
conv1d_2 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding)

# MaxPooling layers
max_pooling1d = MaxPooling1D(pool_size=2)(conv1d)
max_pooling1d_1 = MaxPooling1D(pool_size=2)(conv1d_1)
max_pooling1d_2 = MaxPooling1D(pool_size=2)(conv1d_2)

# Concatenate max-pooled layers
concatenated = Concatenate(axis=1)([max_pooling1d, max_pooling1d_1, max_pooling1d_2])

# Additional Convolutional and MaxPooling layers
conv1d_3 = Conv1D(filters=128, kernel_size=3, activation='relu')(concatenated)
max_pooling1d_3 = MaxPooling1D(pool_size=2)(conv1d_3)
conv1d_4 = Conv1D(filters=128, kernel_size=3, activation='relu')(max_pooling1d_3)
max_pooling1d_4 = MaxPooling1D(pool_size=2)(conv1d_4)

# GlobalMaxPooling layer
global_max_pooling1d = GlobalMaxPooling1D()(max_pooling1d_4)

# Dense layers
dense = Dense(128, activation='relu')(global_max_pooling1d)
output_layer = Dense(1, activation='sigmoid')(dense)  # Assuming binary classification

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model (specify loss, optimizer, and metrics)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define model checkpoint
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

# Print a summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 8280)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 8280, 100)            7515200   ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 8278, 128)            38528     ['embedding[0][0]']           
                                                                                                  
 conv1d_1 (Conv1D)           (None, 8277, 128)            51328     ['embedding[0][0]']           
                                                                                              

In [None]:
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=64, callbacks=[early_stopping, model_checkpoint])

Epoch 1/10
Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d1759fdfc70>

In [None]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.004810821730643511, Test Accuracy: 0.9988418817520142


In [None]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+') #to getting alpha only
stemmer = SnowballStemmer("english")

# Tokenize and preprocess the new input data
new_input_data = [ "BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago Breaking News By Amy Moreno October 28, 2016. Once again, Trump was right. Back in August, in a statement regarding Hillaryâ€™s carelessness handling classified documents, Trump stated that he was concerned that Weiner had â€œaccessâ€ to information he shouldnâ€™t. Now that weâ€™re learning that the FBI discovered â€œnew emailsâ€ on a â€œdeviceâ€ associated to Weiner, it looks as if Trump was right AGAIN. â€” Deplorable AJ (@asamjulian) October 28, 2016 This is a movement â€“ we are the political OUTSIDERS fighting against the FAILED GLOBAL ESTABLISHMENT! Join the resistance and help us fight to put America First! Amy Moreno is a Published Author , Pug Lover & Game of Thrones Nerd. You can follow her on Twitter here and Facebook here . Support the Trump Movement and help us fight Liberal Media Bias. Please LIKE and SHARE this story on Facebook or Twitter.  "]
new_sequences = [wordopt(sentence) for sentence in new_input_data]

from keras.preprocessing.sequence import pad_sequences
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(new_sequences)  # Assuming x_train is a list of text strings
new_sequences = tokenizer1.texts_to_sequences(new_sequences)

# Pad sequences
new_sequences = pad_sequences(new_sequences, maxlen=sequence_length)

# new_sequences = [map_tokens_to_indices(sequence, embedding_matrix, sequence_length) for sequence in new_sequences]

# Convert 'new_sequences' to a NumPy array
new_data = np.array(new_sequences)

# Pad or truncate the sequences to match the sequence length
new_data = pad_sequences(new_data, maxlen=sequence_length, padding='post', truncating='post')

# Make predictions on the new data
predictions = model.predict(new_data)

# The 'predictions' array will contain probability scores for each class (0 and 1)
# You can convert these scores to class labels based on a threshold (e.g., 0.5)
predicted_labels = [1 if score >= 0.5 else 0 for score in predictions]

class_mapping = {0: "fake", 1: "true"}

# Use the mapping to transform the predicted labels
predicted_class_names = [class_mapping[label] for label in predicted_labels]

print(predicted_class_names)


['true']


In [None]:
from tensorflow.keras.models import load_model

# Load your trained CNN model
model1 = load_model('/content/best_model.h5')

In [None]:
# tokenizer = RegexpTokenizer(r'[A-Za-z]+') #to getting alpha only
# stemmer = SnowballStemmer("english")
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize and preprocess the new input data
new_input_data = ["The fallout from Ryan Lochteâ€™s story about being robbed at gunpoint in Rio  â€”   a tale the Brazilian police said was not true  â€”   continued Monday when four companies said they would end business partnerships with Mr. Lochte, an American swimmer and   Olympic medalist. After a week of intense international media attention and anger in Brazil, the financial repercussions were swift for Mr. Lochte as Speedo USA, the luxury clothing retailer Ralph Lauren and the mattress company Airweave all announced that they would part ways with him. And Syneron Candela, a company that sells   devices, told Reuters its relationship with the swimmer ended on Sunday. Speedo USA said in a message on Twitter that it would instead donate a $50, 000 portion of Mr. Lochteâ€™s fee to a charity to help Brazilian children. â€œWhile we have enjoyed a winning relationship with Ryan for over a decade and he has been an important member of the Speedo team, we cannot condone behavior that is counter to the values this brand has long stood for,â€ the company said in its statement. On Monday, Kim Angelastro, a spokeswoman for Syneron Candela, wrote in an email, â€œWe hold our employees to high standards, and we expect the same of our business partners. â€ Mr. Lochte was a spokesman for the companyâ€™s Gentle Hair Removal brand. Through a spokeswoman, Ralph Lauren said Monday that Mr. Lochteâ€™s endorsement agreement with the clothing company had been for only the 2016 Olympics, and that his contract would not be renewed. Airweave said on Twitter that â€œafter careful consideration, we have made the decision to end our partnership with Ryan Lochte. â€ The decisions to cut ties with Mr. Lochte, 32, were the first major signs of the financial fallout for him. For the past week, he has been at the center of an international firestorm after the Brazilian police said he and three other American athletes  â€”   Jimmy Feigen, Jack Conger and Gunnar Bentz  â€”   had fabricated the account of being robbed after a   party in Rio de Janeiro. The authorities said that the swimmers had instead drunkenly vandalized a gas station bathroom, paying a security guard about $50 for the damage before leaving. Mr. Lochte originally said that the car they were traveling in had been stopped by armed men, who held a gun to his head. But his story later changed. A Brazilian judge ordered the swimmers to remain in Rio, but Mr. Lochte had already left the country. After Mr. Conger and Mr. Bentz were pulled from their flight to the United States, they told the police that the confrontation began when Lochte pulled a poster off a gas station wall. Mr. Feigen, 26, later donated $10, 800 to a charity in Rio that teaches martial arts to poor children. Mr. Lochte first issued an apology on social media  â€”   â€œI should have been much more responsible for how I handled myself,â€ he wrote  â€”   then told Matt Lauer in an interview on NBC that he had been intoxicated and that he had â€œoverexaggerated that story. â€ He has maintained that he was held at gunpoint. â€œAll we know is that there was a gun pointed in our direction, and we were demanded to give money,â€ Mr. Lochte said. Mr. Lochte, whose boyish and sometimes oafish personality had made him a commercial success in Olympics past, had headed into Rio with fewer sponsors than heâ€™d had at the London Games, according to a report by CNN Money. Mr. Lochte took home a gold medal in the   freestyle relay in Rio.", "Republicans are working overtime trying to sell their scam of a tax bill to the public as something that directly targets middle-class and working-class families with financial relief. Nothing could be further from the truth, and they re getting hammered on that repeatedly. Speaking on CNBC, Paul Ryan was going full throttle, trying to convince us that the paltry savings we re getting is actually wait for it big money.But he didn t just go with the usual talking points. With a smug look that only someone who grew up in a wealthy family can muster when talking about that which he does not know, Ryan claimed that the $2,059 more per year that families living paycheck-to-paycheck will see is extremely significant. Then he decided he had to amend that to say such savings might be nothing to a family earning $600,000 per year (true), or for people living in New York or California (false).Those are the same two states that Trump s loyal subjects insist on stripping from the 2016 vote totals to claim that Trump actually won the popular vote. Watch Ryan completely dismiss all the struggling families living in blue states below:If you re living paycheck-to-paycheck which is more than half of the people in this country and you got #2059more from a tax cut next year, that s not nothing. pic.twitter.com/8TKtrMqRa1  Paul Ryan (@SpeakerRyan) December 21, 2017Someone needs to reach through their computer or television and wipe that smugness off his face. It is the height of arrogance and insult to imply that there are no struggling families in either of those two states.Featured image via Mark Wilson/Getty Images","BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago Breaking News By Amy Moreno October 28, 2016. Once again, Trump was right. Back in August, in a statement regarding Hillaryâ€™s carelessness handling classified documents, Trump stated that he was concerned that Weiner had â€œaccessâ€ to information he shouldnâ€™t. Now that weâ€™re learning that the FBI discovered â€œnew emailsâ€ on a â€œdeviceâ€ associated to Weiner, it looks as if Trump was right AGAIN. â€” Deplorable AJ (@asamjulian) October 28, 2016 This is a movement â€“ we are the political OUTSIDERS fighting against the FAILED GLOBAL ESTABLISHMENT! Join the resistance and help us fight to put America First! Amy Moreno is a Published Author , Pug Lover & Game of Thrones Nerd. You can follow her on Twitter here and Facebook here . Support the Trump Movement and help us fight Liberal Media Bias. Please LIKE and SHARE this story on Facebook or Twitter.  "]
new_sequences = [wordopt(sentence) for sentence in new_input_data]

from keras.preprocessing.sequence import pad_sequences
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(new_sequences)  # Assuming x_train is a list of text strings
new_sequences = tokenizer1.texts_to_sequences(new_sequences)

# Pad sequences
new_sequences = pad_sequences(new_sequences, maxlen=sequence_length)

# new_sequences = [map_tokens_to_indices(sequence, embedding_matrix, sequence_length) for sequence in new_sequences]

# Convert 'new_sequences' to a NumPy array
new_data = np.array(new_sequences)

# Pad or truncate the sequences to match the sequence length
new_data = pad_sequences(new_data, maxlen=sequence_length, padding='post', truncating='post')

# Make predictions on the new data
predictions = model1.predict(new_data)

# The 'predictions' array will contain probability scores for each class (0 and 1)
# You can convert these scores to class labels based on a threshold (e.g., 0.5)
predicted_labels = [1 if score >= 0.5 else 0 for score in predictions]

class_mapping = {0: "fake", 1: "true"}

# Use the mapping to transform the predicted labels
predicted_class_names = [class_mapping[label] for label in predicted_labels]

print(predicted_class_names)


NameError: ignored

In [None]:
import pickle
import joblib

# Load the pickled model from a file using joblib
loaded_model = joblib.load('/content/cnnfasttext_model.pkl')
# Load the pickled model from a file
# with open('/content/cnnfasttext_model.pkl', 'rb') as file:
#     loaded_model = pickle.load(file)

# Now, you can use the loaded_model for predictions
new_input_data = [ "BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago BREAKING : Trump Expressed Concern Over Anthony Weinerâ€™s â€œIllegal Accessâ€ to Classified Info 2 Months ago Breaking News By Amy Moreno October 28, 2016. Once again, Trump was right. Back in August, in a statement regarding Hillaryâ€™s carelessness handling classified documents, Trump stated that he was concerned that Weiner had â€œaccessâ€ to information he shouldnâ€™t. Now that weâ€™re learning that the FBI discovered â€œnew emailsâ€ on a â€œdeviceâ€ associated to Weiner, it looks as if Trump was right AGAIN. â€” Deplorable AJ (@asamjulian) October 28, 2016 This is a movement â€“ we are the political OUTSIDERS fighting against the FAILED GLOBAL ESTABLISHMENT! Join the resistance and help us fight to put America First! Amy Moreno is a Published Author , Pug Lover & Game of Thrones Nerd. You can follow her on Twitter here and Facebook here . Support the Trump Movement and help us fight Liberal Media Bias. Please LIKE and SHARE this story on Facebook or Twitter.  "]

predictions = loaded_model.predict(new_input_data)


KeyError: ignored

try1


In [None]:
from keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense, BatchNormalization, AveragePooling1D
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

# ... other parts of the code remain the same ...
input_layer = Input(shape=(sequence_length,))  # Replace 'sequence_length' with the desired sequence length

# Define embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)

# Define convolutional layers
conv_layers = []
filter_sizes = [3, 4, 5] # Example filter sizes
for filter_size in filter_sizes:
    conv_layer = Conv1D(filters=128, kernel_size=filter_size, activation='relu', padding='same')(embedding_layer)
    maxpool_layer = MaxPooling1D(pool_size=sequence_length - filter_size +1)(conv_layer)
    conv_layers.append(maxpool_layer)

# Concatenate the max-pooled layers
concatenated = Concatenate(axis=1)(conv_layers)

# Apply global max-pooling
global_maxpool = GlobalMaxPooling1D()(concatenated)

# Add a BatchNormalization layer
batchnorm_layer = BatchNormalization()(global_maxpool)

# Define a dense layer
dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(batchnorm_layer)
dense_layer = Dropout(0.5)(dense_layer)

# Define the output layer
output_layer = Dense(1, activation='softmax')(dense_layer)  # 'num_classes' is the number of classes

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model (specify loss, optimizer, and metrics)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print a summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 8280)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 8280, 100)            7507500   ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 8280, 128)            38528     ['embedding[0][0]']           
                                                                                                  
 conv1d_1 (Conv1D)           (None, 8280, 128)            51328     ['embedding[0][0]']           
                                                                                              

In [None]:
# Train the model
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fae6825a2f0>

In [None]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.03804456815123558, Test Accuracy: 0.4705567955970764


try2

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Concatenate, GlobalMaxPooling1D, Dense

# Input layer
input_layer = Input(shape=(sequence_length,))

# Embedding layer
embedding = Embedding(input_dim=300, output_dim=300)(input_layer)

# Convolutional layers
conv1d = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
conv1d_1 = Conv1D(filters=128, kernel_size=4, activation='relu')(embedding)
conv1d_2 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding)

# MaxPooling layers
max_pooling1d = MaxPooling1D(pool_size=2)(conv1d)
max_pooling1d_1 = MaxPooling1D(pool_size=2)(conv1d_1)
max_pooling1d_2 = MaxPooling1D(pool_size=2)(conv1d_2)

# Concatenate max-pooled layers
concatenated = Concatenate(axis=1)([max_pooling1d, max_pooling1d_1, max_pooling1d_2])

# Additional Convolutional and MaxPooling layers
conv1d_3 = Conv1D(filters=128, kernel_size=3, activation='relu')(concatenated)
max_pooling1d_3 = MaxPooling1D(pool_size=2)(conv1d_3)
conv1d_4 = Conv1D(filters=128, kernel_size=3, activation='relu')(max_pooling1d_3)
max_pooling1d_4 = MaxPooling1D(pool_size=2)(conv1d_4)

# GlobalMaxPooling layer
global_max_pooling1d = GlobalMaxPooling1D()(max_pooling1d_4)

# Dense layers
dense = Dense(128, activation='relu')(global_max_pooling1d)
output_layer = Dense(1, activation='sigmoid')(dense)  # Assuming binary classification

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model (specify loss, optimizer, and metrics)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print a summary of the model
model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 8280)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 8280, 300)            90000     ['input_3[0][0]']             
                                                                                                  
 conv1d_8 (Conv1D)           (None, 8278, 128)            115328    ['embedding_2[0][0]']         
                                                                                                  
 conv1d_9 (Conv1D)           (None, 8277, 128)            153728    ['embedding_2[0][0]']         
                                                                                            

In [None]:
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fae7a87eb00>

In [None]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.006609044503420591, Test Accuracy: 0.9989309310913086


In [None]:
input_layer = Input(shape=(sequence_length,))  # Replace 'sequence_length' with the desired sequence length

# Define embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)

# Define convolutional layers
conv_layers = []
filter_sizes = [3, 4, 5]  # Example filter sizes
for filter_size in filter_sizes:
    conv_layer = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedding_layer)
    maxpool_layer = MaxPooling1D(pool_size=sequence_length - filter_size +1)(conv_layer)
    conv_layers.append(maxpool_layer)

# Concatenate the max-pooled layers
concatenated = Concatenate(axis=1)(conv_layers)

# Apply global max-pooling
global_maxpool = GlobalMaxPooling1D()(concatenated)

# Define a dense layer
dense_layer = Dense(128, activation='relu')(global_maxpool)
dense_layer = Dropout(0.5)(dense_layer)

# Define the output layer
output_layer = Dense(1, activation='softmax')(dense_layer)  # 'num_classes' is the number of classes

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model (specify loss, optimizer, and metrics)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print a summary of the model
model.summary()

Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_37 (InputLayer)       [(None, 8280)]               0         []                            
                                                                                                  
 embedding_36 (Embedding)    (None, 8280, 100)            7483400   ['input_37[0][0]']            
                                                                                                  
 conv1d_80 (Conv1D)          (None, 8278, 128)            38528     ['embedding_36[0][0]']        
                                                                                                  
 conv1d_81 (Conv1D)          (None, 8277, 128)            51328     ['embedding_36[0][0]']        
                                                                                           

In [None]:

model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=64, callbacks=[early_stopping, model_checkpoint])

NameError: ignored

In [None]:
history = model.fit(x_train, y_train, batch_size = 256 , epochs = 5)

Epoch 1/5


UnimplementedError: ignored

In [None]:
  def cnn_net1():
    model = Sequential()

    #Non-trainable embeddidng layer
    model.add(Embedding(vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=sequence_length, trainable=False))

    model.add(Dropout(0.2))
    model.add(Conv1D(filters=128, kernel_size=, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.2))
    model.add(Dense(units = 250 , activation = 'relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
model = cnn_net1()

batch_size = 256
epochs = 8

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1000, 100)         7524100   
                                                                 
 dropout_6 (Dropout)         (None, 1000, 100)         0         
                                                                 
 conv1d_5 (Conv1D)           (None, 1000, 128)         12928     
                                                                 
 global_max_pooling1d_3 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 250)               32250     
                                                      

In [None]:
# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define model checkpoint
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)


In [None]:
sequence_length = 100  # Adjust as per your requirements

# Function to map tokens to indices, handle OOV tokens
def map_tokens_to_indices(tokens, embedding_model, sequence_length):
    indices = []
    for token in tokens:
        if token in embedding_model:
            indices.append(embedding_model.get_word_id(token))
        else:
            # Handle out-of-vocabulary (OOV) tokens
            indices.append(0)  # You can use index 0 for OOV tokens or another suitable index
    # Ensure the sequence has the desired length
    if len(indices) < sequence_length:
        indices += [0] * (sequence_length - len(indices))
    return indices

# Map tokens to indices for each data point
x_data = data['text'].apply(lambda x: map_tokens_to_indices(x.split(), embedding_matrix, sequence_length))

# Convert the list of indices to a NumPy array
x_data = np.array(x_data.tolist())



  if token in embedding_model:
  x_data = np.array(x_data.tolist())


In [None]:
# Split your data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_data, data['class'], test_size=0.2, random_state=42)

In [None]:
# Define the CNN model
model = Sequential()
model.add(Embedding(
    input_dim=embedding_matrix.shape[0],  # Vocabulary size
    output_dim=embedding_matrix.shape[1],  # Embedding dimension
    weights=[embedding_matrix],  # Use your FastText embedding matrix
    input_length=sequence_length,
    trainable=True  # Set to True if you want to fine-tune embeddings
))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
x_train = np.array(x_train)
x_test = np.array(x_test)

# Ensure that the sequences have the desired length
# If they are shorter than the desired sequence_length, you can pad them
from keras.preprocessing.sequence import pad_sequences

x_train = pad_sequences(x_train, maxlen=sequence_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=sequence_length, padding='post', truncating='post')

In [None]:
y_train = y_train.apply(lambda label: 0 if label == 'fake' else 1)
y_test = y_test.apply(lambda label: 0 if label == 'fake' else 1)

In [None]:
# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a07e9ed8970>

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 3.7673216732618184e-14, Test Accuracy: 1.0


In [None]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

tokenizer = RegexpTokenizer(r'[A-Za-z]+')#to getting alpha only
stemmer = SnowballStemmer("english")
# Load your trained CNN model
# model = load_model('your_trained_model.h5')

def preprocess_text(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Apply stemming to each token
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    # Join the stemmed tokens back into a single string
    preprocessed_text = ' '.join(stemmed_tokens)
    return preprocessed_text

def map_tokens_to_indices(sequence, embedding_matrix, sequence_length):
    # Implement the logic to map tokens to numerical indices
    indices = [embedding_matrix[token] for token in sequence]

    # Pad or truncate the indices to match the sequence_length
    if len(indices) < sequence_length:
        # Pad with zeros
        indices += [0] * (sequence_length - len(indices))
    elif len(indices) > sequence_length:
        # Truncate the sequence
        indices = indices[:sequence_length]

    return indices


# Tokenize and preprocess the new input data
new_input_data = ["NEW YORK/WASHINGTON (Reuters) - The new U.S. tax code targets high-tax states and may be unconstitutional, New York Governor Andrew ", "The number of cases of cops brutalizing and killing people of color seems to see no end. Now, we have another case that needs to be shared far and wide. An Alabama woman by the name of Angela Williams shared a graphic photo of her son, lying in a hospital bed with a beaten and fractured face, on Facebook. It needs to be shared far and wide, because this is unacceptable.It is unclear why Williams  son was in police custody or what sort of altercation resulted in his arrest, but when you see the photo you will realize that these details matter not. Cops are not supposed to beat and brutalize those in their custody. In the post you are about to see, Ms. Williams expresses her hope that the cops had their body cameras on while they were beating her son, but I think we all know that there will be some kind of convenient  malfunction  to explain away the lack of existence of dash or body camera footage of what was clearly a brutal beating. Hell, it could even be described as attempted murder. Something tells me that this young man will never be the same. Without further ado, here is what Troy, Alabama s finest decided was appropriate treatment of Angela Williams  son:No matter what the perceived crime of this young man might be, this is completely unacceptable. The cops who did this need to rot in jail for a long, long time   but what you wanna bet they get a paid vacation while the force  investigates  itself, only to have the officers returned to duty posthaste?This, folks, is why we say BLACK LIVES MATTER. No way in hell would this have happened if Angela Williams  son had been white. Please share far and wide, and stay tuned to Addicting Info for further updates.Featured image via David McNew/Stringer/Getty Images"]
new_sequences = [preprocess_text(sentence) for sentence in new_input_data]
sequence_length = 100  # Make sure it matches your training data

new_sequences = [map_tokens_to_indices(sequence, embedding_matrix, sequence_length) for sequence in new_sequences]

# Convert 'new_sequences' to a NumPy array
new_data = np.array(new_sequences)

# Pad or truncate the sequences to match the sequence length
new_data = pad_sequences(new_data, maxlen=sequence_length, padding='post', truncating='post')


# Make predictions on the new data
predictions = model.predict(new_data)

# The 'predictions' array will contain probability scores for each class (0 and 1)
# You can convert these scores to class labels based on a threshold (e.g., 0.5)
predicted_labels = [1 if score >= 0.5 else 0 for score in predictions]

class_mapping = {0: "fake", 1: "true"}

# Use the mapping to transform the predicted labels
predicted_class_names = [class_mapping[label] for label in predicted_labels]

print(predicted_class_names)

IndexError: ignored