# Load data
### Link : https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?resource=download&select=True.csv

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn import set_config; set_config(display='diagram')
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import os
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential

from tensorflow.keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaspicot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
fake = pd.read_csv('./Fake.csv')
true = pd.read_csv('./True.csv')

FileNotFoundError: [Errno 2] No such file or directory: './Fake.csv'

In [None]:
fake.head()

# Preparing the data for preprocessing

In [None]:
# Tag each source frame with the target: 1 marks real articles, 0 marks fake ones.
true['true'], fake['true'] = 1, 0

# Stack the fake rows first, then the real rows. ignore_index renumbers the result
# 0..n-1, which is what concatenating and then resetting the index produced.
data = pd.concat([fake, true], ignore_index=True)

In [None]:
# The data is balanced

data.true.value_counts() / len(data) *100

In [None]:
# Check true fake news along each subject
# From the chart we see that subject column isn't important so i'll drop it

sns.countplot(x = 'subject', hue = 'true', data = data)
plt.xticks(rotation = 'vertical')
plt.legend(loc = 'upper center');

In [None]:
# Merging text and title columns in one.
# The merged text goes into a new Series instead of being written back into
# data['text'], so re-running this cell does not prepend the title a second time.
merged_text = data['title'] + " " + data['text']

# Creating a dataframe of text and true columns only (so after this point df is our
# dataframe). It keeps the index of `data`.
df = pd.DataFrame({'text': merged_text, 'true': data['true']})

### Now we have a dataframe that contains one feature, 'text', and the target, 'true'. The next step is to clean the text column by lowercasing all letters, removing digits, removing punctuation, and stripping extra whitespace.

# Preprocessing

In [None]:
def cleaning(sentence):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lower-case the text, drop every character for which
    ``str.isdigit`` is true, delete all ``string.punctuation`` characters,
    then trim leading and trailing whitespace.

    Parameters
    ----------
    sentence : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = sentence.lower()

    # Filter with isdigit() rather than a fixed 0-9 set, as the check is per character.
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), lowered))

    # A single translate() pass deletes every character listed in string.punctuation.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    return without_digits.translate(drop_punctuation).strip()

# Apply the function on the dataframe using pd.map

df['text'] = df['text'].map(cleaning)

In [None]:
# Originally added for Colab.
# NOTE(review): prep() below calls word_tokenize, which presumably needs the 'punkt'
# tokenizer data on any fresh environment — confirm before removing this cell.
import nltk
nltk.download('punkt')

In [None]:
# Tokenize texts and remove stop words
stop_words = set(stopwords.words('english'))


def prep(sentence):
    """Split a text into word tokens and drop English stop words.

    Parameters
    ----------
    sentence : str
        Text to tokenize (in this notebook, the output of ``cleaning``).

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in ``stop_words``,
        in their original order.
    """
    # NOTE(review): cleaning() strips punctuation before this runs, so stop words that
    # contain an apostrophe presumably never match — confirm whether that matters.
    return [token for token in word_tokenize(sentence) if token not in stop_words]

df['text'] = df['text'].map(prep)

In [None]:
# Take a look at the length of each text
# Here for evey row i'm getting its length(how many words it contains)

sns.histplot(df['text'].map(lambda x : len(x)))

In [None]:
# Box plot of the number of tokens per article.
# The Series is passed as x= so the box is drawn along the x axis and the word-count
# ticks set below land on the value axis; recent seaborn versions treat the first
# positional argument as `data`, not `x`.
sns.boxplot(x=df['text'].map(len))
plt.xticks([i*500 for i in range(10)]);

## We can see that most of the texts have fewer than 500 words, so to avoid large and useless padding I'll remove the rows that contain more than 500 words

In [None]:
# Create a column for the number of words in each row
df['num_words'] = df['text'].map(len)

# Upper bound on article length (in tokens); longer rows are dropped to limit padding.
MAX_WORDS = 500

# Number of rows above the limit (3125 in the original run). Printed explicitly
# because a bare expression in the middle of a cell is not displayed.
print(len(df[df['num_words'] > MAX_WORDS]))

# Getting rid of rows that have more than MAX_WORDS words.
# .copy() makes the filtered frame independent of the frame it was sliced from.
df = df[df['num_words'] <= MAX_WORDS].copy()

In [None]:
# Dropping the num_words columns because i just used it to filter out data with large number of words

del df['num_words']

In [None]:
df.head()

 Up to this point we have cleaned the data, so we have one feature, 'text', which is a list of words, and the target.
Now I'll split the data into train and test sets, then fit a tokenizer on the training set and use it to transform both the training and the testing sets. 

## Tokenization

In [None]:
# Splitting the data: 70% train / 30% test.
# random_state is fixed so that Restart & Run All reproduces the same split.

X = df['text']
y = df['true']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train

In [None]:
# Initializing the tokenizer (all default options)

tokenizer = Tokenizer()

# Fitting learns a dictionary that maps each word to an integer token.
# It is fitted on the train set only - we are not supposed to know the test set!
# NOTE(review): X_train holds lists of tokens (output of prep), not raw strings; the
# tokenizer's lowercasing presumably still applies but its character filters may not
# run on pre-tokenized input - check the Tokenizer documentation.

tokenizer.fit_on_texts(X_train)

# Convert both sets to integer sequences using the dictionary learned on the train set.
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped from the test sequences - confirm.

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [None]:
# Calculating the number of different words in the training set

vocab_size = len(tokenizer.word_index)

In [None]:
vocab_size


## Padding to make the input of the same length

In [None]:
# Pad (at the end) or truncate every sequence to a fixed length so the model receives
# a rectangular array. Token ids are integer indices into the Embedding table, so the
# arrays are kept as int32 rather than float32.
# NOTE(review): rows were filtered at 500 words but maxlen is 185; with the default
# truncating='pre', longer sequences presumably lose their first tokens - confirm
# that this is intended.
X_tr = pad_sequences(X_train_token, dtype='int32', padding='post', maxlen=185)

X_te = pad_sequences(X_test_token, dtype='int32', padding='post', maxlen=185)

In [None]:
X_tr.shape

# Creating our model

- Embedding layer whose input_dim is the size of the vocabulary + 1 (to account for the 0 added by padding), and whose output_dim is the size of the embedding space you want to have
- RNN (SimpleRNN, LSTM, GRU) layer
- Dense layer
- Output layer

In [None]:
#Zein's model

from tensorflow.keras import regularizers

# Weight regularizers with strength 0.01. reg_l2 is defined but not used by any
# layer below.
reg_l1 = regularizers.L1(0.01)
reg_l2 = regularizers.L2(0.01)

# The network is assembled layer by layer:
# Embedding -> LSTM -> (Dense + Dropout) twice -> sigmoid output.
model = Sequential()

# input_dim is vocab_size + 1 because index 0 is used by padding; mask_zero=True
# tells downstream layers to ignore those padded positions.
model.add(layers.Embedding(input_dim=vocab_size + 1, output_dim=30, mask_zero=True))
model.add(layers.LSTM(10))

# Two small L1-regularized dense layers, each followed by 30% dropout.
model.add(layers.Dense(10, activation='relu', kernel_regularizer=reg_l1))
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(8, activation='relu', kernel_regularizer=reg_l1))
model.add(layers.Dropout(rate=0.3))

# Single sigmoid unit for the binary target.
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)


In [None]:
# Early stopping and train the model
#
# EarlyStopping watches val_loss by default, so fit() must produce validation
# metrics: part of the training data is held out through validation_split.
# patience=2 stops training after two epochs without improvement, and
# restore_best_weights rolls the model back to its best epoch. With early stopping
# active, `epochs` is only an upper bound (the previous epochs=2 could never trigger
# a patience of 2).

es = EarlyStopping(patience=2, restore_best_weights=True)

model.fit(
    X_tr,
    y_train.to_numpy(),  # plain array so validation_split can slice the targets
    validation_split=0.2,
    epochs=20,
    callbacks=[es],
)

In [None]:
model.evaluate(X_te, y_test)

# **Model try with a new dataset**

In [None]:
new_dataset = pd.read_csv('./Train.csv')


In [None]:
def changing_target(df):
    """Add a binary ``category`` column derived from the ``Labels`` column.

    Rows whose ``Labels`` value is 1 get category 0, rows whose value is 5 get
    category 1. Rows with any other label are not assigned here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Labels`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Original label -> binary category, applied in this order.
    label_to_category = {1: 0, 5: 1}
    for label, category in label_to_category.items():
        has_label = df["Labels"] == label
        df.loc[has_label, "category"] = category
    return df

In [None]:
df_with_target = changing_target(new_dataset)

In [None]:
df_cleaned = df_with_target.drop(columns='Labels')

In [None]:
sns.set_style("darkgrid")
# Count rows per category. The column is passed by keyword (x= together with data=),
# like the earlier countplot on 'subject'; recent seaborn versions no longer read a
# bare positional Series as x.
sns.countplot(x='category', data=df_cleaned);

In [None]:
# changing_target only assigns a category to labels 1 and 5, so rows with any other
# label have a missing 'category' and are removed here. Note that dropna() with no
# arguments also removes rows that have a missing value in any other column.
df_cleaned = df_cleaned.dropna() # drop the labels 2,3,4,0

In [None]:
# The data is balanced

df_cleaned.category.value_counts() / len(df_cleaned) *100

In [None]:
# Merging the 'Text_Tag' and 'Text' columns into a single 'Text' column

df_cleaned['Text'] = df_cleaned['Text_Tag'] + " " + df_cleaned['Text']



In [None]:
# Keep only the feature 'Text' and the target 'category'
df_cleaned = df_cleaned.loc[:, ['Text', 'category']] 

In [None]:


# Apply the function on the dataframe using pd.map

df_cleaned['Text'] = df_cleaned['Text'].map(cleaning)

In [None]:
df_cleaned['Text'][12] ## it worked

In [None]:
df_cleaned['Text'] = df_cleaned['Text'].map(prep)

In [None]:
df_cleaned['Text'].map(lambda x : len(x)).max()

In [None]:
# Separate the feature (token lists) from the target. There is no train/test split
# here: the whole new dataset is passed to model.evaluate further down.

X_new_data = df_cleaned['Text']             
y_new_data = df_cleaned['category'] 

In [None]:
# Convert the new dataset to integer sequences with the tokenizer that was already
# fitted on the training set (it is not re-fitted here).
# NOTE: X_new_data is overwritten, so this cell must not be re-run on its own output.

X_new_data = tokenizer.texts_to_sequences(X_new_data)

In [None]:
## Pad/truncate the new data to length 185, the same maxlen used for X_tr and X_te.
## Token ids are integer indices, so the array is kept as int32 rather than float32.

X_new_data_padded = pad_sequences(X_new_data, dtype='int32', padding='post', maxlen=185)



In [None]:
X_new_data_padded.shape

In [None]:
model.evaluate(X_new_data_padded, y_new_data)

# Merge the 2 cleaned datasets (from 'Hackathon' and from 'Fake and real news dataset')

In [None]:
X_new_data_padded.shape ##data from 'Hackaton'

In [None]:
X_tr.shape # train data from 'Fake and real news dataset'

In [None]:
X_te.shape # test data from 'Fake and real news dataset'

*Don't forget to also merge the 3 targets: y_train, y_test and y_new_data.*

In [None]:
y_train.shape, y_test.shape, y_new_data.shape

Marco sends me a big dataset of 65 829 rows, with data and target. I have to split it between data and target y. After, I need to split between train and test. After I need to tokenize. And after I need to run the model on the new big dataset.

*the number of rows we should have in our big dataset at the end from the 3 datasets from kaggle*

In [None]:
nb_rows = 3669 + 29241 + 12532 + 20387
nb_rows 

*and try to add even more datasets because a lot of them are available on Kaggle*