# Task

"""
Dataset:

Use accuracy and a class-wise confusion matrix as metrics for multi-class classification.
Use accuracy, precision, recall, F1 score, and a confusion matrix as metrics for binary classification.
Report hyperparameters for all deep models, like learning rate, optimiser, number of epochs, and scheduler.
Show train/val loss and accuracy plots for deep neural networks.


Tasks:

Define your own train-val-test split.
Define a text preprocessing pipeline, i.e., stopword removal, lower casing, punctuation removal etc. [Report your text preprocessing pipeline in the report.]
Developing ML
Count vectorizer features.
TF-IDF features.
Model a decision tree with TF-IDF features. [Compare with 3.a.ii]

Developing Deep neural networks:

Implement Any transformer model.

Use the FastAPI framework.

"""

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [None]:
# Load the two source CSVs from the working directory: `false` receives the
# rows of Fake.csv and `true` the rows of True.csv.
false = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')
# Preview the first rows of the frame loaded from Fake.csv.
false.head()

FileNotFoundError: ignored

In [None]:
# Column names, dtypes and non-null counts of the Fake.csv frame.
false.info()

In [None]:
# Preview the first rows of the True.csv frame.
true.head()

In [None]:
# Column names, dtypes and non-null counts of the True.csv frame.
true.info()

In [None]:
# Summary statistics for the True.csv frame.
true.describe()

In [None]:
# Summary statistics for the Fake.csv frame.
false.describe()

In [None]:
# Drop rows that are exact duplicates across all columns; each name is
# rebound to the de-duplicated copy.
false = false.drop_duplicates()
true = true.drop_duplicates()

In [None]:
# How many Fake.csv rows fall under each `subject` value.
false['subject'].value_counts()

In [None]:
# How many True.csv rows fall under each `subject` value.
true['subject'].value_counts()

In [None]:
# df['date'] = pd.to_datetime(df['date'], errors='coerce')
# df['date'] = df['date'].dt.strftime('%Y-%m-%d')
# df[df['date'].isnull()].index
# # Int64Index([9358, 15507, 15508, 15839, 15840, 17432, 17433, 18933, 21869, 21870] with invalid dates
# df = df.dropna(subset=['date'])
# df['date'] = pd.to_datetime(df['date'])

In [None]:
# fake[fake.duplicated(subset=['title', 'text', 'date'])]

In [None]:
true['category'] = 1
false['category'] = 0

In [None]:
df = pd.concat([true,false]).reset_index() #Merging the 2 datasets

In [None]:
# Display the label column of the merged frame.
df.category

In [None]:
# Bar chart of the number of rows per label value.
sns.set_style("darkgrid")
sns.countplot(data=df, x='category')

In [None]:
# Number of missing values in each column of the merged frame.
df.isnull().sum()

In [None]:
# Grouped bar chart: rows per `subject`, split by label (`category`).
plt.figure(figsize = (12,8))
sns.set(style = "whitegrid",font_scale = 1.2)
chart = sns.countplot(x = "subject", hue = "category" , data = df)
# Re-apply the existing tick labels rotated 90 degrees so the subject names
# do not overlap.
chart.set_xticklabels(chart.get_xticklabels(),rotation=90)

In [None]:
# Append each headline to its article body (body, a single space, then the
# title) so one column carries all of the text.
df['text'] = df['text'] + " " + df['title']
# Remove the title, subject and date columns from the frame in place.
df.drop(columns=['title', 'subject', 'date'], inplace=True)

In [None]:
import nltk
nltk.download('stopwords')
# Filter set = NLTK's English stopwords plus every character in
# string.punctuation. Note that `punctuation` is rebound to a list here.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [None]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# nltk.word_tokenize relies on the Punkt tokenizer data, which is a separate
# download from the stopword list. Fetch it up front so the first call does
# not fail with a LookupError on a fresh environment. Newer NLTK releases
# look the data up under the name "punkt_tab", so both names are requested.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Compiled once here rather than on every row passed to preprocess_text.
_BRACKETED_RE = re.compile(r'\[.*?\]')
# `http\S+` already covers https links, so no separate alternative is needed.
_URL_RE = re.compile(r'http\S+|www\S+')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalise one raw document into a space-joined string of stems.

    Steps, in order: lowercase; drop anything inside square brackets; drop
    URLs; drop every character that is not an ASCII letter or whitespace;
    tokenize with NLTK; remove English stopwords; Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (for example the NaN that results
        from concatenating a missing field) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing survives
        or when the input was not a string.
    """
    # Missing values reach this function as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_ALPHA_RE.sub('', text)

    # The stopword check runs on the unstemmed token, then the token is stemmed.
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(words)


# Apply text preprocessing to the 'text' column
df['text'] = df['text'].apply(preprocess_text)

In [None]:
# Word cloud built from every document labelled 1 (not fake).
real_corpus = " ".join(df.loc[df.category == 1, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1600, height=800, stopwords=STOPWORDS)
wc = wc.generate(real_corpus)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from every document labelled 0 (fake).
fake_corpus = " ".join(df.loc[df.category == 0, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=2000, width=1600, height=800, stopwords=STOPWORDS)
wc = wc.generate(fake_corpus)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Side-by-side histograms of document length in characters, one per label.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
panels = (
    (ax1, 1, 'red', 'Original text'),
    (ax2, 0, 'green', 'Fake text'),
)
for axis, label_value, colour, heading in panels:
    text_len = df.loc[df['category'] == label_value, 'text'].str.len()
    axis.hist(text_len, color=colour)
    axis.set_title(heading)
fig.suptitle('Characters in texts')
plt.show()

In [None]:
# Side-by-side histograms of document length in whitespace-separated words,
# one per label.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
panels = (
    (ax1, 1, 'red', 'Original text'),
    (ax2, 0, 'green', 'Fake text'),
)
for axis, label_value, colour, heading in panels:
    text_len = df.loc[df['category'] == label_value, 'text'].str.split().map(len)
    axis.hist(text_len, color=colour)
    axis.set_title(heading)
fig.suptitle('Words in texts')
plt.show()

In [None]:
# Distribution of the mean word length per document, one panel per label.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is the axes-level replacement that draws the
# same kind of density histogram with a KDE overlay.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# Label 1: list of word lengths per document, then the mean of each list.
word = df[df['category'] == 1]['text'].str.split().apply(lambda x: [len(i) for i in x])
# A document with no tokens has no mean word length; leave it out rather
# than letting np.mean([]) emit a warning and produce NaN.
real_avg = word.map(lambda lens: np.mean(lens) if lens else np.nan).dropna()
sns.histplot(real_avg, kde=True, stat='density', ax=ax1, color='red')
ax1.set_title('Original text')

# Label 0: same computation for the other class.
word = df[df['category'] == 0]['text'].str.split().apply(lambda x: [len(i) for i in x])
fake_avg = word.map(lambda lens: np.mean(lens) if lens else np.nan).dropna()
sns.histplot(fake_avg, kde=True, stat='density', ax=ax2, color='green')
ax2.set_title('Fake text')

fig.suptitle('Average word length in each text')

In [None]:
## Divide the dataset into Train and Test
# 33% of the rows are held out for testing; random_state=0 fixes the shuffle
# so the split is reproducible. No separate validation set is created here.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category'], test_size=0.33, random_state=0)

In [None]:
## TFidf Vectorizer
# TF-IDF over uni-, bi- and tri-grams, capped at 5000 features.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,3))
# The vectorizer is fitted on the training texts only and then applied to
# the test texts. Both names are rebound from raw text to the vectorized
# result, so the raw split is no longer reachable through X_train / X_test.
X_train=tfidf_v.fit_transform(X_train)
X_test= tfidf_v.transform(X_test)
# Full label column of the merged frame (not used by the lines above).
y=df['category']

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix with true labels on rows and predicted labels on
        columns.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, each row is divided by its sum before plotting, so both
        the colours and the cell annotations show per-true-class ratios.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is drawn through the pyplot state machine; one status line
        is printed saying whether normalisation was applied.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heat map, the colour bar and the cell
    # labels all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row total; keep that row
        # at 0 instead of dividing by zero.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the label stays legible.
    thresh = cm.max() / 2.
    # np.ndindex walks (row, col) pairs in row-major order, so the function
    # does not depend on itertools having been imported beforehand.
    for i, j in np.ndindex(cm.shape):
        value = cm[i, j]
        # Ratios are shown with two decimals; raw counts are shown as-is.
        label = format(value, '.2f') if normalize else str(value)
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier constructed with its default settings.
classifier=MultinomialNB()

In [None]:
from sklearn import metrics
import numpy as np
import itertools

In [None]:
# Train on the training features, predict the held-out set, then report
# accuracy and plot the confusion matrix (label 0 -> FAKE, label 1 -> REAL).
pred = classifier.fit(X_train, y_train).predict(X_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = confusion_matrix(y_test, pred)
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# 5-fold stratified cross-validation of the TF-IDF + classifier combination
# on the full preprocessed dataset. Folds are shuffled with a fixed seed.
# NOTE(review): cross_val_score is imported but not used in this cell.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = df['text']
y = df['category']
# Per-fold accuracy values are collected here.
cv_scores = []
tfidf_vectorizer=TfidfVectorizer(max_features=5000,ngram_range=(1,3))

# NOTE(review): the loop rebinds the notebook-wide names X_train, X_test,
# y_train and y_test and refits the shared `classifier`; after it finishes
# they all refer to the last fold, not to the earlier hold-out split.
for train_index, test_index in kfold.split(X, y):
    # Positional row selection for this fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The vectorizer is refitted on this fold's training texts only.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # The fold's test texts are transformed with that same fitted vectorizer.
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Fit the classifier on the TF-IDF transformed training data
    classifier.fit(X_train_tfidf, y_train)

    # Make predictions on the TF-IDF transformed test data
    pred = classifier.predict(X_test_tfidf)

    # Calculate accuracy for this fold
    score = metrics.accuracy_score(y_test, pred)
    cv_scores.append(score)

# Mean and standard deviation of the per-fold accuracies, printed to three
# decimals.
mean_score = np.mean(cv_scores)
std_score = np.std(cv_scores)
print(f"Mean accuracy: {mean_score:.3f}")
print(f"Standard deviation: {std_score:.3f}")