# Notes
* original dataset: https://www.kaggle.com/competitions/fake-news/data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import seaborn as sns
%matplotlib inline


import os
import pickle

# timing utilities
from timeit import default_timer as timer
from datetime import timedelta


In [None]:
# Directories for the raw competition files and for the pickled models.
input_path = "./data/"
output_path = "./models/"

# Read the three competition files in one pass. `submit` holds sample labels
# and appears to exist only to show the expected format of test predictions.
submit, test, train = (
    pd.read_csv(input_path + file_name)
    for file_name in ("submit.csv", "test.csv", "train.csv")
)

# labels

## Data preprocessing

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# only run to download resources; can specify download directory with argument `download_dir`
# import nltk
# nltk.download("wordnet")
# nltk.download("stopwords")

In [None]:
def clean_text(df, text, stop_words=True, lemmatize=True):
    """Clean text column of a dataframe.

    Duplicate rows and rows whose `text` value is missing are dropped. Every
    character that is not an ASCII letter (digits and punctuation included)
    is replaced by a space, the text is lower-cased, and runs of whitespace
    collapse to single spaces. Optionally, English stop words are removed and
    the remaining words are lemmatized to a common root.

    Args:
        df (DataFrame) : Dataframe with a text column to clean. It is not
            modified; the cleaning is done on a copy.
        text (str) : Name of text column in `df`.
        stop_words (bool, default True) : If True, remove English stopwords ("the", "an", etc.).
        lemmatize (bool, default True) : If True, lemmatize words to common root.

    Returns:
        (DataFrame) : A new dataframe (duplicates and rows without text
            removed, original index labels kept) whose `text` column holds
            the cleaned text.
    """

    # Basic error handling. The explicit copy guarantees that the column
    # assignments below write to our own frame and never trigger pandas'
    # SettingWithCopyWarning.
    df = df.drop_duplicates().dropna(subset=[text]).copy()

    # Eliminate weird characters (numbers are dropped too, for BERT) and
    # normalise the case.
    df[text] = df[text].str.replace(r"[^a-zA-Z]", " ", regex=True).str.lower()
    # Drop any rows left without text after the string operations.
    df = df.dropna(subset=[text]).copy()

    tokens = df[text].str.split()

    if stop_words:
        # A set gives O(1) membership tests; scanning the stop-word list once
        # per word is needlessly slow on long articles.
        sw = set(stopwords.words("english"))
        tokens = tokens.apply(lambda row_words: [word for word in row_words if word not in sw])

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = tokens.apply(lambda row_words: [lemmatizer.lemmatize(word) for word in row_words])

    # Re-assemble each token list into a single space-separated string.
    df[text] = tokens.apply(" ".join)

    return df

In [None]:
# Clean the training articles; stop-word removal and lemmatization are both
# switched off for this run.
train_clean = clean_text(train, "text", stop_words=False, lemmatize=False)

In [None]:
# Preview the first ten cleaned articles.
train_clean.text.head(10)

## Word Cloud to view contents

In [None]:
from wordcloud import WordCloud

# Concatenate all articles of each class into one long space-separated string
# (label 0 = real, label 1 = fake) to feed the word clouds.
is_real = train_clean.label == 0
is_fake = train_clean.label == 1

real_text = " ".join(train_clean.loc[is_real, "text"].astype(str))
fake_text = " ".join(train_clean.loc[is_fake, "text"].astype(str))

Word Cloud of Real Articles

In [None]:
# Render the word cloud built from the text of all real articles.
real_wc = WordCloud().generate(real_text)
plt.figure(figsize=(40, 20))
plt.imshow(real_wc, interpolation='bilinear')
plt.title("Word Cloud of Real Articles")
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn rather than on the still-empty figure.
plt.tight_layout(pad=0)
plt.show()

Word Cloud of Fake Articles

In [None]:
# Render the word cloud built from the text of all fake articles.
fake_wc = WordCloud().generate(fake_text)
plt.figure(figsize=(40, 20))
plt.imshow(fake_wc, interpolation='bilinear')
plt.title("Word Cloud of Fake Articles")
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn rather than on the still-empty figure.
plt.tight_layout(pad=0)
plt.show()

## Model Training 

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Features are the cleaned article text; the target is the label column.
X, y = train_clean.text, train_clean.label

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier

# Time the whole grid search over the TF-IDF -> SVD -> XGBoost pipeline.
start = timer()

pipe_tfidf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("svd", TruncatedSVD(n_components=4000, random_state=42)),
        ("xgb", XGBClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid; keys follow the "<step>__<parameter>" convention.
# An extra entry, "svd__n_components": [5000], is left disabled.
params_tfidf = {
    "tfidf__analyzer": ["word"],
    "tfidf__ngram_range": [(1, 1)],
    "xgb__n_estimators": [10, 50],
    "xgb__max_depth": [6, 10],
}

# Several metrics are recorded per candidate; the final refit uses accuracy.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

gs_tfidf = GridSearchCV(
    pipe_tfidf,
    param_grid=params_tfidf,
    cv=5,
    scoring=scoring,
    refit="accuracy",
)
gs_tfidf.fit(X_train, y_train)

end = timer()
print(f"Time elapsed: {timedelta(seconds=end-start)}")

In [None]:
# Inspect the cross-validation results recorded by the TF-IDF grid search.
gs_tfidf.cv_results_

In [None]:
# Total share of variance retained by the SVD step of the best pipeline.
fitted_steps = gs_tfidf.best_estimator_.named_steps
fitted_steps["svd"].explained_variance_ratio_.sum()  # a little low

In [None]:
# Persist each fitted step of the best TF-IDF pipeline to its own pickle file.
# The fitted pipeline lives in `best_estimator_` (with a trailing underscore).
fitted_steps = gs_tfidf.best_estimator_.named_steps
for step_name, file_name in (
    ("tfidf", "tfidf_vectorizer.pkl"),
    ("svd", "svd.pkl"),
    ("xgb", "xgb_with_tfidf.pkl"),
):
    # The context manager flushes and closes the file even if dump fails.
    with open(output_path + file_name, "wb") as fh:
        pickle.dump(fitted_steps[step_name], fh)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import FunctionTransformer

# Alternative model: SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Time the whole grid search over the embedding -> XGBoost pipeline.
start = timer()

# Wrap the sentence encoder as a transformer step: the incoming text
# collection is turned into a plain list and encoded by `model`.
sent_embedding_transformer = FunctionTransformer(
    lambda docs: model.encode(docs.tolist())
)

pipe_bert = Pipeline(
    steps=[
        ("embedder", sent_embedding_transformer),
        ("xgb", XGBClassifier(random_state=42)),
    ]
)

# Only the classifier is tuned; the embedding step has no parameters here.
params_bert = {
    "xgb__n_estimators": [10, 50],
    "xgb__max_depth": [6, 10],
}

gs_bert = GridSearchCV(pipe_bert, param_grid=params_bert, cv=5)
gs_bert.fit(X_train, y_train)

end = timer()
print(f"Time elapsed: {timedelta(seconds=end-start)}")

In [None]:
# Inspect the cross-validation results recorded by the sentence-embedding grid search.
gs_bert.cv_results_

In [None]:
# Encode the full cleaned training set first, so a failure during encoding
# cannot leave a half-written pickle file behind.
train_sent_embeddings = model.encode(train_clean.text.tolist())
with open(output_path + "train_sent_embeddings.pkl", "wb") as fh:
    pickle.dump(train_sent_embeddings, fh)

# Persist the XGBoost classifier from the best embedding pipeline.
with open(output_path + "xgb_with_bert.pkl", "wb") as fh:
    pickle.dump(gs_bert.best_estimator_.named_steps["xgb"], fh)

## Model Comparison and Selection

If the models above have already been trained, I can just load them as shown below (and **don't** fit, just transform):

In [None]:
# Reload the fitted TF-IDF pipeline steps saved by the training cells.
# NOTE: pickle executes arbitrary code on load; only load files produced by
# this notebook.
with open(output_path + "tfidf_vectorizer.pkl", "rb") as fh:
    tfidf = pickle.load(fh)
with open(output_path + "svd.pkl", "rb") as fh:
    svd = pickle.load(fh)
# The classifier lives in its own file; it must not be read from "svd.pkl".
with open(output_path + "xgb_with_tfidf.pkl", "rb") as fh:
    xgb_with_tfidf = pickle.load(fh)

# Re-assemble the already-fitted steps: use predict/transform, do not refit.
pipe_tfidf = Pipeline([("tfidf", tfidf), ("svd", svd), ("xgb", xgb_with_tfidf)])


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import FunctionTransformer

# Alternative model: SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Wrap the sentence encoder as a transformer step: the incoming text
# collection is turned into a plain list and encoded by `model`.
sent_embedding_transformer = FunctionTransformer(lambda text: model.encode(text.tolist()))

# Reload the fitted classifier; the context manager closes the file handle.
# NOTE: pickle executes arbitrary code on load; only load files produced by
# this notebook.
with open(output_path + "xgb_with_bert.pkl", "rb") as fh:
    xgb_with_bert = pickle.load(fh)

# Re-assemble the pipeline around the already-fitted classifier (do not refit).
pipe_bert = Pipeline([("embeddings", sent_embedding_transformer), ("xgb", xgb_with_bert)])

In [None]:
from xgboost import plot_tree

# Set up the plotting parameters: a very large canvas for the tree diagram.
rcParams["figure.figsize"] = (80, 50)

# First tree of the TF-IDF model. It is still a bit too small to read; note
# that the features here are the principal components from the SVD step.
plot_tree(xgb_with_tfidf)
plt.show()

In [None]:
# Same tree plot for the classifier used with the sentence embeddings.
plot_tree(xgb_with_bert)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report

def get_scores(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Convenience wrapper around sklearn.metrics.classification_report: predicts
    labels for `X_test` and reports accuracy plus per-class precision, recall,
    F-1 score, and support.

    Args:
        model (sklearn.base.ClassifierMixin) : Fitted classifier that implements scikit-learn API.
        X_test (array-like) : Data to produce predicted labels.
        y_test (1d array-like) : Correct target values to compare with predicted labels. Label
        values assumed to be 0 and 1, with 0 meaning "Real" and 1 meaning "Fake". Class 1 is the
        "positive" class.

    Returns:
        (dict) : Dictionary of score results.
    """
    # output_dict=True makes the report a nested dict instead of a string.
    return classification_report(
        y_test,
        model.predict(X_test),
        labels=[0, 1],
        target_names=["Real", "Fake"],
        digits=4,
        output_dict=True,
    )
    

In [None]:
# Score both pipelines on the same held-out split (TF-IDF first, then BERT).
results_tfidf, results_bert = (
    get_scores(pipeline, X_test, y_test) for pipeline in (pipe_tfidf, pipe_bert)
)
print(results_tfidf)
print(results_bert)

# For BERT

### This section is from when I was trying to use BERT for word embeddings, but that was too computationally expensive, and sentence embeddings might be better for this use case. Basically, this is scratch work that can be ignored.

Basically, a "sentence" is just a linguistic body of words, so I'm going to treat each article as a single sentence, because otherwise I'd have to draw a manual distinction between which half of the article is the first "sentence" and which is second.

The maximum number of tokens allowed is 512, so I'll have to truncate to the first few hundred words, leaving leeway for words that aren't recognized and are therefore broken down into multiple subwords. For instance, I first tried truncating to the first 500 words, but I ended up with 535 tokens.

Following along with this: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

**TODO:**
* want to play around with xgboost's xgb.train and DMatrix's because the XGBClassifier I'm using is just the sklearn wrapper
* Try to make submission csv's using both sets of models

* Taha has a few other suggestions
    * transformers, specifically huggingface -> just pass data through, and it'll produce embeddings
    * a bit about embeddings
        * formed with artificial neural networks
        * remove output layer, add two inner layers: an encoder and then decoder, they're both fully connected
        * usually the input layer isn't fully connected to the first encoder inner layer
        * output is n-dimensional where n is how many nodes in decoder layer
        * maybe truncate to first 500 words before doing this
    * can use pre-trained models like BERT or GPT-3
    * use examples here: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens, and https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
    * consider also using sentence-level instead of word-level
    * he's also mentioning topic modeling
        * bunch of text, want to extract groups of topics in corpus
            * use this: https://github.com/MaartenGr/BERTopic

* end product could be like a SaaS application where someone can input a news article and predict whether article is fake news
* streamlit or plotly or flask for frontend
    * in flask, create a file as a server
* for a freecodecamp flask tutorial: https://www.youtube.com/watch?v=Z1RJmh_OqeA
    * but only consider that stuff after I get basic model online