# ENSEMBLE MODEL


Make sure you add the models shared drive into MyDrive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [45]:
import statistics
import shutil
import torch
import re
import pickle
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os


class Ensemble_Model():
  def __init__(self, text):
    self.text = text
    self.model = None
    self.mlp_classifier = None
    self.svm_classifier = None
    self.nb_classifier = None
    self.tfidf_vec = None
    self.tfidf_text = None
    self.cv_text = None
    self.cv = None
    self.tfidf = None
    self.tokenizer = None
    self.tfm_pred_label = None
    self.tfm_pred_score = None
    self.mlp_pred_label = None
    self.mlp_pred_score = None
    self.svm_pred_label = None
    self.svm_pred_score = None
    self.nb_pred_label = None
    self.nb_pred_score = None
    self.model_list = None

  def import_models(self):
    if not os.path.exists("transformer"):
      shutil.copytree("/content/gdrive/MyDrive/425 models/transformer", "transformer")
    self.model = AutoModelForSequenceClassification.from_pretrained("/content/transformer")
    self.tokenizer = AutoTokenizer.from_pretrained("/content/transformer")

    with open('/content/gdrive/MyDrive/425 models/mlp.pkl', 'rb') as f:
      self.mlp_classifier = pickle.load(f)

    with open('/content/gdrive/MyDrive/425 models/tfidfMLP.pkl', 'rb') as f:
      self.tfidf_vec = pickle.load(f)

    with open('/content/gdrive/MyDrive/425 models/svm.pkl', 'rb') as f:
      self.svm_classifier = pickle.load(f)

    with open('/content/gdrive/MyDrive/425 models/vectorizer.pkl', 'rb') as f:
      self.cv = pickle.load(f)

    with open('/content/gdrive/MyDrive/425 models/tfidf.pkl', 'rb') as f:
      self.tfidf = pickle.load(f)
    with open('/content/gdrive/MyDrive/425 models/naivebayes.pkl', 'rb') as f:
      self.nb_classifier = pickle.load(f)

  def text_processing(self, text):
    input_text = re.sub(r'[\'"‘’“”]', '', text)
    input_text = re.sub('^.*\(Reuters\)\s*-\s*', '', input_text)
    input_text = [input_text]
    self.cv_text = self.cv.transform(input_text)
    self.tfidf_text = self.tfidf.transform(self.cv_text)
    return input_text

  def define_labels(prob):
    if prob >= 0.5:
      pred_class = "Real"  # Real news
    else:
      pred_class = "Fake"  # Fake news
    return pred_class

  def transformers(self,input_text):
    inputs = self.tokenizer(input_text, return_tensors="pt")

    # Forward pass through the model to obtain logits
    with torch.no_grad():
        outputs = self.model(**inputs)

    # Get the predicted class label
    tfm_pred = torch.sigmoid(outputs.logits).squeeze().tolist()  # Assuming binary classification
    if tfm_pred >= 0.5:
        self.tfm_pred_label = "Real"
    else:
        self.tfm_pred_label = "Fake"
    self.tfm_pred_score = torch.sigmoid(outputs.logits).item()
    return


  def mlp(self,input_text):
    input_tfidf = self.tfidf_vec.transform(input_text)
    mlp_prob = self.mlp_classifier.predict_proba(input_tfidf)
    self.mlp_pred_label = Ensemble_Model.define_labels(mlp_prob[0][1])
    self.mlp_pred_score = mlp_prob[0][1]
    return

  def svm(self,input_text):
    svm_prob = self.svm_classifier.decision_function(self.tfidf_text)
    self.svm_pred_label = Ensemble_Model.define_labels(svm_prob)
    self.svm_pred_score = svm_prob[0] if svm_prob[0]>0 else 1+svm_prob[0]
    return

  def naivebayes(self,input_text):
    nb_prob = self.nb_classifier.predict_proba(self.tfidf_text)
    self.nb_pred_label = Ensemble_Model.define_labels(nb_prob[0][1])
    self.nb_pred_score = nb_prob[0][1]
    return

  def predict_list(self):
    if self.nb_classifier == None:
      Ensemble_Model.import_models(self)
    text = Ensemble_Model.text_processing(self,self.text)
    Ensemble_Model.transformers(self,text)
    Ensemble_Model.mlp(self,text)
    Ensemble_Model.svm(self,text)
    Ensemble_Model.naivebayes(self,text)

    self.model_list = ["Transformers", "MLP", "SVM", "Naive Bayes"]
    self.label_list = [self.tfm_pred_label, self.mlp_pred_label, self.svm_pred_label, self.nb_pred_label]
    self.score_list = [self.tfm_pred_score, self.mlp_pred_score, self.svm_pred_score, self.nb_pred_score]
    return

  def predict_df(self):
    if self.model_list == None:
      Ensemble_Model.predict_list(self)

    result_df = pd.DataFrame({"Model": self.model_list,
                            "Label": self.label_list,
                            "Score": self.score_list})
    print(result_df)
    return

  def predict(self):
    if self.model_list == None:
      Ensemble_Model.predict_list(self)

    fake_count = self.label_list.count("Fake")
    real_count = self.label_list.count("Real")

    if fake_count > real_count:
      result_label = "Fake"
      result_score = statistics.mean(self.score_list[i] for i in range(4) if self.score_list[i] < 0.5)
    elif fake_count < real_count:
      result_label = "Real"
      result_score = statistics.mean(self.score_list[i] for i in range(4) if self.score_list[i] > 0.5)
    else:
      result_score = statistics.mean(self.score_list[i] for i in range(4))
      if result_score >= 0.5:
        result_label = "Real"
      else:
        result_label = "Fake"

    print("Predicted label: ", result_label)
    print("Predicted score: ", result_score)
    return

In [46]:
text = input("Input text \n" )
Ensemble_Model(f"""{text}""").predict()
Ensemble_Model(f"""{text}""").predict_df()

Input text 
In a stunning turn of events 11 days before the 2016 presidential election, the FBI announced it is reopenning its investigation into Hillary Clinton’s email server case, by probing newly emerging emails linked to Hillary Clinton.
Predicted label:  Fake
Predicted score:  0.2776253280538472
          Model Label     Score
0  Transformers  Fake  0.499073
1           MLP  Fake  0.142678
2           SVM  Fake  0.225584
3   Naive Bayes  Fake  0.243165
