<a href="https://colab.research.google.com/github/c0pper/stylometry/blob/main/expertiment_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.9 MB 7.6 MB/s 
[K     |████████████████████████████████| 120 kB 53.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 37.9 MB/s 
[?25h

In [None]:
from typing import Union
import math 
from sklearn import preprocessing, metrics
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from datetime import datetime
import joblib
from tqdm import tqdm
from transformers import TFBertModel, BertTokenizer
import numpy as np
from tensorflow import keras
from requests import get
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
from tensorflow.keras.models import load_model
import time
import json
from tensorflow.keras.layers import Dense, Dropout
import os
from tqdm import tqdm
tqdm.pandas()

# Root folder under which every experiment writes its models and logs.
MODEL_SAVEPATH = "/content/drive/MyDrive/simo/"
# Timestamp (day-month-year-hour-minute) captured once, when this cell is executed.
TIMENOW = datetime.now().strftime('%d-%m-%y-%H-%M')

# Create the output tree <MODEL_SAVEPATH>/{logs,models}/{sklearn,bert,stylo}.
# The folders are derived from MODEL_SAVEPATH so they always match the paths
# built by the save_model()/log() methods; os.makedirs also creates the
# intermediate "logs" and "models" folders.
for _output_kind in ("logs", "models"):
  for _library in ("sklearn", "bert", "stylo"):
    os.makedirs(os.path.join(MODEL_SAVEPATH, _output_kind, _library), exist_ok=True)

def merge_datasets(dataset_list: list, target_col: str):
  """Split every dataset 90/10 per class and merge the parts across datasets.

  For each dataset, the rows of every distinct value of ``target_col`` are cut
  positionally: the first ``int(0.9 * n)`` rows go to the "90%" part and the
  remaining rows to the "10%" part. The per-class parts of a dataset are
  concatenated and shuffled (``random_state=42``); finally all the 90% parts
  and all the 10% parts are concatenated across datasets.

  Args:
    dataset_list: list of DataFrames that all contain ``target_col``.
    target_col: name of the label column used to split per class.

  Returns:
    Tuple ``(merged90, merged10)`` of DataFrames. Both are empty DataFrames
    when ``dataset_list`` is empty.

  Raises:
    AssertionError: if the per-class rows do not add up to the dataset size
      (this happens when ``target_col`` contains NaN, since NaN never compares
      equal to itself) or if the 90%/10% parts do not cover the dataset.
  """
  list90 = []
  list10 = []

  for idx, d in enumerate(dataset_list):
    print(f"original dataset {idx} shape", d.shape)
    original_shape = d.shape[0]

    parts90 = []
    parts10 = []
    values_shapes = []
    for v in d[target_col].unique():
      d_label = d[d[target_col] == v]
      value_shape = d_label.shape[0]
      values_shapes.append(value_shape)

      # Positional cut: first 90% of the rows of this class, then the rest.
      cut = int(.9 * value_shape)
      d_label90perc = d_label.iloc[:cut]
      d_label10perc = d_label.iloc[cut:]
      print(f"shape 90% for {v}: {d_label90perc.shape[0]} == shape*0.9: {value_shape*0.9}")
      print(f"shape 10% for {v}: {d_label10perc.shape[0]} == shape*0.1: {value_shape*0.1}")

      parts90.append(d_label90perc)
      parts10.append(d_label10perc)
    assert sum(values_shapes) == original_shape, \
        f"rows per class do not add up to the dataset size (NaN in '{target_col}'?)"

    # Join the per-class sub-datasets; pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.0. An empty dataset yields empty parts
    # that keep the original columns.
    d90 = pd.concat(parts90, ignore_index=True) if parts90 else d.iloc[:0]
    d10 = pd.concat(parts10, ignore_index=True) if parts10 else d.iloc[:0]

    # Shuffle the rows so the classes are not grouped one after the other.
    d90 = d90.sample(frac=1, random_state=42)
    d10 = d10.sample(frac=1, random_state=42)
    assert (d90.shape[0] + d10.shape[0]) == original_shape, \
        "90% and 10% parts do not cover the original dataset"

    # Print the class counts of the new pair of datasets derived from the original.
    print("\n90% of dataset\n", d90.groupby(target_col)[target_col].count())
    print("\n10% of dataset\n", d10.groupby(target_col)[target_col].count(), "\n\n\n")
    list90.append(d90)
    list10.append(d10)

  # pd.concat raises on an empty list, so fall back to empty frames.
  merged90 = pd.concat(list90) if list90 else pd.DataFrame()
  merged10 = pd.concat(list10) if list10 else pd.DataFrame()

  return (merged90, merged10)

# Emoji / pictograph matcher, compiled once at import time instead of on every call.
# NOTE(review): the last range (U+24C2-U+1F251) is very wide and numerically also
# covers CJK and other non-emoji code points, which are replaced with 'EMOJI' too.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)

# Filled on the first preprocess() call so the NLTK objects are built only once.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the cached stop-word set, stemmer and lemmatizer (built on first use)."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives O(1) membership tests instead of scanning a list per token.
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["stemmer"] = SnowballStemmer('english')
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess(text, stem=False):
    """Normalize one text: lowercase, clean punctuation, drop stop words, lemmatize.

    Steps, in order: lowercase; collapse repeated '!', '?' and '.'; repair some
    mis-encoded quote/euro sequences; remove apostrophes; collapse whitespace;
    replace '&amp'/'&amp;' with 'and'; remove t.co links; remove the listed
    punctuation (keeping . ! # ?); replace emoji runs with 'EMOJI'; drop
    English stop words and lemmatize (or stem) the remaining tokens.

    NOTE: periods are now really preserved. The original character class
    contained ',-/' which is a *range* (',' to '/') and therefore also
    removed '.', contradicting the intent stated in the comment. Models
    fitted on texts cleaned by the old version saw texts without periods.

    Args:
        text: the string to clean.
        stem: when True, tokens are stemmed with the Snowball stemmer instead
            of being lemmatized with WordNet (default: lemmatize).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    resources = _preprocess_resources()
    stop_words = resources["stop_words"]
    normalize = resources["stemmer"].stem if stem else resources["lemmatizer"].lemmatize

    text = text.lower()  # lowercase

    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)
    # Mis-encoded quotes: the longer sequences are replaced before the bare
    # 'â€' prefix, otherwise the prefix would consume their first characters.
    text = re.sub(r'â€™', "'", text)
    text = re.sub(r'â€œ', "'", text)
    text = re.sub(r'â€˜', "'", text)
    text = re.sub(r'â€', "'", text)
    text = re.sub(r'â‚¬', "€", text)
    text = re.sub(r"'", "", text)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove and double spaces
    text = re.sub(r'&amp;?', r'and', text)  # replace & -> and
    text = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", text)  # Remove URLs
    # remove some puncts (except . ! # ?); the hyphen is escaped so that it is
    # a literal '-' and not a ',' .. '/' range.
    text = re.sub(r'[:"$%&\*+,\-/:;<=>@\\^_`{|}~]+', '', text)
    text = _EMOJI_PATTERN.sub(r'EMOJI', text)

    return " ".join(normalize(token) for token in text.split() if token not in stop_words)



def apply_preprocess(xtrain, xvalid):
  """Clean the training and validation texts with ``preprocess``.

  Prints the first training text (cut to 50 characters followed by '..' when
  longer) before and after cleaning, and shows a progress bar for each series.

  Returns:
    The cleaned ``(xtrain, xvalid)`` pair.
  """

  def first_text_preview(series):
    # Short preview of the first element, truncated to 50 characters.
    first = series.iloc[0]
    if len(first) > 50:
      return first[:50] + '..'
    return first

  print("\nPreprocessing texts...")
  print(f"\nBefore: {first_text_preview(xtrain)}")
  cleaned_train = xtrain.progress_apply(preprocess)
  cleaned_valid = xvalid.progress_apply(preprocess)
  print(f"\nAfter: {first_text_preview(cleaned_train)}")

  return cleaned_train, cleaned_valid


class Experiment:
  """Base class for the experiments: dataset loading/splitting and metric reports.

  Attributes set by ``__init__``: ``dataset_path``, ``split_size``,
  ``target_col``, ``dataset_name``, ``model_savepath``, ``nb_name``,
  ``scaler`` and ``lbl_enc``.
  """
  # Class-level defaults kept only for backward compatibility with code that
  # reads Experiment.scaler / Experiment.lbl_enc. Every instance gets its own
  # objects in __init__, so fitting one experiment no longer overwrites the
  # encoder/scaler of the others.
  scaler = StandardScaler()
  lbl_enc = preprocessing.LabelEncoder()

  def __init__(self, dataset_path: Union[str, pd.DataFrame], split_size: float, target_col: Union[str, int], model_savepath=MODEL_SAVEPATH):
    """Store the experiment settings and work out dataset and notebook names.

    Args:
      dataset_path: path of a csv/xlsx/xls file, or an already loaded DataFrame.
      split_size: fraction passed as ``test_size`` to ``train_test_split``.
      target_col: label of the target column.
      model_savepath: root folder used for saved models and logs.

    Raises:
      TypeError: if ``dataset_path`` is neither a str nor a DataFrame.
    """
    self.dataset_path = dataset_path
    self.split_size = split_size
    self.target_col = target_col
    self.scaler = StandardScaler()
    self.lbl_enc = preprocessing.LabelEncoder()
    if isinstance(dataset_path, str):
      # File name without folder and without the last extension.
      self.dataset_name = os.path.splitext(os.path.basename(dataset_path))[0]
    elif isinstance(dataset_path, pd.DataFrame):
      self.dataset_name = input("Dataset name not found. Please enter dataset name: ")
    else:
      raise TypeError(f"dataset_path must be a str or a pandas DataFrame, got {type(dataset_path).__name__}.")
    self.model_savepath = model_savepath
    self.nb_name = self._detect_notebook_name()


  @staticmethod
  def _detect_notebook_name(default="notebook"):
    """Return the running notebook's name (without extension), or ``default``.

    The name is read from the first entry returned by the sessions endpoint
    (presumably the Colab runtime's Jupyter API — confirm). When the endpoint
    cannot be reached or answers unexpectedly, ``default`` is used so the
    experiment can still be created.
    """
    from requests.exceptions import RequestException
    try:
      # The timeout keeps the constructor from hanging when the endpoint is unreachable.
      sessions = get('http://172.28.0.2:9000/api/sessions', timeout=5).json()
      return sessions[0]['name'].split(".")[0]
    except (RequestException, ValueError, LookupError, TypeError) as err:
      print(f"Could not detect the notebook name ({err}); using '{default}'.")
      return default


  def load_split_dataset(self, dataset_path, dropna=False, do_split=True, use_scaler=False):
    """Load a dataset, encode its labels and optionally scale and split it.

    Args:
      dataset_path: path of a csv/xlsx/xls file, or a DataFrame.
      dropna: drop every row containing a NaN before extracting X and y.
      do_split: when True return a train/validation split, else ``X, y``.
      use_scaler: fit ``self.scaler`` on X and return the scaled array.

    Returns:
      ``xtrain, xvalid, ytrain, yvalid`` when ``do_split`` is True, else ``X, y``.
      ``self.lbl_enc`` is (re)fitted on the target column on every call.

    Raises:
      ValueError: if a file path does not end in csv, xlsx or xls.
    """
    if isinstance(dataset_path, str):
      extension = dataset_path.split(".")[-1]
      valid = {"csv", "xlsx", "xls"}
      if extension not in valid:
        raise ValueError(f"Unsupported dataset format '{extension}': must be one of {valid}.")
      if extension == "csv":
        dataset = pd.read_csv(dataset_path)
      else:
        dataset = pd.read_excel(dataset_path)
    else:
      dataset = dataset_path

    print("Dataset head\n")
    print(dataset.head())

    # Dropping NaNs must happen before X and y are extracted, otherwise it has
    # no effect on the returned data.
    if dropna:
      print("DROPPING NAN")
      dataset = dataset.dropna(axis=0, how='any')

    X = dataset.drop(self.target_col, axis=1)
    y = self.lbl_enc.fit_transform(dataset[self.target_col].values)

    # NOTE(review): the scaler is fitted on the full feature matrix, before the
    # train/validation split below, so validation rows influence its statistics.
    if use_scaler:
      X = self.scaler.fit_transform(X)

    print("Dataset class distribution:")
    encoded_classes, class_counts = np.unique(y, return_counts=True)
    for label, count in zip(self.lbl_enc.inverse_transform(encoded_classes), class_counts):
      print(label, count)

    if do_split:
      xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, random_state=42, test_size=self.split_size, shuffle=True)
      return xtrain, xvalid, ytrain, yvalid
    else:
      return X, y


  def print_cm(self, yvalid, predicted, target_names=None):
    """Plot the confusion matrix of ``predicted`` against ``yvalid``.

    ``target_names`` are used as display labels; when None or empty the
    default labels of ``ConfusionMatrixDisplay`` are used.
    """
    cm = metrics.confusion_matrix(yvalid, predicted)
    if target_names is not None and len(target_names) == 0:
      target_names = None
    disp = metrics.ConfusionMatrixDisplay(cm, display_labels=target_names)
    disp.plot(xticks_rotation="vertical")

  def print_report(self, predicted, yvalid, target_names=None):
    """Print the classification report and confusion matrix; return the report dict.

    ``target_names`` may be None, in which case the report uses the label
    values themselves as row names.
    """
    names = None if target_names is None else [str(x) for x in target_names]
    report_dict = metrics.classification_report(yvalid, predicted, target_names=names, output_dict=True)
    report_text = metrics.classification_report(yvalid, predicted, target_names=names)
    print(report_text)
    self.print_cm(yvalid, predicted, target_names=target_names)
    return report_dict


  def load_test_dataset(self, testdataset_path, dropna):
    """Load a dataset without splitting it; return ``X, y, target_names``.

    NOTE(review): this goes through ``load_split_dataset``, which re-fits
    ``self.lbl_enc`` on the test labels.
    """
    X, y = self.load_split_dataset(testdataset_path, dropna=dropna, do_split=False)
    target_names = self.lbl_enc.inverse_transform(np.unique(y))
    return X, y, target_names


class PublicExpertiment(Experiment):
  """Experiment over a dataset with a text column.

  Adds ``text_col`` (label of the text column) and ``preprocess_dataset``
  (flag the subclasses check before cleaning the texts) to the base settings.
  """

  def __init__(self, dataset_path, split_size, target_col, text_col, model_savepath=MODEL_SAVEPATH, preprocess_dataset=True):
    # Shared setup first, then the text-specific options.
    super().__init__(
        dataset_path=dataset_path,
        split_size=split_size,
        target_col=target_col,
        model_savepath=model_savepath,
    )
    self.text_col, self.preprocess_dataset = text_col, preprocess_dataset


class ScikitExperiment(PublicExpertiment):
  """Text classification with a scikit-learn estimator on TF-IDF features.

  The estimator passed as ``algo`` is used as the last step of a
  CountVectorizer -> TfidfTransformer -> estimator pipeline.
  """

  def __init__(self, dataset_path, split_size, target_col, text_col, algo, model_savepath=MODEL_SAVEPATH, preprocess_dataset=True):
    super().__init__(dataset_path=dataset_path, 
                     split_size=split_size, 
                     target_col=target_col, 
                     text_col=text_col, 
                     model_savepath=model_savepath, 
                     preprocess_dataset=preprocess_dataset
                     )
    self.algo = algo


  def _experiment_name(self):
    """Return '<notebook>__<dataset>__<estimator class>', used in file names."""
    return self.nb_name+"__"+self.dataset_name+"__"+type(self.algo).__name__


  def _read_text_and_labels(self, testdataset_path, dropna=False):
    """Return the text and target columns of a file path or DataFrame.

    Raises:
      ValueError: if a file path does not end in csv, xlsx or xls.
      TypeError: if the input is neither a str nor a DataFrame.
    """
    if isinstance(testdataset_path, str):
      extension = testdataset_path.split(".")[-1]
      valid = {"csv", "xlsx", "xls"}
      if extension not in valid:
        raise ValueError(f"Unsupported dataset format '{extension}': must be one of {valid}.")
      if extension == "csv":
        dataset = pd.read_csv(testdataset_path)
      else:
        dataset = pd.read_excel(testdataset_path)
    elif isinstance(testdataset_path, pd.DataFrame):
      dataset = testdataset_path
    else:
      raise TypeError(f"testdataset_path must be a str or a pandas DataFrame, got {type(testdataset_path).__name__}.")
    if dropna:
      print("DROPPING NAN")
      dataset = dataset.dropna(axis=0, how='any')
    return dataset[self.text_col], dataset[self.target_col]


  def train(self, dropna=False):
    """Fit the pipeline on the training split, report on the validation split.

    After the report the user is asked whether to save the model, and a JSON
    log is written.

    Returns:
      The dictionary that was written to the log file.
    """
    start = time.time()
    xtrain, xvalid, ytrain, yvalid = super().load_split_dataset(self.dataset_path, dropna=dropna)
    xtrain = xtrain[self.text_col]
    xvalid = xvalid[self.text_col]

    if self.preprocess_dataset:
      xtrain, xvalid = apply_preprocess(xtrain, xvalid)
      
    clf_pipeline = Pipeline([
     ('ctv', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', self.algo),
    ])
    print(f"Fitting pipeline: {clf_pipeline}")
    clf_pipeline.fit(xtrain, ytrain)
    # The elapsed time covers loading, preprocessing and fitting, not prediction.
    elapsed = round(time.time() - start, 2)
    predicted = clf_pipeline.predict(xvalid)
    # Names must follow the sorted encoded labels found in y_true or y_pred,
    # which is the row order used by classification_report.
    target_names = self.lbl_enc.inverse_transform(np.union1d(yvalid, predicted))
    report = super().print_report(predicted, yvalid, target_names=target_names)

    self.save_model(clf_pipeline, self.lbl_enc)
    print("Time elapsed in seconds: ", elapsed)
    
    log_dict = self.log(self.model_savepath, self.dataset_name, len(xtrain)+len(xvalid), type(self.algo).__name__, elapsed, report)
    return log_dict

  
  def save_model(self, model, lbl_enc):
    """Ask for confirmation, then dump the model and its label encoder with joblib."""
    experiment_name = self._experiment_name()
    save_confirmation = input(f"Save model {experiment_name}? (y/n)")
    if save_confirmation == "y":
      filepath = f'{self.model_savepath}models/sklearn/{experiment_name}.pkl'
      print(f"Saving model to {filepath}")
      data = {
          "model": model,
          "lbl_enc": lbl_enc
      }
      joblib.dump(data, filepath)
    else:
      print("Model wasn't saved")


  def load_model_and_predict(self, modelpath, X):
    """Load the saved pipeline from ``modelpath`` and return its predictions for X."""
    # joblib.load unpickles the file: only load model files you trust.
    data = joblib.load(modelpath, mmap_mode=None)
    model = data["model"] #sklearn  
    predicted = model.predict(X) 
    return predicted


  def evaluate_on_other_dataset(self, testdataset_path: Union[str, pd.DataFrame], modelpath: str, dropna=False):
    """Evaluate a saved model on another dataset and print the report.

    Args:
      testdataset_path: csv/xlsx/xls path or DataFrame holding the text and
        target columns configured on this experiment.
      modelpath: file written by ``save_model``.
      dropna: drop every row containing a NaN before evaluating.
    """
    start = time.time()
    X, y = self._read_text_and_labels(testdataset_path, dropna=dropna)
    print(X.shape)

    if self.preprocess_dataset:
      print("Preprocessing text...")
      X = X.progress_apply(lambda x: preprocess(x))
    # joblib.load unpickles the file: only load model files you trust.
    lbl_enc = joblib.load(modelpath, mmap_mode=None)["lbl_enc"]
    encoded_y = lbl_enc.transform(y)

    predicted = self.load_model_and_predict(modelpath, X)
    # A set of raw labels has no defined order; derive the names from the
    # sorted encoded labels so each report row gets the right name.
    target_names = lbl_enc.inverse_transform(np.union1d(encoded_y, predicted))
    super().print_report(predicted, encoded_y, target_names)
    elapsed = round(time.time() - start, 2)
    print("Time elapsed in seconds: ", elapsed)


  def log(self, savepath, datasetname, dataset_len, algo, elapsed, report):
    """Write the run summary as JSON under ``<savepath>logs/sklearn/`` and return it."""
    log_dict = {
        "library_used": type(self).__name__,
        "dataset_name": datasetname,
        "dataset_lenght": dataset_len,
        "algo": algo,
        "elapsed": elapsed,
        "metrics_report": report
    }
    filepath = f'{savepath}logs/sklearn/{self._experiment_name()}_log.json'
    with open(filepath, 'w') as fp:
      json.dump(log_dict, fp)
      print("Log saved to ", filepath)
    return log_dict


class TFExperiment(PublicExpertiment):
  """Binary text classification with a BERT encoder and a small dense head."""

  def __init__(self, dataset_path, split_size, text_col, target_col, preprocess_dataset=True, model_savepath=MODEL_SAVEPATH, bert_pretrained_model='bert-large-uncased', bert_encode_maxlen=60):
    super().__init__(dataset_path=dataset_path, 
                     split_size=split_size, 
                     text_col=text_col, 
                     target_col=target_col, 
                     preprocess_dataset=preprocess_dataset, 
                     model_savepath=model_savepath)
    self.bert_pretrained_model = bert_pretrained_model
    self.bert_encode_maxlen = bert_encode_maxlen
  

  @staticmethod
  def _to_class_labels(probabilities):
    """Round the first output column to 0/1 labels (half rounds to even, like ``round``)."""
    return np.rint(np.asarray(probabilities)[:, 0]).astype(int)


  def bert_encode(self, data, max_len, pretrained_model=None):
    """Tokenize every text in ``data`` to fixed-length ids and attention masks.

    Args:
      data: iterable of strings (e.g. a pandas Series).
      max_len: every sequence is padded or truncated to this length.
      pretrained_model: tokenizer checkpoint; defaults to
        ``self.bert_pretrained_model`` so tokenizer and model always match.

    Returns:
      Tuple ``(input_ids, attention_masks)`` of numpy arrays.
    """
    if pretrained_model is None:
      pretrained_model = self.bert_pretrained_model
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model)
    input_ids = [] 
    attention_masks = []
    
    for text in tqdm(data):
        # padding/truncation are the explicit replacements of the deprecated
        # pad_to_max_length flag; every row ends up exactly max_len long.
        encoded = bert_tokenizer.encode_plus(text,
                                        add_special_tokens=True,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True)
        
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        
    return np.array(input_ids),np.array(attention_masks)


  def create_model(self, bert_encode_maxlen, bert_pretrained_model, optimizer, loss, metrics):
    """Build and compile BERT -> Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    ``metrics`` may be a single metric or a list/tuple of metrics.
    """
    bert_layers = TFBertModel.from_pretrained(bert_pretrained_model)

    input_ids = keras.Input(shape=(bert_encode_maxlen,),dtype='int32',name='input_ids')
    attention_masks = keras.Input(shape=(bert_encode_maxlen,),dtype='int32',name='attention_masks')

    output = bert_layers([input_ids,attention_masks])
    # Second element of the BERT outputs (presumably the pooled output — confirm).
    output = output[1]
    net = keras.layers.Dense(32,activation='relu')(output)
    net = keras.layers.Dropout(0.2)(net)
    net = keras.layers.Dense(1,activation='sigmoid')(net)
    outputs = net
    model = keras.models.Model(inputs = [input_ids,attention_masks],outputs = outputs)

    # Avoid nesting when the caller already passes a list of metrics.
    metric_list = list(metrics) if isinstance(metrics, (list, tuple)) else [metrics]
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metric_list)
    
    model.summary()
    return model


  def train(self, bert_encode_maxlen=None, bert_pretrained_model=None, dropna=False, epochs=10, optimizer=None, loss='binary_crossentropy', metrics='accuracy', callbacks=None):
    """Fine-tune the model on the training split and report on the validation split.

    Args:
      bert_encode_maxlen: sequence length; defaults to the constructor value.
      bert_pretrained_model: checkpoint name; defaults to the constructor value.
      dropna: forwarded to ``load_split_dataset``.
      epochs: number of training epochs.
      optimizer: Keras optimizer; when None a fresh ``Adam(learning_rate=1e-5)``
        is created for this call (an instance created once in the signature
        would be shared by every call).
      loss: loss passed to ``compile``.
      metrics: metric or list of metrics passed to ``compile``.
      callbacks: accepted for interface compatibility; currently not
        forwarded to ``fit``.

    Returns:
      The dictionary that was written to the log file.
    """
    start = time.time()
    if bert_encode_maxlen is None:
      bert_encode_maxlen = self.bert_encode_maxlen
    if bert_pretrained_model is None:
      bert_pretrained_model = self.bert_pretrained_model
    if optimizer is None:
      optimizer = keras.optimizers.Adam(learning_rate=1e-5)

    xtrain, xvalid, ytrain, yvalid = super().load_split_dataset(self.dataset_path, dropna=dropna)
    xtrain = xtrain[self.text_col]
    xvalid = xvalid[self.text_col]

    if self.preprocess_dataset:
      xtrain, xvalid = apply_preprocess(xtrain, xvalid)

    train_input_ids, train_attention_masks = self.bert_encode(xtrain, bert_encode_maxlen, bert_pretrained_model)
    val_input_ids, val_attention_masks = self.bert_encode(xvalid, bert_encode_maxlen, bert_pretrained_model)

    model = self.create_model(bert_encode_maxlen, bert_pretrained_model, optimizer=optimizer, loss=loss, metrics=metrics)

    model.fit(
    [train_input_ids, train_attention_masks],
    ytrain,
    epochs=epochs,
    batch_size=32, 
    )
    # The elapsed time covers loading, encoding and fitting, not prediction.
    elapsed = round(time.time() - start, 2)

    predicted = self._to_class_labels(model.predict([val_input_ids, val_attention_masks]))
    # Names must follow the sorted encoded labels found in y_true or y_pred.
    target_names = self.lbl_enc.inverse_transform(np.union1d(yvalid, predicted))
    report = super().print_report(predicted, yvalid, target_names=target_names)

    self.save_model(model, self.lbl_enc)
    print("Time elapsed in seconds: ", elapsed)
    
    log_dict = self.log(self.model_savepath, self.dataset_name, model, len(xtrain)+len(xvalid), elapsed, bert_encode_maxlen, epochs, bert_pretrained_model, optimizer, report)
    return log_dict

  
  def save_model(self, model, lbl_enc):
    """Ask for confirmation, then save the model as .h5 and the encoder as a .pkl next to it."""
    algo_name = type(model).__name__
    experiment_name = self.nb_name+"__"+self.dataset_name+"__"+algo_name
    save_confirmation = input(f"Save model {experiment_name}? (y/n)")
    if save_confirmation == "y":
      filepath = f'{self.model_savepath}models/bert/{experiment_name}.h5'
      lbl_enc_path = f'{self.model_savepath}models/bert/{experiment_name}__lbl_enc.pkl'
      print(f"Saving model to {filepath}")
      joblib.dump(lbl_enc, lbl_enc_path)
      model.save(filepath)
    else:
      print("Model wasn't saved")


  def load_model_and_predict(self, modelpath, X):
    """Load the saved Keras model from ``modelpath`` and return 0/1 predictions for X."""
    # NOTE(review): custom_objects receives a loaded TFBertModel instance, which
    # also loads the pretrained weights; passing the TFBertModel class may be
    # what Keras expects here — confirm before changing.
    model = load_model(modelpath, custom_objects={'TFBertModel':TFBertModel.from_pretrained(self.bert_pretrained_model)}) 
    return self._to_class_labels(model.predict(X))


  def evaluate_on_other_dataset(self, testdataset_path, modelpath, text_col, fitted_lbl_enc, dropna=False):
    """Evaluate a saved model on another dataset and print the report.

    Args:
      testdataset_path: csv/xlsx/xls path or DataFrame.
      modelpath: .h5 file written by ``save_model``.
      text_col: label of the text column; None falls back to ``self.text_col``.
      fitted_lbl_enc: the fitted label encoder, or the path of the
        ``__lbl_enc.pkl`` file written by ``save_model``. (The encoder is not
        stored inside the .h5 model file, so it cannot be read from ``modelpath``.)
      dropna: drop every row containing a NaN before evaluating.

    Raises:
      ValueError: if a file path does not end in csv, xlsx or xls.
      TypeError: if ``testdataset_path`` is neither a str nor a DataFrame.
    """
    start = time.time()
    if text_col is None:
      text_col = self.text_col
    if isinstance(testdataset_path, str):
      extension = testdataset_path.split(".")[-1]
      valid = {"csv", "xlsx", "xls"}
      if extension not in valid:
        raise ValueError(f"Unsupported dataset format '{extension}': must be one of {valid}.")
      if extension == "csv":
        dataset = pd.read_csv(testdataset_path)
      else:
        dataset = pd.read_excel(testdataset_path)
    elif isinstance(testdataset_path, pd.DataFrame):
      dataset = testdataset_path
    else:
      raise TypeError(f"testdataset_path must be a str or a pandas DataFrame, got {type(testdataset_path).__name__}.")
    if dropna:
      print("DROPPING NAN")
      dataset = dataset.dropna(axis=0, how='any')
    X = dataset[text_col]
    y = dataset[self.target_col]

    if isinstance(fitted_lbl_enc, str):
      # joblib.load unpickles the file: only load encoder files you trust.
      lbl_enc = joblib.load(fitted_lbl_enc, mmap_mode=None)
    else:
      lbl_enc = fitted_lbl_enc
    encoded_y = lbl_enc.transform(y)

    if self.preprocess_dataset:
      print("Preprocessing text...")
      X = X.progress_apply(lambda x: preprocess(x))
    input_ids, attention_masks = self.bert_encode(X, self.bert_encode_maxlen)

    predicted = self.load_model_and_predict(modelpath, [input_ids, attention_masks])
    # A set of raw labels has no defined order; derive the names from the
    # sorted encoded labels so each report row gets the right name.
    target_names = lbl_enc.inverse_transform(np.union1d(encoded_y, predicted))
    super().print_report(predicted, encoded_y, target_names)
    elapsed = round(time.time() - start, 2)
    print("Time elapsed in seconds: ", elapsed)


  def log(self, savepath, datasetname, model, dataset_len, elapsed, bert_encode_maxlen, epochs, bert_pretrained_model, optimizer, report):
    """Write the run summary as JSON under ``<savepath>logs/bert/`` and return it."""
    log_dict = {
        "library_used": type(self).__name__,
        "dataset_name": datasetname,
        "dataset_lenght": dataset_len,
        "elapsed": elapsed,
        "bert_encode_maxlen": bert_encode_maxlen,
        "epochs": epochs,
        "bert_pretrained_model": bert_pretrained_model,
        "optimizer": str(optimizer),
        "metrics_report": report
    }
    algo_name = type(model).__name__
    experiment_name = self.nb_name+"__"+self.dataset_name+"__"+algo_name
    filepath = f'{savepath}logs/bert/{experiment_name}_log.json'
    with open(filepath, 'w') as fp:
      json.dump(log_dict, fp)
      print("Log saved to ", filepath)
    return log_dict


class StyloExperiment(Experiment):
  """Binary classification with a dense Keras network on the dataset's feature columns.

  Every column except ``target_col`` is used as an input feature.
  """

  def __init__(self, dataset_path, split_size, target_col, model_savepath=MODEL_SAVEPATH):
    super().__init__(dataset_path=dataset_path, 
                     split_size=split_size, 
                     target_col=target_col, 
                     model_savepath=model_savepath)


  @staticmethod
  def _to_class_labels(probabilities):
    """Round the first output column to 0/1 labels (half rounds to even, like ``round``)."""
    return np.rint(np.asarray(probabilities)[:, 0]).astype(int)


  def train(self, 
            epochs=10, 
            use_scaler=True, 
            n_layers=1, 
            n_units_input=51,
            n_units_per_layer=None,
            dropout_per_layer=None,
            activation="relu",
            learning_rate=0.0014392587661767942,
            optimizer="RMSprop"
            ):
    """Build, train and evaluate the network; optionally save it and write a log.

    Args:
      epochs: number of training epochs.
      use_scaler: forwarded to ``load_split_dataset`` to scale the features.
      n_layers: number of hidden Dense+Dropout pairs after the input layer.
      n_units_input: units of the first Dense layer.
      n_units_per_layer: units of each hidden layer; defaults to ``[80]``.
      dropout_per_layer: dropout rate of each hidden layer; defaults to
        ``[0.3203504513234906]``.
      activation: activation of the input and hidden layers.
      learning_rate: learning rate given to the optimizer.
      optimizer: name of a class in ``keras.optimizers``.

    Returns:
      The dictionary that was written to the log file.

    Raises:
      ValueError: if ``n_units_per_layer`` or ``dropout_per_layer`` has fewer
        than ``n_layers`` entries.
    """
    start = time.time()
    xtrain, xvalid, ytrain, yvalid = super().load_split_dataset(self.dataset_path, use_scaler=use_scaler)

    if not n_units_per_layer:
      n_units_per_layer = [80]
    if not dropout_per_layer:
      dropout_per_layer = [0.3203504513234906]
    # Fail early with a clear message instead of an IndexError while stacking layers.
    if len(n_units_per_layer) < n_layers or len(dropout_per_layer) < n_layers:
      raise ValueError(
          f"n_layers={n_layers} requires at least {n_layers} entries in "
          f"n_units_per_layer ({len(n_units_per_layer)} given) and "
          f"dropout_per_layer ({len(dropout_per_layer)} given)."
      )

    nn_parameters = {
      "n_layers": n_layers,
      "n_units_input": n_units_input,
      "activation": activation,
      "n_units_per_layer": n_units_per_layer,
      "dropout_per_layer": dropout_per_layer,
      "learning_rate": learning_rate,
      "optimizer": optimizer
    }

    model = keras.models.Sequential()
    model.add(
        Dense(
            n_units_input, 
            input_dim=xtrain.shape[1],
            activation=activation,
          )
    )
    for units, dropout_rate in zip(n_units_per_layer[:n_layers], dropout_per_layer[:n_layers]):
      model.add(Dense(units, activation=activation))
      model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="sigmoid"))

    # The optimizer class is looked up by name in keras.optimizers.
    model.compile(
        loss="binary_crossentropy",
        optimizer=getattr(keras.optimizers, optimizer)(learning_rate=learning_rate),
        metrics=["accuracy"],
    )

    model.fit(
        xtrain,
        ytrain,
        batch_size=512, 
        epochs=epochs,
        validation_data=(xvalid, yvalid)
    )
    # The elapsed time covers loading and fitting, not prediction.
    elapsed = round(time.time() - start, 2)

    predicted = self._to_class_labels(model.predict(xvalid))
    # Names must follow the sorted encoded labels found in y_true or y_pred.
    target_names = self.lbl_enc.inverse_transform(np.union1d(yvalid, predicted))
    report = super().print_report(predicted, yvalid, target_names=target_names)
    self.save_model(model, self.scaler, self.lbl_enc)
    print("Time elapsed in seconds: ", elapsed)
    
    log_dict = self.log(self.model_savepath, self.dataset_name, model, len(xtrain)+len(xvalid), elapsed, epochs, nn_parameters, report)
    return log_dict

  
  def save_model(self, model, scaler, lbl_enc):
    """Ask for confirmation, then dump model, scaler and label encoder into one joblib file."""
    algo_name = type(model).__name__
    experiment_name = self.nb_name+"__"+self.dataset_name+"__"+algo_name+"_stilometria"
    save_confirmation = input(f"Save model {experiment_name}? (y/n)")
    if save_confirmation == "y":
      filepath = f'{self.model_savepath}models/stylo/{experiment_name}.pkl'
      data = {
          "model": model,
          "scaler": scaler,
          "lbl_enc": lbl_enc
      }
      print(f"Saving model to {filepath}")
      # NOTE(review): this pickles the Keras model through joblib; whether that
      # round-trips depends on the installed TensorFlow/Keras version — confirm.
      joblib.dump(data, filepath)
    else:
      print("Model wasn't saved")

  def load_model_and_predict(self, modelpath, X):
    """Load the saved model from ``modelpath`` and return 0/1 predictions for X."""
    # joblib.load unpickles the file: only load model files you trust.
    data = joblib.load(modelpath, mmap_mode=None)
    model = data["model"]
    return self._to_class_labels(model.predict(X))


  def evaluate_on_other_dataset(self, testdataset_path, modelpath, dropna=False, use_scaler=True):
    """Evaluate a saved model on another dataset and print the report.

    Args:
      testdataset_path: csv/xlsx/xls path or DataFrame with the same feature
        columns used for training plus the target column.
      modelpath: file written by ``save_model``.
      dropna: drop every row containing a NaN before evaluating.
      use_scaler: transform the features with the scaler stored in the model file.

    Raises:
      ValueError: if a file path does not end in csv, xlsx or xls.
      TypeError: if ``testdataset_path`` is neither a str nor a DataFrame.
    """
    start = time.time()
    if isinstance(testdataset_path, str):
      extension = testdataset_path.split(".")[-1]
      valid = {"csv", "xlsx", "xls"}
      if extension not in valid:
        raise ValueError(f"Unsupported dataset format '{extension}': must be one of {valid}.")
      if extension == "csv":
        testdataset = pd.read_csv(testdataset_path)
      else:
        testdataset = pd.read_excel(testdataset_path)
    elif isinstance(testdataset_path, pd.DataFrame):
      testdataset = testdataset_path
    else:
      raise TypeError(f"testdataset_path must be a str or a pandas DataFrame, got {type(testdataset_path).__name__}.")
    if dropna:
      print("DROPPING NAN")
      testdataset = testdataset.dropna(axis=0, how='any')
    X = testdataset.drop(self.target_col, axis=1)
    y = testdataset[self.target_col]

    # Read encoder and scaler from a single load of the file.
    # joblib.load unpickles the file: only load model files you trust.
    saved = joblib.load(modelpath, mmap_mode=None)
    lbl_enc = saved["lbl_enc"]
    scaler = saved["scaler"]
    encoded_y = lbl_enc.transform(y)

    if use_scaler:
      X = scaler.transform(X)

    predicted = self.load_model_and_predict(modelpath, X)
    # A set of raw labels has no defined order; derive the names from the
    # sorted encoded labels so each report row gets the right name.
    target_names = lbl_enc.inverse_transform(np.union1d(encoded_y, predicted))
    super().print_report(predicted, encoded_y, target_names)
    elapsed = round(time.time() - start, 2)
    print("Time elapsed in seconds: ", elapsed)


  def log(self, savepath, datasetname, model, dataset_len, elapsed, epochs, nn_parameters, report):
    """Write the run summary as JSON under ``<savepath>logs/stylo/`` and return it."""
    log_dict = {
        "library_used": type(self).__name__,
        "dataset_name": datasetname,
        "dataset_lenght": dataset_len,
        "elapsed": elapsed,
        "epochs": epochs,
        "nueral_net_parameters": nn_parameters,
        "metrics_report": report
    }
    algo_name = type(model).__name__
    experiment_name = self.nb_name+"__"+self.dataset_name+"__"+algo_name+"_stilometria"
    filepath = f'{savepath}logs/stylo/{experiment_name}_log.json'
    with open(filepath, 'w') as fp:
      json.dump(log_dict, fp)
      print("Log saved to ", filepath)
    return log_dict

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# NOTE(review): StyloExperiment.__init__ requires dataset_path, split_size and
# target_col, so this argument-less call raises TypeError as written.
exp = StyloExperiment()