In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from hyperopt import tpe, STATUS_OK, Trials, hp, fmin

#import mlflow
#mlflow.set_experiment("Mlflow BERT")

import transformers
import time

In [12]:
import tensorflow as tf
# import tensorflow_hub as hub
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

# Bert
import os

# Sets the TF_KERAS environment variable for this process.
# NOTE(review): presumably meant for a keras-bert style helper package that
# switches to tf.keras when this flag is set — no visible cell reads it, and
# it is set after tensorflow has already been imported. Confirm it is still needed.
os.environ["TF_KERAS"]='1'

In [None]:
# Sentence preparation helper
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    """Tokenize sentences one by one and stack the encoded fields.

    Args:
        sentences: iterable of texts; each item is passed to
            ``bert_tokenizer.encode_plus`` unchanged.
        bert_tokenizer: object exposing ``encode_plus`` and returning a mapping
            with 'input_ids', 'token_type_ids' and 'attention_mask'.
        max_length: forwarded as ``max_length``; together with
            ``padding='max_length'`` and ``truncation=True`` it is the length
            requested for every encoded sentence.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask, bert_inp_tot)``:
        the first three are numpy arrays with one row per sentence, and
        ``bert_inp_tot`` is the list of per-sentence
        ``(input_ids, token_type_ids, attention_mask)`` triples.
    """
    # Encoding options are identical for every sentence, so build them once.
    encode_options = dict(add_special_tokens = True,
                          max_length = max_length,
                          padding='max_length',
                          return_attention_mask = True,
                          return_token_type_ids=True,
                          truncation=True,
                          return_tensors="tf")

    bert_inp_tot = []
    for sent in sentences:
        encoded = bert_tokenizer.encode_plus(sent, **encode_options)
        # Each field is indexed with [0]: the first (presumably only) row of
        # the tensor returned for a single sentence.
        bert_inp_tot.append((encoded['input_ids'][0],
                             encoded['token_type_ids'][0],
                             encoded['attention_mask'][0]))

    # Split the triples back into one array per field.
    input_ids = np.asarray([triple[0] for triple in bert_inp_tot])
    token_type_ids = np.asarray([triple[1] for triple in bert_inp_tot])
    attention_mask = np.array([triple[2] for triple in bert_inp_tot])

    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Feature-creation helper
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    """Encode sentences in batches with a transformer model and mean-pool the result.

    Args:
        model: object whose ``predict`` accepts
            ``[input_ids, attention_mask, token_type_ids]`` and returns an
            output exposing ``last_hidden_state``.
        model_type: name or path handed to
            ``transformers.AutoTokenizer.from_pretrained``.
        sentences: iterable of texts (list, pandas Series, ...). It is
            materialised to a list so batches are always cut by position.
        max_length: tokenizer length forwarded to ``bert_inp_fct``.
        b_size: number of sentences per batch; also used as the ``predict``
            batch size. Must be positive.
        mode: only ``'HF'`` (Hugging Face model) is implemented.

    Returns:
        Tuple ``(features_bert, last_hidden_states_tot)``:
        ``last_hidden_states_tot`` is the concatenation of every batch's
        ``last_hidden_state`` and ``features_bert`` is its mean over axis 1
        (assumes hidden states are (n_sentences, seq_len, hidden) — TODO confirm).

    Raises:
        ValueError: if ``mode`` is not ``'HF'``, ``b_size`` is not positive, or
            ``sentences`` is empty.
    """
    # Fail early and explicitly: the original hit a NameError on any other mode.
    if mode != 'HF':
        raise ValueError(f"unsupported mode {mode!r}; only 'HF' is implemented")
    if b_size <= 0:
        raise ValueError(f"b_size must be a positive integer, got {b_size!r}")

    batch_size = b_size
    batch_size_pred = b_size

    # Positional list: avoids depending on pandas Series slicing semantics.
    sentences = list(sentences)
    if not sentences:
        raise ValueError("sentences is empty; nothing to encode")

    bert_tokenizer = transformers.AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    # Step through the data so the final, possibly shorter, batch is included
    # (len // batch_size dropped it, and produced no batch at all when
    # len(sentences) < batch_size).
    hidden_state_batches = []
    for idx in range(0, len(sentences), batch_size):
        input_ids, token_type_ids, attention_mask, _ = bert_inp_fct(sentences[idx:idx+batch_size],
                                                                    bert_tokenizer, max_length)

        outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
        hidden_state_batches.append(np.asarray(outputs.last_hidden_state))

    # Concatenate once instead of re-copying the growing array on every batch.
    last_hidden_states_tot = np.concatenate(hidden_state_batches)

    # NOTE(review): plain mean over axis 1; sentences are padded to max_length
    # upstream, so padded positions are averaged in as well.
    features_bert = last_hidden_states_tot.mean(axis=1)

    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)

    return features_bert, last_hidden_states_tot

In [14]:
# Load the preprocessed tweets and keep a 1000-row random subset to work on.
df = pd.read_csv('preprocessed_tweet.csv')
# Seeded so a fresh "Restart & Run All" works on the same 1000 rows
# (42 matches the random_state used elsewhere in this notebook).
df = df.sample(n=1000, random_state=42)
df.head()

Unnamed: 0,Target,Tweet_preprocessed,Tweet_preprocessed_dl
1143363,4,im so sleepy still not at home but we are done...,i'm so sleepy! still not at home but we are do...
1151624,4,dad give you a breaknever you wouldnt love me...,dad give you a break...never! .... you wouldn'...
1095989,4,be out there dont worry im done w my elongate...,"be out there, dont worry, im done w my elon..."
1220694,4,im on my way to the beach,i'm on my way to the beach!
1569897,4,wowhungarynice to know u here,wow..hungary...nice to know u here.


In [15]:
# Sequence length and batch size settings.
# NOTE(review): presumably meant as the max_length / b_size arguments of
# feature_BERT_fct — no call to it is visible in this notebook.
max_length = 64
batch_size = 100
# Hugging Face checkpoint name, used here to load the model weights.
model_type = 'cardiffnlp/twitter-roberta-base-sentiment'
# Loads the checkpoint through TFAutoModel; the recorded load log reports a
# TFRobertaModel with the checkpoint's 'classifier' layers left unused.
model = transformers.TFAutoModel.from_pretrained(model_type)
# Text column to encode (a column of df, not a copy).
sentences = df['Tweet_preprocessed_dl']

Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [16]:
# Sanity check: number of sentences selected for encoding.
len(sentences)

1000

In [17]:
# Sanity check: (rows, columns) of the sampled frame.
df.shape

(1000, 3)

In [None]:
# Imported from tensorflow.keras rather than standalone keras, so every layer
# comes from the same Keras implementation as the rest of this notebook.
# Sequential was used here without ever being imported.
from tensorflow.keras.layers import Bidirectional, Dense, Input, LSTM, RepeatVector, TimeDistributed
from tensorflow.keras.models import Sequential

# NOTE(review): this cell indexes X_train.shape[2] and Y_train.shape[1], i.e. it
# expects a 3-D X_train and an at-least-2-D Y_train. No visible cell defines
# arrays of that shape (Y_train is never defined) — confirm where they come from.
# NOTE(review): rebinding `model` replaces the transformer loaded earlier.
model = Sequential(name="LSTM-Model") # Model
model.add(Input(shape=(X_train.shape[1],X_train.shape[2]), name='Input-Layer')) # Input Layer - need to specify the shape of inputs
model.add(Bidirectional(LSTM(units=32, activation='tanh', recurrent_activation='sigmoid', stateful=False), name='Hidden-LSTM-Encoder-Layer')) # Encoder Layer
model.add(RepeatVector(Y_train.shape[1], name='Repeat-Vector-Layer')) # Repeat the encoding once per output time step
model.add(Bidirectional(LSTM(units=32, activation='tanh', recurrent_activation='sigmoid', stateful=False, return_sequences=True), name='Hidden-LSTM-Decoder-Layer')) # Decoder Layer
model.add(TimeDistributed(Dense(units=1, activation='linear'), name='Output-Layer')) # Output Layer, Linear(x) = x

##### Step 4 - Compile the model
# Only the non-default options are passed; loss_weights, weighted_metrics,
# run_eagerly and steps_per_execution were explicit None and are left at the
# library defaults.
model.compile(optimizer='adam', # algorithm used for the weight updates
              loss='mean_squared_error', # loss function to be optimized
              metrics=['MeanSquaredError', 'MeanAbsoluteError'], # metrics evaluated during training and testing
             )

In [22]:
# Call the model once per element of encoded_input and collect the results.
# NOTE(review): encoded_input is not defined in any visible cell — confirm
# which cell (possibly deleted) produced it.
output = [model(feature) for feature in encoded_input]

In [23]:
# Inspect the first result. The recorded output is a TFSequenceClassifierOutput
# with logits of shape (1, 3).
# NOTE(review): that suggests `model` was a sequence-classification model when
# this ran, not the TFAutoModel loaded above — confirm.
output[0]

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[-2.170981 , -0.8745634,  3.8376608]], dtype=float32)>, hidden_states=None, attentions=None)

In [None]:
# Deliberate halt: keeps "Run All" from continuing into the cells below.
# Written as an explicit raise (instead of invalid syntax) so the notebook
# still parses when linted or exported to a script.
raise RuntimeError("stop here")

In [None]:
def objective(params):
    """Hyperopt objective: cross-validated accuracy of a logistic regression.

    Fits ``LogisticRegression(**params, random_state=42)`` with 5-fold
    cross-validation on the notebook-level ``X_train`` / ``y_train`` and logs
    the rounded mean accuracy and the parameters to a nested MLflow run.

    Args:
        params: keyword arguments for ``LogisticRegression``.

    Returns:
        Dict with ``'loss'`` (1 - mean accuracy rounded to 3 decimals),
        ``'params'`` and ``'status'`` (``STATUS_OK``).
    """
    # NOTE(review): `import mlflow` is commented out in the first cell of this
    # notebook; it must be restored before this function can run.
    with mlflow.start_run(nested=True):
        classifier = LogisticRegression(**params, random_state=42)

        fold_scores = cross_val_score(
            classifier,
            X_train,
            y_train,
            cv=5,
            n_jobs=-1,
            scoring='accuracy',
        )

        # The divisor 5 is the number of folds requested above.
        average_score = round(sum(fold_scores) / 5, 3)

        mlflow.log_metric('Accuracy', average_score)
        mlflow.log_params(params)

        # Hyperopt minimises, so accuracy is turned into a loss.
        result = {'loss': 1 - average_score, 'params': params, 'status': STATUS_OK}
        return result

In [None]:
# NOTE(review): unfinished draft of another objective(), kept inside a bare
# string literal so its contents are never executed. Finish it or delete the cell.
'''
def objective(params):
    with mlflow.start_run(nested=True):
    
'''

In [None]:
# Hyperopt search space handed to fmin(); each sampled dict is passed to
# objective(), which forwards it as keyword arguments to LogisticRegression.
space = {
    # sampled uniformly between 1e-5 and 1e-4
    'tol': hp.uniform('tol', 0.00001, 0.0001),
    # sampled uniformly between 0.05 and 3
    'C': hp.uniform('C', 0.05, 3)
}

In [None]:
# Placeholder: empty string that no visible cell reads.
model_name = ""

In [None]:
# Labels and raw tweet text, taken as copies of the df columns.
y = df['Target'].copy()
X = df['Tweet_preprocessed_dl'].copy()

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the split.
# NOTE(review): no stratify= is passed, so label proportions are not enforced
# across the two splits — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Preview the first training texts.
X_train.head()

In [None]:
# Show the training rows whose text is missing (NaN); an empty result means none.
X_train[X_train.isna()]

In [None]:
# Which vectorizer to use; must be a key of `vectorizers` below.
vectorizer_name = "tfidf"

# Options shared by both vectorizer types.
vectorizer_params = {"stop_words": "english", "max_df": 0.95, "min_df": 1}

vectorizers = {
    'count-vectorizer': CountVectorizer(**vectorizer_params),
    'tfidf': TfidfVectorizer(**vectorizer_params)
}

vectorizer = vectorizers[vectorizer_name]

# NOTE(review): X_train / X_test are rebound from raw text to feature matrices,
# so this cell is not idempotent — re-running it without re-running the split
# cell would feed the matrices back into the vectorizer.
# Fit on the training split only, in a single pass (fit_transform replaces the
# separate fit + transform, which processed the training text twice).
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Search algorithm passed to fmin (hyperopt's tpe.suggest).
tpe_algorithm = tpe.suggest

# Trials object that fmin fills with the evaluations.
bayes_trials = Trials()

# Parent MLflow run; objective() opens one nested run per evaluation.
# NOTE(review): `import mlflow` is commented out in the first cell of this notebook.
# NOTE(review): fmin is not given a random state, so the 10 evaluations are
# presumably not reproducible across runs — confirm and seed if needed.
with mlflow.start_run(run_name="hyperopt_logistic") as run:
    best = fmin(fn=objective, space=space, algo=tpe_algorithm, max_evals=10, trials=bayes_trials)
    # Cast each value to a plain float before logging the dict.
    best = {key:float(value) for key, value in best.items()}
    
    
    mlflow.log_dict(best, "best_params.json")

In [None]:
# Show the best hyper-parameters found by the search.
print(best)

In [None]:
from time import perf_counter
from itertools import product

# Grid of LogisticRegression settings; every (tol, C) pair is tried.
params = {
    'tol': [0.00001, 0.0001],
    'C': [0.05, 0.1, 0.5, 1]
}


# NOTE(review): `import mlflow` is commented out in the first cell of this notebook.
with mlflow.start_run():
    # Constant for the whole grid, so it is logged once on the parent run.
    mlflow.log_params({"vectorizer": vectorizer_name})

    for t, c in product(params['tol'], params['C']):
        # One nested run per combination: previously every fit logged into the
        # same run and tol/C were never recorded, so a logged accuracy could
        # not be traced back to its hyper-parameters.
        with mlflow.start_run(nested=True):
            mlflow.log_params({"vectorizer": vectorizer_name, "tol": t, "C": c})

            print(f"training model with params: tol:{t}, C:{c}")

            model = LogisticRegression(tol=t, C=c, max_iter=1000)

            # The timed section covers both fitting and prediction.
            start = perf_counter()

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)

            end = perf_counter()

            duration = round(end - start, 2)

            score = accuracy_score(y_test, y_pred)

            print(f"{score=}")
            mlflow.log_metric("accuracy", score)
            mlflow.log_metric("duration", duration)

In [None]:
# Log the BERT model evaluation under its own MLflow experiment.
# NOTE(review): `import mlflow` is commented out in the first cell, bert_model
# is not defined in any visible cell, and perf_counter is imported by the
# grid-search cell above.
mlflow.set_experiment("Bert_model")

with mlflow.start_run():
    #mlflow.log_params({"vectorizer": vectorizer_name})

    # The timed section covers both fitting and prediction.
    start = perf_counter()

    # The original line was a bare `bert_model.fit` (attribute access, no call),
    # so nothing was trained before scoring.
    # NOTE(review): arguments are assumed from the predict(X_test) call below
    # and the logistic-regression cell — confirm what bert_model.fit expects.
    bert_model.fit(X_train, y_train)

    # NOTE(review): accuracy_score needs class labels; confirm that
    # bert_model.predict returns labels rather than scores or logits.
    y_pred = bert_model.predict(X_test)

    end = perf_counter()

    duration = round(end - start, 2)

    score = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", score)
    mlflow.log_metric("duration", duration)