# Packages

In [0]:
# import dataiku
# from dataiku import pandasutils as pdu

import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay  
from sklearn.metrics import classification_report  

import matplotlib.pyplot as plt  # Added import for plt

from datetime import datetime
import os

from edf_commons.modelling import preprocess_data_for_dl


# Variables

In [0]:
# Sentiment label -> integer class id, listed from most negative to most positive.
LABEL_MAPPING = {
    'very negative': 0,
    'negative': 1,
    'neutral': 2,
    'positive': 3,
    'very positive': 4,
}
# Reverse lookup: integer class id -> sentiment label.
INDEX_MAPPING = dict(zip(LABEL_MAPPING.values(), LABEL_MAPPING.keys()))


# Input

In [0]:
# `import dataiku` is commented out in the Packages cell, so it is imported here;
# without it the Dataset call below raises NameError on a fresh kernel.
import dataiku

# Read the "tweets_train" Dataiku dataset into a pandas DataFrame.
tweets_train = dataiku.Dataset("tweets_train")
df = tweets_train.get_dataframe()

# Work on a sample of at most 5000 rows. Capping at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist (without replacement), and
# the fixed random_state makes the sample identical across Restart & Run All.
df = df.sample(n=min(5000, len(df)), random_state=42)

# Deep Learning

In [0]:
# Build the model inputs: the two tweet-length columns and the raw text are the features,
# the 'label' column is the target. preprocess_data_for_dl returns the transformed (X, y).
X, y = preprocess_data_for_dl(
    df[['tweet_length_chars', 'tweet_length_words', 'text']],
    df['label'],
)


In [0]:
def apply_and_evaluate_deep_learning_model(
    X: pd.DataFrame,
    y: pd.Series,
    epochs: int = 2,
    batch_size: int = 32,
    figure_path: str = 'graphs/confusion_matrix.png',
) -> tuple[pd.DataFrame, tf.keras.callbacks.History, pd.DataFrame]:
    """
    Trains an Embedding + stacked-LSTM classifier on the preprocessed data and evaluates it
    on a 20% hold-out split.

    Side effects: plots the confusion matrix, saves it to `figure_path` (creating the parent
    directory if needed) and prints the classification report dictionary.

    Parameters:
    X (pd.DataFrame): The preprocessed features fed to the model.
    y (pd.Series): The labels. Assumed to be integer class ids starting at 0, as required
        by the sparse categorical cross-entropy loss.
    epochs (int): Number of training epochs. Defaults to 2.
    batch_size (int): Batch size for the training and test datasets. Defaults to 32.
    figure_path (str): Where the confusion-matrix image is written.

    Returns:
    tuple: (metrics_df, history, report_df) where
        metrics_df (pd.DataFrame): one row with 'average_loss' and 'average_accuracy'
            measured on the hold-out split,
        history (tf.keras.callbacks.History): the object returned by model.fit,
        report_df (pd.DataFrame): the scikit-learn classification report, one row per
            class id plus the aggregate rows.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Size the output layer from the largest label id instead of the number of distinct
    # labels: if a class is absent from the (sampled) data, len(y.unique()) would be too
    # small and the highest label would fall outside the softmax range.
    num_classes = int(y.max()) + 1
    class_labels = list(range(num_classes))

    # Convert processed data to batched TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values)).batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values)).batch(batch_size)

    # Initialize the model
    # NOTE(review): Embedding(input_dim=10000) assumes every value in X is an integer token
    # id in [0, 10000) - confirm against what preprocess_data_for_dl produces.
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128),
        LSTM(64, return_sequences=True),
        Dropout(0.5),
        LSTM(64),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # Softmax over all classes (multi-class sentiment)
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    # NOTE(review): the hold-out split doubles as validation data, so the validation curves
    # and the final evaluation are computed on the same rows.
    history = model.fit(train_dataset, epochs=epochs, validation_data=test_dataset, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(test_dataset, verbose=0)

    # Get the predictions and turn the class probabilities into class ids
    y_pred = model.predict(X_test.values)
    y_pred_classes = y_pred.argmax(axis=1)

    # Display the confusion matrix; passing `labels` fixes the row/column order
    cm = confusion_matrix(y_test, y_pred_classes, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot()

    # Save the image, creating the target directory first so savefig cannot fail on a
    # missing folder
    figure_dir = os.path.dirname(figure_path)
    if figure_dir:
        os.makedirs(figure_dir, exist_ok=True)
    plt.savefig(figure_path)

    # Generate the report. `labels` is given explicitly so rows are in sorted class order
    # and the report does not break when a class is missing from y_test; y_test.unique()
    # returns values in order of appearance, which mislabelled the rows.
    report_dict = classification_report(y_test, y_pred_classes, labels=class_labels, output_dict=True)
    print(report_dict)

    # Convert it to a DataFrame
    report_df = pd.DataFrame(report_dict).transpose()

    # Save metrics into a DataFrame
    metrics_df = pd.DataFrame({'average_loss': [loss], 'average_accuracy': [accuracy]})

    return metrics_df, history, report_df


In [0]:
# Train the model and collect the hold-out metrics, the Keras training history and the
# classification report.
# NOTE(review): the original comment describes this input as "encrypted data" - confirm.
metrics, history, report_df = apply_and_evaluate_deep_learning_model(X=X, y=y)


In [0]:
# Show the hold-out loss and accuracy returned by the evaluation call.
# The string has no placeholders, so it is a plain literal rather than an f-string.
print("Metrics on encrypted data:")
metrics.head()


In [0]:
# Display the first five rows of the classification report.
report_df.head(5)

In [0]:
# Plot every metric recorded in the Keras training history, one line per metric.
training_curves = pd.DataFrame(data=history.history)
training_curves.plot()