In [0]:
import dataiku
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [0]:
# Dataset prepared_tweets_encryption renamed to prepared_tweets by anne-soline.guilbert-ly@dataiku.com on 2025-03-27 14:30:30
# NOTE(review): the rename note above does not match the dataset actually read
# below ("tweets_train") - confirm which dataset is intended.
prepared_tweets_encryption = dataiku.Dataset("tweets_train")

df = prepared_tweets_encryption.get_dataframe()
# Work on a random subset of at most 5000 rows. Capping at len(df) avoids the
# ValueError that sample() raises when asked for more rows than exist, and the
# fixed seed makes the subset (and every downstream metric) reproducible.
df = df.sample(n=min(5000, len(df)), random_state=42)

In [0]:
def preprocess_data(
    X: pd.DataFrame,
    y: pd.Series,
    num_words: int = 5000,
    max_len: int = 100,
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Turns raw tweet features and labels into purely numeric, model-ready data.

    The two numerical columns are standard-scaled, the 'text' column is
    tokenized and padded/truncated to a fixed length, and the labels are
    integer-encoded.

    NOTE(review): the scaler, tokenizer and label encoder are all fitted on
    every row passed in. If the result is split into train/test afterwards,
    statistics from the test rows have already influenced the features.
    Fixing that requires fitting on the training rows only, which would change
    this function's interface, so it is only flagged here.

    Parameters:
    X (pd.DataFrame): Must contain the columns 'tweet_length_chars',
        'tweet_length_words' and 'text'.
    y (pd.Series): The labels to encode.
    num_words (int): Vocabulary cap passed to the Keras Tokenizer.
    max_len (int): Length every token sequence is padded or truncated to.

    Returns:
    pd.DataFrame: One row per input row, in input order, with a fresh 0..n-1
        index. Columns are the two scaled numerical features followed by
        max_len token-id columns labelled 0..max_len-1.
    pd.Series: The integer-encoded labels, with a fresh 0..n-1 index.
    """

    # Handle numerical columns with standard scaling
    numerical_features = ['tweet_length_chars', 'tweet_length_words']
    scaled_numeric = pd.DataFrame(
        StandardScaler().fit_transform(X[numerical_features]),
        columns=numerical_features,
    )

    # Handle text data with tokenization and padding. The string cast is done
    # once so non-string values (e.g. missing text) cannot break the tokenizer.
    texts = X['text'].astype(str)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    token_ids = pd.DataFrame(
        pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
    )

    # Both frames carry a fresh RangeIndex, so the column-wise concat lines up
    # row by row regardless of the index X came in with.
    X_processed = pd.concat([scaled_numeric, token_ids], axis=1)

    y_encoded = pd.Series(LabelEncoder().fit_transform(y))

    return X_processed, y_encoded

def _split_model_inputs(X: pd.DataFrame, sequence_length: int) -> dict:
    """Splits a feature frame into the named arrays fed to the model's input branches."""
    # The last `sequence_length` columns are the padded token ids; anything
    # before them is treated as already-scaled numerical features.
    inputs = {'tokens': X.iloc[:, -sequence_length:].to_numpy(dtype='int32')}
    if X.shape[1] > sequence_length:
        inputs['numeric'] = X.iloc[:, :-sequence_length].to_numpy(dtype='float32')
    return inputs


def _build_classifier(sequence_length: int, num_numeric: int, num_classes: int) -> tf.keras.Model:
    """Builds and compiles the LSTM text classifier, with an optional numeric side input."""
    token_input = tf.keras.Input(shape=(sequence_length,), dtype='int32', name='tokens')
    model_inputs = {'tokens': token_input}

    # Text branch: only token ids go through the embedding.
    features = Embedding(input_dim=10000, output_dim=128)(token_input)
    features = LSTM(64, return_sequences=True)(features)
    features = Dropout(0.5)(features)
    features = LSTM(64)(features)

    # Numeric branch: scaled floats are appended after the recurrent layers
    # instead of being (wrongly) interpreted as token ids.
    if num_numeric > 0:
        numeric_input = tf.keras.Input(shape=(num_numeric,), dtype='float32', name='numeric')
        model_inputs['numeric'] = numeric_input
        features = tf.keras.layers.Concatenate()([features, numeric_input])

    features = Dense(32, activation='relu')(features)
    features = Dropout(0.5)(features)
    # Softmax over integer-encoded classes, paired with the sparse
    # categorical cross-entropy loss below.
    outputs = Dense(num_classes, activation='softmax')(features)

    model = tf.keras.Model(inputs=model_inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.0005),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def apply_and_evaluate_deep_learning_model(
    X: pd.DataFrame,
    y: pd.Series,
    sequence_length: int = 100,
) -> tuple[pd.DataFrame, tf.keras.callbacks.History, pd.DataFrame]:
    """
    Trains an LSTM classifier on the preprocessed data and evaluates it on a held-out split.

    The last `sequence_length` columns of X are treated as padded token ids and
    fed through an embedding + LSTM stack; any preceding columns are treated as
    numerical features and concatenated to the LSTM output before the dense
    layers.

    Side effect: writes the confusion-matrix plot to 'confusion_matrix.png' in
    the current working directory.

    Parameters:
    X (pd.DataFrame): Preprocessed features: numerical columns first, then
        `sequence_length` token-id columns.
    y (pd.Series): Integer-encoded labels, row-aligned with X.
    sequence_length (int): Number of trailing token-id columns in X.

    Returns:
    pd.DataFrame: A single-row DataFrame with the 'loss' and 'accuracy' of the model on the test data.
    tf.keras.callbacks.History: The training history returned by model.fit.
    pd.DataFrame: The scikit-learn classification report, one row per class/average.

    Raises:
    ValueError: If sequence_length is not positive or X has fewer columns than sequence_length.
    """
    if sequence_length <= 0 or X.shape[1] < sequence_length:
        raise ValueError(
            f"X has {X.shape[1]} columns but sequence_length={sequence_length}; "
            "expected at least sequence_length token columns."
        )

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    num_classes = len(y.unique())
    num_numeric = X.shape[1] - sequence_length

    # Convert processed data to batched TensorFlow datasets keyed by input name
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (_split_model_inputs(X_train, sequence_length), y_train.values)
    ).batch(32)
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (_split_model_inputs(X_test, sequence_length), y_test.values)
    ).batch(32)

    # Initialize and compile the model
    model = _build_classifier(sequence_length, num_numeric, num_classes)

    # Train the model.
    # NOTE(review): the test split doubles as validation data; it is only
    # monitored here (no early stopping / tuning), so it does not affect training.
    history = model.fit(train_dataset, epochs=10, validation_data=test_dataset, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(test_dataset, verbose=0)

    # Get the predictions. test_dataset is not shuffled, so the prediction
    # order matches y_test.
    y_pred = model.predict(test_dataset)
    y_pred_classes = y_pred.argmax(axis=1)

    # Display the confusion matrix
    cm = confusion_matrix(y_test, y_pred_classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()

    # Save the image via the display's own figure rather than pyplot's
    # implicit "current figure".
    disp.figure_.savefig('confusion_matrix.png')

    # Generate the classification report
    report_dict = classification_report(y_test, y_pred_classes, output_dict=True)

    # Convert it to a DataFrame
    report_df = pd.DataFrame(report_dict).transpose()

    # Save metrics into a DataFrame
    metrics_df = pd.DataFrame({'loss': [loss], 'accuracy': [accuracy]})

    return metrics_df, history, report_df


In [0]:
# Columns handed to preprocess_data: the two length features plus the raw text.
feature_columns = ['tweet_length_chars', 'tweet_length_words', 'text']

# Preprocess features and labels together, then train and evaluate the model.
X_encrypted, y_encrypted = preprocess_data(df[feature_columns], df['label'])
metrics_encrypted, history, report_df = apply_and_evaluate_deep_learning_model(
    X_encrypted, y_encrypted
)

print("Metrics on encrypted data:")
metrics_encrypted.head()
