In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Load the "encryption" variant of the prepared tweets.
# `prepared_tweets_encryption` is the Dataiku dataset handle; the `_df` name
# holds the pandas DataFrame returned by its get_dataframe() call.
prepared_tweets_encryption = dataiku.Dataset("prepared_tweets_encryption")
prepared_tweets_encryption_df = prepared_tweets_encryption.get_dataframe()

# Load the "removal" variant of the prepared tweets in the same way
# (dataset handle first, then its DataFrame).
prepared_tweets_removal = dataiku.Dataset("prepared_tweets_removal")
prepared_tweets_removal_df = prepared_tweets_removal.get_dataframe()

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import tensorflow as tf

def preprocess_data(X: pd.DataFrame) -> tuple:
    """
    Preprocesses the data by handling categorical variables with one-hot encoding,
    numerical columns with standard scaling, and text data with tokenization and padding.

    Parameters:
    X (pd.DataFrame): The data to preprocess. Must contain the columns
        'date', 'user', 'language' and 'text'; every other column is treated
        as a numerical feature.

    Returns:
    tuple: A pair ``(X_processed, X_text)`` where
        - X_processed (pd.DataFrame) holds the scaled numerical columns followed
          by the one-hot encoded categorical columns, on a fresh 0..n-1 index.
        - X_text is the 2-D integer array returned by ``pad_sequences``
          (one row per input row, 100 token ids per row) -- not a DataFrame.

    NOTE(review): the encoder, scaler and tokenizer are all fitted on the full
    input, i.e. before any train/test split performed by the caller, so test-set
    statistics leak into the features. Confirm this is acceptable.
    """
    # Handle categorical variables with one-hot encoding.
    # NOTE(review): 'date' and 'user' may be high-cardinality, which yields one
    # dense column per distinct value -- confirm this is intended.
    categorical_features = ['date', 'user', 'language']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_categorical = encoder.fit_transform(X[categorical_features])

    # Handle numerical columns with standard scaling: everything that is
    # neither categorical nor the raw text.
    numerical_features = X.drop(columns=categorical_features + ['text']).columns
    scaler = StandardScaler()
    X_numerical = scaler.fit_transform(X[numerical_features])

    # Handle text data with tokenization and padding.
    # The Keras tokenizer calls str methods on every entry, so a missing (NaN)
    # or otherwise non-string tweet would raise; normalise to plain strings first.
    texts = X['text'].fillna('').astype(str)
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(texts)
    X_text = tokenizer.texts_to_sequences(texts)
    X_text = pad_sequences(X_text, maxlen=100)

    # Concatenate processed numerical and categorical features. Both frames are
    # built from bare arrays, so they share a default RangeIndex and align row
    # by row (column labels are positional integers and repeat across the parts).
    X_processed = pd.concat([pd.DataFrame(X_numerical), pd.DataFrame(X_categorical)], axis=1)

    return X_processed, X_text

def apply_and_evaluate_deep_learning_model(X: pd.DataFrame, X_text: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    """
    Applies a deep learning model to the preprocessed data and evaluates its performance.

    The model has two inputs: the tabular features go straight into the dense
    stack, while the padded token ids go through an Embedding + LSTM branch
    whose output is concatenated with the tabular features.

    Parameters:
    X (pd.DataFrame): The preprocessed numerical and categorical data.
    X_text (pd.DataFrame): The preprocessed text data as a 2-D table of integer
        token ids (a NumPy array of the same shape is accepted as well).
    y (pd.Series): The binary labels (the model ends in a sigmoid unit trained
        with binary cross-entropy).

    Returns:
    pd.DataFrame: A one-row DataFrame containing the loss and accuracy of the model on the test data.
    """
    # Local import: the notebook's import cells do not bring numpy into scope.
    import numpy as np

    # Normalise the token matrix to an array so both DataFrame and ndarray
    # inputs behave the same below.
    X_text = np.asarray(X_text)
    # The embedding table must cover every token id present in the data.
    vocab_size = int(X_text.max()) + 1

    # Split the data into training and testing sets
    X_train, X_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
        X, X_text, y, test_size=0.2, random_state=42
    )

    def _to_dataset(tabular, tokens, labels):
        """Build a batched tf.data pipeline keyed by the model's input names."""
        inputs = {
            'tabular_features': tabular.values.astype('float32'),
            'text_tokens': tokens,
        }
        return tf.data.Dataset.from_tensor_slices((inputs, labels.values)).batch(32)

    # Convert processed data to batched TensorFlow datasets
    train_dataset = _to_dataset(X_train, X_text_train, y_train)
    test_dataset = _to_dataset(X_test, X_text_test, y_test)

    # Initialize the model. A plain Sequential stack accepts a single input and
    # cannot consume the (tabular, text) pair the datasets yield, so the
    # functional API is used to declare both inputs explicitly.
    tabular_input = tf.keras.Input(shape=(X_train.shape[1],), name='tabular_features')
    text_input = tf.keras.Input(shape=(X_text.shape[1],), dtype='int32', name='text_tokens')

    # Text branch: token ids -> dense vectors -> sequence summary
    text_branch = Embedding(input_dim=vocab_size, output_dim=64)(text_input)
    text_branch = LSTM(64)(text_branch)

    # Shared dense head over tabular features + text summary
    hidden = tf.keras.layers.Concatenate()([tabular_input, text_branch])
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.Model(
        inputs={'tabular_features': tabular_input, 'text_tokens': text_input},
        outputs=output,
    )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    # NOTE(review): the test split doubles as validation data here, so the
    # reported test metrics are not from data unseen during training monitoring.
    model.fit(train_dataset, epochs=10, validation_data=test_dataset, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(test_dataset, verbose=0)

    # Save metrics into a DataFrame
    metrics_df = pd.DataFrame({'loss': [loss], 'accuracy': [accuracy]})

    return metrics_df


In [0]:
# Model input columns, grouped by how they are preprocessed downstream.
# (The target column is selected separately in the evaluation cells.)
features = (
    # categorical columns
    ['date', 'user', 'language']
    # numerical / flag columns
    + [
        'tweet_length_chars',
        'tweet_length_words',
        'repetitive_letters',
        'mention_only',
        'unreadable',
        'too_many_numbers',
    ]
    # raw tweet text
    + ['text']
)


In [0]:
# Evaluate model on encrypted data
# Select from the loaded DataFrame (`*_df`), not from the Dataiku dataset
# handle, which is not the pandas object the column selection needs.
X_encrypted = prepared_tweets_encryption_df[features]
y_encrypted = prepared_tweets_encryption_df['label']

# The evaluation function expects already-preprocessed tabular features plus
# the tokenized text, so run the preprocessing step first.
X_encrypted_processed, X_encrypted_text = preprocess_data(X_encrypted)
metrics_encrypted = apply_and_evaluate_deep_learning_model(
    X_encrypted_processed, X_encrypted_text, y_encrypted
)
print("Metrics on encrypted data:")
metrics_encrypted.head()


In [0]:
# Evaluate model on removed data
# Select from the loaded DataFrame (`*_df`), not from the Dataiku dataset
# handle, which is not the pandas object the column selection needs.
X_removed = prepared_tweets_removal_df[features]
y_removed = prepared_tweets_removal_df['label']

# The evaluation function expects already-preprocessed tabular features plus
# the tokenized text, so run the preprocessing step first.
X_removed_processed, X_removed_text = preprocess_data(X_removed)
metrics_removed = apply_and_evaluate_deep_learning_model(
    X_removed_processed, X_removed_text, y_removed
)
print(f"Metrics on removed data:\n{metrics_removed}")
