In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf

def preprocess_data(
    X: pd.DataFrame,
    encoder: "OneHotEncoder | None" = None,
    scaler: "StandardScaler | None" = None,
    fit: bool = True,
) -> pd.DataFrame:
    """
    Preprocesses the data by one-hot encoding the categorical columns and
    standard-scaling every remaining column.

    Parameters:
    X (pd.DataFrame): The data to preprocess. Must contain the columns
        'date', 'user' and 'language'; all other columns are treated as
        numerical.
    encoder (OneHotEncoder | None): Encoder to use. When None, a new
        ``OneHotEncoder(sparse_output=False, handle_unknown='ignore')`` is
        created.
    scaler (StandardScaler | None): Scaler to use. When None, a new
        ``StandardScaler()`` is created.
    fit (bool): When True (default) the transformers are fitted on ``X``
        before transforming it. When False, the supplied transformers are
        only applied, so a held-out split is encoded exactly like the split
        they were fitted on.

    Returns:
    pd.DataFrame: The processed data, numerical columns first, followed by
        the one-hot columns. Rows keep the order of ``X`` but the index is
        reset to a default RangeIndex.

    Raises:
    ValueError: If ``fit`` is False and ``encoder`` or ``scaler`` is missing.
    """
    categorical_features = ['date', 'user', 'language']

    # Transform-only mode is meaningless without already-fitted transformers.
    if not fit and (encoder is None or scaler is None):
        raise ValueError(
            "fit=False requires already-fitted `encoder` and `scaler` arguments."
        )

    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    if scaler is None:
        scaler = StandardScaler()

    numerical_features = X.drop(columns=categorical_features).columns

    processed_parts = []

    # Handle numerical columns with standard scaling. Skipped when there are
    # none, because the scaler rejects input with zero features.
    if not numerical_features.empty:
        if fit:
            X_numerical = scaler.fit_transform(X[numerical_features])
        else:
            X_numerical = scaler.transform(X[numerical_features])
        processed_parts.append(pd.DataFrame(X_numerical, columns=numerical_features))

    # Handle categorical variables with one-hot encoding
    if fit:
        X_categorical = encoder.fit_transform(X[categorical_features])
    else:
        X_categorical = encoder.transform(X[categorical_features])
    processed_parts.append(
        pd.DataFrame(
            X_categorical,
            # Named columns avoid the duplicate integer labels that two
            # unnamed frames would produce once concatenated.
            columns=encoder.get_feature_names_out(categorical_features),
        )
    )

    # Concatenate processed numerical and categorical features
    X_processed = pd.concat(processed_parts, axis=1)

    return X_processed


def apply_and_evaluate_deep_learning_model(X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
    """
    Splits the data, preprocesses it, trains a small feed-forward binary
    classifier and evaluates it on the held-out split.

    Parameters:
    X (pd.DataFrame): The raw features; they are preprocessed inside this
        function with ``preprocess_data``.
    y (pd.Series): The labels. The model ends in a single sigmoid unit and is
        trained with binary cross-entropy.

    Returns:
    pd.DataFrame: A one-row DataFrame with the columns 'loss' and 'accuracy'
        measured on the test data.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the transformers on the training split only and reuse them for the
    # test split. Fitting them separately per split would give the two
    # matrices different one-hot columns (and different scaling), so the test
    # data would no longer match the model's input shape.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()
    X_train_processed = preprocess_data(X_train, encoder=encoder, scaler=scaler, fit=True)
    X_test_processed = preprocess_data(X_test, encoder=encoder, scaler=scaler, fit=False)
    input_shape = X_train_processed.shape[1]

    # Convert processed data to TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_processed.values, y_train.values))
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test_processed.values, y_test.values))

    # Batch the datasets
    train_dataset = train_dataset.batch(32)
    test_dataset = test_dataset.batch(32)

    # Initialize the model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model. The test split doubles as validation data here, so the
    # per-epoch validation metrics are computed on the same rows as the final
    # evaluation below.
    model.fit(train_dataset, epochs=10, validation_data=test_dataset, verbose=1)

    # Evaluate the model
    loss, accuracy = model.evaluate(test_dataset, verbose=0)

    # Save metrics into a DataFrame
    metrics_df = pd.DataFrame({'loss': [loss], 'accuracy': [accuracy]})

    return metrics_df


In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

In [0]:
# Example: load a DSS dataset as a Pandas dataframe
# `mydataset` is the dataset handle; `mydataset_df` holds the result of
# calling `get_dataframe()` on it.
# NOTE(review): "mydataset" looks like a template placeholder - confirm the
# actual dataset name before running this cell.
mydataset = dataiku.Dataset("mydataset")
mydataset_df = mydataset.get_dataframe()