## Import Packages

In [1]:
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import pandas as pd
import sys
import os

from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData
from detector.labeler import EnronLabeler, MismatchLabeler
from ethics.differential_privacy import RandomForestPrivacyModel
from detector.preprocessor import Preprocessor
from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse


[nltk_data] Downloading package stopwords to
[nltk_data]     /common/home/ps1279/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Init wandb for model tracking

In [2]:
# Credentials and the run destination are read from the environment so that
# nothing secret is stored in the notebook itself.
wandbdict = dict(
    key=os.getenv('WANDB_API_KEY'),
    entity=os.getenv('WANDB_ENTITY'),
    project=os.getenv('WANDB_PROJECT'),
)

# Authenticate first, then open the run that the rest of the notebook logs to.
wandb.login(key=wandbdict['key'])
run = wandb.init(
    entity=wandbdict['entity'],
    project=wandbdict['project'],
)

12/05/2023 23:39:06:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33madvaithrao[0m ([33mregressors[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /common/home/ps1279/.netrc


## Load Data

In [3]:
# Load the combined dataset. Later cells read its 'Split', 'Body', 'Label'
# and 'Mail-ID' columns.
data = pd.read_csv('./data/fraud_detector_data.csv')

## Data Splits

In [4]:
# Take explicit copies of each split: the prediction cells later add a
# 'Prediction' column to these frames, and writing into a filtered slice of
# `data` triggers pandas' SettingWithCopyWarning (hidden here because warnings
# are globally silenced in the import cell).
train_data = data[data.Split == 'Train'].copy()
sanity_data = data[data.Split == 'Sanity'].copy()
gold_fraud_data = data[data.Split == 'Gold Fraud'].copy()

## Load Model

In [5]:
# Hyperparameters of the random forest; they are merged into the wandb run
# config at the end of the notebook.
hyper_params = dict(
    num_labels=2,
    n_estimators=100,
    criterion='gini',
)

model = RandomForestPrivacyModel()

In [11]:
# run = wandb.init(config=hyper_params)



VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112813154856365, max=1.0…

## Run Augmentation

In [12]:

augmentor = Augmentor()

# Augment the label-1 emails only (9 extra samples per original) and shuffle.
train_body, train_labels = augmentor(
    train_data['Body'].tolist(),
    train_data['Label'].tolist(),
    aug_label=1,
    num_aug_per_label_1=9,
    shuffle=True
)

# Rebuild the training frame from the augmented lists. Only 'Body' and 'Label'
# survive this step; every other column of the original split is dropped.
train_data = (
    pd.DataFrame({'Body': train_body, 'Label': train_labels})
    .drop_duplicates(subset=['Body'])
    .reset_index(drop=True)
)


In [13]:
# Cache the augmented training set so the (slow) augmentation step can be
# skipped on later runs. `index` is a boolean flag, so pass False explicitly
# (as the mismatch export further down does) instead of None.
train_data.to_csv('./data/augmented_train_data.csv', index=False)

## Train Model

In [6]:
# Reload the cached augmented training set. The file was written from a frame
# holding only 'Body' and 'Label', so no other column (e.g. 'Mail-ID') is
# available on `train_data` from here on.
train_data = pd.read_csv('./data/augmented_train_data.csv')

In [9]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

import sys
sys.path.append('..')

import matplotlib.pyplot as plt

import shutil
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
import torch
from torch import nn

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel
from transformers import AdamW,get_linear_schedule_with_warmup

from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from mlflow.sklearn import save_model
from scipy.sparse import hstack

from ethics.base import BaseDistilbertModel
from utils.util_modeler import Word2VecEmbedder, TPSampler, get_f1_score

from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager

from diffprivlib.models.forest import RandomForestClassifier


class RandomForestPrivacyModel:
    def __init__(
        self,
        num_labels: int = 2,
        n_estimators = 100,
        criterion = 'gini',
        njobs = -1
    ):
        self.num_labels = num_labels
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.njobs = njobs

        self.vectorizer = Word2VecEmbedder()
        
        self.model = Pipeline([
            ('vectorizer', self.vectorizer),
            ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs))
        ])

    def train(
        self,
        body: pd.Series | list[str],
        label: pd.Series | list[int],
        wandb: wandb = None,
    ):
        """Trains the SVM model.

        Args:
            body (pd.Series | list[str]): The body of the email.
            label (pd.Series | list[int]): The label of the email.

        Raises:
            ValueError: If the body and label are not of the same size.
        """
        if isinstance(body, pd.Series):
            body = body.tolist()
        if isinstance(label, pd.Series):
            label = label.tolist()

        body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)

        # Train the RF model
        epsilons = [1e-8, 1e-2, 1, 7.5, 20]
        accuracies = []

        for eps in epsilons:
            self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=eps)
            # self.model.set_params(classifier__epsilon=eps)
            self.model.fit(body_train, label_train)

            accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')
            print('********* \n Epsilon %.2f - Validation F1 Score %.5f \n *********' % (eps, accuracy))
            accuracies.append(accuracy)
        
        plt.plot(epsilons, accuracies, marker='o')
        plt.xscale('log')  # Use a logarithmic scale for better visibility
        plt.xlabel('Epsilon')
        plt.ylabel('F1 Score')
        plt.title('F1 Score vs Epsilon')
        plt.grid(True)

        plt.savefig("rf_dp_accuracy_vs_epsilon_plot.png")

        # Log the plot to wandb
        wandb.log({"Accuracy vs Epsilon": plt})
        
        print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \n {"="*20}')
        
        #Fit model with best epsilon
        self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=epsilons[np.argmax(accuracies)])
        
        self.model.fit(body, label)

        print(f'{"="*20} Training Done {"="*20}')

    def predict(
        self,
        body: pd.Series | list[str],
    ):
        """Predicts the labels of the given data.

        Args:
            body (pd.Series | list[str]): The body of the email.

        Returns:
            np.array: The predictions of the model.
        """
        if isinstance(body, pd.Series):
            body = body.tolist()

        # Make predictions using the trained SVM model
        predictions = self.model.predict(body)

        if isinstance(predictions, np.ndarray):
            predictions = predictions.tolist()

        return predictions

    def save_model(
        self,
        path: str,
    ):
        """Saves the model to the given path.

        Args:
            path (str): The path to save the model to.
        """

        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        
        save_model(self.model, path)

In [10]:
# Build the model from `hyper_params` so that the configuration logged to
# wandb at the end of the notebook is the one the model was actually built with.
model = RandomForestPrivacyModel(**hyper_params)

In [7]:

# Sweep the candidate epsilons on a validation split, log the F1-vs-epsilon
# plot to the wandb run, then refit on the full training set.
model.train(train_data['Body'], train_data['Label'], wandb = run)


AttributeError: 'Word2VecEmbedder' object has no attribute 'model_name'

In [14]:
# F1 score per evaluated split, filled in by the prediction cells below.
f1_scores = {}

# Root directory for this experiment's outputs; logs go to <save_path>/logs.
save_path='/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/'
os.makedirs(os.path.join(save_path, 'logs'), exist_ok=True)

## Predict on all datasets and generate logs + mismatch_data

In [None]:
train_data['Prediction'] = model.predict(body=train_data['Body'])

# The augmented training frame is rebuilt from 'Body'/'Label' only, so it has
# no 'Mail-ID' column and indexing it raises KeyError. Fall back to the row
# index as the identifier in that case.
# NOTE(review): assumes evaluate_and_log accepts arbitrary ids - confirm.
train_ids = (
    train_data['Mail-ID'].tolist()
    if 'Mail-ID' in train_data.columns
    else train_data.index.tolist()
)

evaluate_and_log(
    x=train_data['Body'].tolist(),
    y_true=train_data['Label'].tolist(),
    y_pred=train_data['Prediction'].tolist(),
    filename=os.path.join(save_path,'logs/train.log'),
    experiment=run,
    id=train_ids,
)
f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())

In [None]:
# Score the sanity split, write its evaluation log and record its F1 score.
sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])

evaluate_and_log(
    x=sanity_data['Body'].tolist(),
    y_true=sanity_data['Label'].tolist(),
    y_pred=sanity_data['Prediction'].tolist(),
    filename=os.path.join(save_path,'logs/sanity.log'),
    experiment=run,
    id=sanity_data['Mail-ID'].tolist(),
)

f1_scores['sanity'] = get_f1_score(
    y_true=sanity_data['Label'].tolist(),
    y_pred=sanity_data['Prediction'].tolist(),
)



In [None]:
# Score the gold-fraud split, write its evaluation log and record its F1 score.
gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])

evaluate_and_log(
    x=gold_fraud_data['Body'].tolist(),
    y_true=gold_fraud_data['Label'].tolist(),
    y_pred=gold_fraud_data['Prediction'].tolist(),
    filename=os.path.join(save_path,'logs/gold_fraud.log'),
    experiment=run,
    id=gold_fraud_data['Mail-ID'].tolist(),
)

f1_scores['gold_fraud'] = get_f1_score(
    y_true=gold_fraud_data['Label'].tolist(),
    y_pred=gold_fraud_data['Prediction'].tolist(),
)



In [None]:
#save mismatch data into a csv file
# Collect every misclassified row from the three evaluated splits, in the
# order train -> sanity -> gold fraud, and export them as one CSV file.
mismatch_frames = [
    frame[frame['Prediction'] != frame['Label']]
    for frame in (train_data, sanity_data, gold_fraud_data)
]
mismatch_data = pd.concat(mismatch_frames, axis=0, ignore_index=True)

mismatch_csv_path = os.path.join(save_path,'logs/mismatch_data.csv')
mismatch_data.to_csv(mismatch_csv_path, index=False)

## Save Logs

In [None]:
# Merge hyperparameters and per-split F1 scores into one mapping (F1 entries
# win on a key clash) and attach it to the run's config.
all_params = hyper_params | f1_scores
run.config.update(all_params)

In [None]:
# Upload the logs directory as an artifact of this run.
logs_path = os.path.join(save_path,'logs')
log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
log_artifact.add_dir(logs_path)
# These logs are produced by this run, so record them as an output with
# log_artifact; use_artifact would declare them as an input of the run.
run.log_artifact(log_artifact)

## Save Model

In [None]:
# Persist the fitted pipeline under <save_path>/model.
model.save_model(os.path.join(save_path,'model'))

In [None]:
# Upload the saved model directory as an artifact of this run.
model_path = os.path.join(save_path, 'model')
model_artifact = wandb.Artifact("fraud-detector-model", type="model")
model_artifact.add_dir(model_path)
# The model is produced by this run, so record it as an output with
# log_artifact; use_artifact would declare it as an input of the run.
run.log_artifact(model_artifact)

In [None]:
# End the wandb run opened at the top of the notebook.
run.finish()