## Import Packages

In [28]:
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import pandas as pd
import sys
import os

from sklearn.model_selection import train_test_split

from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData
from detector.labeler import EnronLabeler, MismatchLabeler
from sklearn.naive_bayes import GaussianNB
from detector.preprocessor import Preprocessor
from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse

from sklearn.pipeline import Pipeline


## Init wandb for model tracking

In [29]:
# Weights & Biases settings are read from the environment so that no
# credentials are stored in the notebook. os.getenv returns None for any
# variable that is not set.
wandbdict = dict(
    key=os.getenv('WANDB_API_KEY'),
    entity=os.getenv('WANDB_ENTITY'),
    project=os.getenv('WANDB_PROJECT'),
)

# Authenticate first, then open the run that the rest of the notebook logs to.
wandb.login(key=wandbdict['key'])
run = wandb.init(
    project=wandbdict['project'],
    entity=wandbdict['entity'],
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /common/home/ps1279/.netrc


## Load Data

In [30]:
# The data file is too large to upload to GitHub, so you will need to run the
# data-loading steps described at
# https://github.com/advaithsrao/Fraud-Detector/wiki/Load-Preprocessed-and-Labeled-Data
# and save the result to <repo>/data/fraud_detector_data.csv
#
# The following cells read the `Split`, `Body` and `Label` columns of this frame.
data = pd.read_csv('./data/fraud_detector_data.csv')

## Data Splits

In [31]:
# Partition the loaded frame by the value of its `Split` column.
#
# train_data is only read before being rebuilt from the augmented samples, so
# a plain boolean-mask selection is enough for it.
train_data = data[data.Split == 'Train']

# sanity_data and gold_fraud_data later receive a new `Prediction` column.
# Take explicit copies so that assignment targets an independent frame rather
# than a slice of `data` (the pattern pandas flags with SettingWithCopyWarning).
sanity_data = data[data.Split == 'Sanity'].copy()
gold_fraud_data = data[data.Split == 'Gold Fraud'].copy()

## Run Augmentation

In [6]:

# Augment the training texts.
# NOTE(review): presumably `aug_label=1` restricts augmentation to label-1
# samples and `num_aug_per_label_1=9` is the number of variants generated per
# such sample -- confirm against utils.util_modeler.Augmentor.
augmentor = Augmentor()
train_body, train_labels = augmentor(
    train_data['Body'].tolist(),
    train_data['Label'].tolist(),
    aug_label=1,
    num_aug_per_label_1=9,
    shuffle=True,
)

# Rebuild train_data from the augmented samples. Only `Body` and `Label`
# survive this step; any other column of the original frame is dropped.
# Rows with a repeated Body are removed and the index is renumbered from 0.
train_data = (
    pd.DataFrame({'Body': train_body, 'Label': train_labels})
    .drop_duplicates(subset=['Body'])
    .reset_index(drop=True)
)


In [None]:
# train_data.to_csv('./data/augmented_train_data.csv', index=None)

## Train Model

In [34]:
# train_data = pd.read_csv('./data/augmented_train_data.csv')

In [35]:
# Pull the text and target columns out of train_data as plain Python lists.
body, label = (train_data[column].tolist() for column in ('Body', 'Label'))

In [36]:
# Hold out 20% of the samples, stratified on the label, with a fixed seed so
# the split is reproducible.
# NOTE(review): body_val / label_val are not used by any later cell visible
# in this notebook.
body_train, body_val, label_train, label_val = train_test_split(
    body,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)

In [37]:
from utils.util_modeler import Word2VecEmbedder
import matplotlib.pyplot as plt
import numpy as np

# Two-step pipeline: the Word2VecEmbedder step turns the raw texts into
# features, which are then classified by Gaussian Naive Bayes.
vectorizer = Word2VecEmbedder()
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('classifier', GaussianNB()),
]
model = Pipeline(pipeline_steps)

# Fit on the training portion of the split only.
model.fit(body_train, label_train)

print(f'{"="*20} Training Done {"="*20}')




In [39]:
# F1 score per evaluated split, filled in by the prediction cells below.
f1_scores = {}

# Root directory for everything this experiment writes (logs and model).
save_path = '/common/home/ps1279/models/rf_diff_privacy/2023-12-05/nb_diff_privacy/'

# Create the logs sub-directory (and any missing parents) under save_path.
os.makedirs(os.path.join(save_path, 'logs'), exist_ok=True)

## Predict on all datasets and generate logs + mismatch_data

In [40]:
# Score every row of train_data. This frame holds all augmented samples,
# including the 20% that train_test_split held out as body_val.
train_data['Prediction'] = model.predict(train_data['Body'])

# Write the evaluation log for this split and attach it to the wandb run.
# The `id=...['Mail-ID']` argument is left out here.
train_log_file = os.path.join(save_path, 'logs/train.log')
evaluate_and_log(
    x=train_data['Body'].tolist(),
    y_true=train_data['Label'].tolist(),
    y_pred=train_data['Prediction'].tolist(),
    filename=train_log_file,
    experiment=run,
)

f1_scores['train'] = get_f1_score(
    y_true=train_data['Label'].tolist(),
    y_pred=train_data['Prediction'].tolist(),
)



In [41]:
# Score the rows whose Split is 'Sanity'.
sanity_data['Prediction'] = model.predict(sanity_data['Body'])

# Write the evaluation log for this split and attach it to the wandb run.
# The `id=...['Mail-ID']` argument is left out here.
sanity_log_file = os.path.join(save_path, 'logs/sanity.log')
evaluate_and_log(
    x=sanity_data['Body'].tolist(),
    y_true=sanity_data['Label'].tolist(),
    y_pred=sanity_data['Prediction'].tolist(),
    filename=sanity_log_file,
    experiment=run,
)

f1_scores['sanity'] = get_f1_score(
    y_true=sanity_data['Label'].tolist(),
    y_pred=sanity_data['Prediction'].tolist(),
)





In [42]:
# Score the rows whose Split is 'Gold Fraud'.
gold_fraud_data['Prediction'] = model.predict(gold_fraud_data['Body'])

# Write the evaluation log for this split and attach it to the wandb run.
# The `id=...['Mail-ID']` argument is left out here.
gold_fraud_log_file = os.path.join(save_path, 'logs/gold_fraud.log')
evaluate_and_log(
    x=gold_fraud_data['Body'].tolist(),
    y_true=gold_fraud_data['Label'].tolist(),
    y_pred=gold_fraud_data['Prediction'].tolist(),
    filename=gold_fraud_log_file,
    experiment=run,
)

f1_scores['gold_fraud'] = get_f1_score(
    y_true=gold_fraud_data['Label'].tolist(),
    y_pred=gold_fraud_data['Prediction'].tolist(),
)



In [43]:
# Display the F1 score recorded for each evaluated split.
f1_scores

{'train': 0.6698002526272635,
 'sanity': 0.830756710871858,
 'gold_fraud': 0.9029072956664838}

In [44]:
# Gather every row whose prediction disagrees with its label, across the
# train, sanity and gold-fraud frames, and save them as one CSV in the logs
# directory.
mismatch_frames = [
    frame[frame['Prediction'] != frame['Label']]
    for frame in (train_data, sanity_data, gold_fraud_data)
]
mismatch_data = pd.concat(mismatch_frames, axis=0, ignore_index=True)

mismatch_csv = os.path.join(save_path, 'logs/mismatch_data.csv')
mismatch_data.to_csv(mismatch_csv, index=False)

## Save Logs

In [45]:
# Record the per-split F1 scores in the wandb run config. A shallow copy is
# passed so the f1_scores dict itself is not handed to wandb.
all_params = dict(f1_scores)
run.config.update(all_params)

In [46]:
# Package the whole logs directory as a wandb artifact.
logs_path = os.path.join(save_path, 'logs')
log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
log_artifact.add_dir(logs_path)

# These logs are produced by this run, so record the artifact as an OUTPUT
# with log_artifact. use_artifact would declare it as an input to the run.
run.log_artifact(log_artifact)

[34m[1mwandb[0m: Adding directory to artifact (/common/home/ps1279/models/rf_diff_privacy/2023-12-05/nb_diff_privacy/logs)... Done. 1.8s


<Artifact QXJ0aWZhY3Q6NjU5NTcxNDA2>

## Save Model

In [53]:
from mlflow.sklearn import save_model

# Serialise the fitted pipeline into the `model` sub-directory of save_path.
save_model(sk_model=model, path=os.path.join(save_path, 'model'))

In [54]:
# Package the saved model directory as a wandb artifact.
model_path = os.path.join(save_path, 'model')
model_artifact = wandb.Artifact("fraud-detector-model", type="model")
model_artifact.add_dir(model_path)

# The model is produced by this run, so record the artifact as an OUTPUT
# with log_artifact. use_artifact would declare it as an input to the run.
run.log_artifact(model_artifact)

In [None]:
# Delete the local experiment directory (the same path as save_path, which
# holds the logs and the saved model).
# NOTE(review): this cell sits before run.finish(); confirm the artifact
# uploads have completed before deleting the files -- TODO confirm.
!rm -rf /common/home/ps1279/models/rf_diff_privacy/2023-12-05/nb_diff_privacy/

In [55]:
# Close the wandb run opened by wandb.init at the top of the notebook.
run.finish()

VBox(children=(Label(value='848.373 MB of 848.373 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))