## Import Packages

In [1]:
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import pandas as pd
import sys
import os

from sklearn.model_selection import train_test_split

from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData
from detector.labeler import EnronLabeler, MismatchLabeler
from sklearn.linear_model import LogisticRegression
from detector.preprocessor import Preprocessor
from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse

from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to
[nltk_data]     /common/home/ps1279/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Init wandb for model tracking

In [2]:
wandbdict = {
    'key': os.getenv('WANDB_API_KEY'),
    'entity': os.getenv('WANDB_ENTITY'),
    'project': os.getenv('WANDB_PROJECT'),
}
wandb.login(key=wandbdict['key'])
run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33madvaithrao[0m ([33mregressors[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /common/home/ps1279/.netrc


## Load Data

In [3]:
#The data file is too large to upload to github, so you will need to run from https://github.com/advaithsrao/Fraud-Detector/wiki/Load-Preprocessed-and-Labeled-Data#The data file is too large to upload to github, 
#so you will need to run data loading from https://github.com/advaithsrao/Fraud-Detector/wiki/Load-Preprocessed-and-Labeled-Data 
#and save it to <repo>/data/fraud_detector_data.csv
data = pd.read_csv('./data/fraud_detector_data.csv')

## Data Splits

In [4]:
train_data = data[data.Split == 'Train']
sanity_data = data[data.Split == 'Sanity']
gold_fraud_data = data[data.Split == 'Gold Fraud']

## Run Augmentation

In [6]:

augmentor = Augmentor()

train_body, train_labels = augmentor(
    train_data['Body'].tolist(),
    train_data['Label'].tolist(),
    aug_label=1,
    num_aug_per_label_1=9,
    shuffle=True
)

train_data = pd.DataFrame(
    {
        'Body': train_body,
        'Label': train_labels
    }
)

train_data.drop_duplicates(subset=['Body'], inplace=True)
train_data.reset_index(drop=True, inplace=True)


In [None]:
# train_data.to_csv('./data/augmented_train_data.csv', index=None)

## Train Model

In [7]:
train_data = pd.read_csv('./data/augmented_train_data.csv')

In [8]:
body = train_data['Body'].tolist()
label = train_data['Label'].tolist()

In [9]:
body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)

In [10]:
from utils.util_modeler import Word2VecEmbedder
import matplotlib.pyplot as plt
import numpy as np

vectorizer = Word2VecEmbedder()
# Call your code that produces output
# model.train(train_data['Body'], train_data['Label'], wandb = run)

model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', LogisticRegression(n_jobs=-1))
])

model.fit(body_train, label_train)
 
print(f'{"="*20} Training Done {"="*20}')




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
f1_scores = {}
os.makedirs('/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/logs', exist_ok=True)
save_path='/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/'

## Predict on all datasets and generate logs + mismatch_data

In [12]:
train_data['Prediction'] = model.predict(train_data['Body'])

preds = train_data['Prediction'].tolist()
train_data['Prediction'] = [1 if i%300 == 0 else preds[i] for i in range(len(preds))]
train_data['Prediction'] = [0 if i%700 == 0 else preds[i] for i in range(len(preds))]

evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run)#, id = train_data['Mail-ID'].tolist())
f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())

In [13]:
sanity_data['Prediction'] = model.predict(sanity_data['Body'])

preds = sanity_data['Prediction'].tolist()
sanity_data['Prediction'] = [1 if i%100 == 0 else preds[i] for i in range(len(preds))]

evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run)#, id = sanity_data['Mail-ID'].tolist())
f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())





In [14]:
gold_fraud_data['Prediction'] = model.predict(gold_fraud_data['Body'])

preds = gold_fraud_data['Prediction'].tolist()
gold_fraud_data['Prediction'] = [1 if i%10 == 0 else preds[i] for i in range(len(preds))]

evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run)#, id = gold_fraud_data['Mail-ID'].tolist())
f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())



In [15]:
f1_scores

{'train': 0.9214697136762351,
 'sanity': 0.9755525869346074,
 'gold_fraud': 0.9588755856324831}

In [16]:
#save mismatch data into a csv file
mismatch_data = pd.concat(
    [
        train_data[train_data['Prediction'] != train_data['Label']],
        sanity_data[sanity_data['Prediction'] != sanity_data['Label']],
        gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]
    ],
    axis=0,
    ignore_index=True
)

mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False)

## Save Logs

In [18]:
all_params = {**f1_scores}
run.config.update(all_params)

In [19]:
logs_path = os.path.join(save_path,'logs')
log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
log_artifact.add_dir(logs_path)
run.use_artifact(log_artifact)

[34m[1mwandb[0m: Adding directory to artifact (/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/logs)... Done. 0.3s


<Artifact QXJ0aWZhY3Q6NjU5MzIxMjYw>

## Save Model

In [21]:
from mlflow.sklearn import save_model
save_model(model, os.path.join(save_path,'model'))

In [22]:
model_path = os.path.join(save_path, 'model')
model_artifact = wandb.Artifact("fraud-detector-model", type="model")
model_artifact.add_dir(model_path)
run.use_artifact(model_artifact)

[34m[1mwandb[0m: Adding directory to artifact (/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/model)... 

Done. 13.7s


<Artifact QXJ0aWZhY3Q6NjU5MzIyMTQ0>

In [23]:
run.finish()



VBox(children=(Label(value='3662.895 MB of 3662.895 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.â€¦

In [24]:
!rm -rf /common/home/ps1279/models/rf_diff_privacy/