In [37]:
import pandas as pd
import numpy as np


# 1. Data Loading

In [38]:
df = pd.read_csv("clean_transaction_dataset.csv")

# 2. Data Preprocessing

- The dataset is poorly labelled, relabel it

In [39]:
not_senstive_df = df.iloc[0:99]



In [40]:
not_senstive_df['Sensitive'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_senstive_df['Sensitive'] = 0


In [41]:
senstive_df = df.iloc[100:-1]

In [42]:
senstive_df['Sensitive'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senstive_df['Sensitive'] = 1


In [43]:
df = pd.concat([not_senstive_df, senstive_df], axis=0)

In [44]:
df

Unnamed: 0,UserId,ItemDescription,Remarks,Sensitive
0,278166,FAMILY ALBUM WHITE PICTURE FRAME,Not Sensitive,0
1,337701,LONDON BUS COFFEE MUG,Not Sensitive,0
2,267099,SET 12 COLOUR PENCILS DOLLY GIRL,Not Sensitive,0
3,380478,UNION JACK FLAG LUGGAGE TAG,Not Sensitive,0
4,285957,CUT GLASS T-LIGHT HOLDER OCTAGON,Not Sensitive,0
...,...,...,...,...
145,46,CORRUPT GOVERNMENT CONTRACTS,Unknown,1
146,47,PAYMENT FOR HACKED ACCOUNTS,sensitive,1
147,48,UNLAWFUL WEAPON SALES,sensitive,1
148,49,DANGEROUS MATERIALS PROCUREMENT,sensitive,1


In [45]:
df['Sensitive'].value_counts()

Sensitive
0    99
1    50
Name: count, dtype: int64

# 3. Text Feature Engineering

In [46]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="nomic-embed-text:v1.5",
)

In [47]:
len(embeddings.embed_query("I bought a gun"))

768

In [48]:
from tqdm import tqdm

text_embedding_data = []

for idx, row in tqdm(df.iterrows(), desc="Getting text embedding", total=len(df)):
    item_desc = row['ItemDescription']
    text_embedding = embeddings.embed_query(item_desc)
    text_embedding_data.append(text_embedding)


Getting text embedding: 100%|██████████| 149/149 [00:04<00:00, 32.08it/s]


In [49]:
text_embedding_data = np.array(text_embedding_data)

In [50]:
text_embedding_data

array([[ 0.03581212,  0.03731619, -0.15451594, ..., -0.02222542,
        -0.03667499, -0.05549917],
       [-0.09563636,  0.04327139, -0.16282973, ...,  0.00585668,
        -0.02714618, -0.00677296],
       [-0.01679943,  0.0253162 , -0.16049893, ..., -0.02367709,
        -0.05071287, -0.04589411],
       ...,
       [ 0.05693637,  0.00494702, -0.17901994, ..., -0.12047769,
        -0.04080215, -0.03771562],
       [ 0.04129224,  0.02608795, -0.15969811, ..., -0.04406471,
        -0.07248797,  0.01612846],
       [ 0.06898017,  0.03684918, -0.17096555, ..., -0.11579931,
        -0.04120715, -0.01427934]])

# 3. Model Training

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    text_embedding_data, df['Sensitive'], test_size=0.2, random_state=42)

In [52]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Assuming text_embedding_data is your feature matrix and df['Sensitive'] is the target label

# Train an XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_classifier.fit(X_train, y_train)




Parameters: { "use_label_encoder" } are not used.



In [53]:
X_test, y_test

(array([[-2.4573244e-02,  8.5849770e-02, -1.8656318e-01, ...,
         -1.2933937e-03, -2.0004772e-02,  4.3943300e-03],
        [-3.2439672e-03,  1.1661522e-01, -1.9890535e-01, ...,
         -4.7694850e-02, -4.9916226e-02, -4.2003218e-02],
        [ 5.2919553e-05,  4.1715833e-03, -1.6535324e-01, ...,
         -1.8097915e-02, -5.8475714e-02,  1.7441355e-02],
        ...,
        [ 4.3610953e-02,  6.0421700e-02, -1.7036189e-01, ...,
         -4.9047400e-02, -2.9304060e-02,  1.9637216e-02],
        [-4.1467097e-02,  5.9614210e-02, -1.5060940e-01, ...,
         -1.6776258e-02, -4.5237217e-02, -9.7563680e-03],
        [-1.6608026e-02,  7.2956980e-02, -1.6667415e-01, ...,
         -2.2839993e-02, -1.3755956e-03,  2.4094693e-03]]),
 73     0
 18     0
 118    1
 78     0
 76     0
 31     0
 64     0
 141    1
 68     0
 82     0
 110    1
 12     0
 36     0
 9      0
 19     0
 56     0
 137    1
 69     0
 55     0
 132    1
 29     0
 124    1
 26     0
 128    1
 129    1
 145    1
 111 

In [55]:
from sklearn.metrics import classification_report

# Predict on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Generate a classification report
classification_rep = classification_report(y_test, y_pred)

# Print evaluation results
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print("\nClassification Report:\n")
print(classification_rep)


Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [56]:
import joblib


joblib.dump(value=xgb_classifier, filename="xgboost.pkl")

['xgboost.pkl']

# 4. Test the model

In [57]:
import joblib
import xgboost as xgb
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="nomic-embed-text:v1.5",
)

xgb_classifier = joblib.load(filename="xgboost.pkl")

In [58]:
def predict_sensitive(
    text: str
) -> None:
    text_embedding = embeddings.embed_query(text)
    prediction = xgb_classifier.predict([text_embedding])[0]
    probability = xgb_classifier.predict_proba([text_embedding])
    print(probability)
    if prediction == 1:
        print("Sensitive")
    else:
        print("Not Sensitive")

In [59]:
text = "Sending money for your birthday"
predict_sensitive(text)

[[0.33611214 0.66388786]]
Sensitive


In [60]:
text = "Hope this money is sufficient for you to buy a plastic gun for your school project"
predict_sensitive(text)

[[0.42837566 0.57162434]]
Sensitive


In [61]:
text = "Donate money for religion and prayer\’s purpose" 
predict_sensitive(text) 

[[0.46703506 0.53296494]]
Sensitive


In [62]:
text = "For Family Support" 
predict_sensitive(text)

[[0.7684273 0.2315727]]
Not Sensitive


In [63]:
text = "GUNS MATERIALS PROCUREMENT" 
predict_sensitive(text)

[[0.09427112 0.9057289 ]]
Sensitive


In [64]:
text = "AK-47 MATERIALS PROCUREMENT" 
predict_sensitive(text)

[[0.06205571 0.9379443 ]]
Sensitive


In [65]:
text = "drug for friend" 
predict_sensitive(text)

[[0.7787821  0.22121784]]
Not Sensitive


In [66]:
text = "Grenade and gun for friend" 
predict_sensitive(text)

[[0.8278579 0.1721421]]
Not Sensitive


In [67]:
text = "Nuclear war for my islam !!!!" 
predict_sensitive(text)

[[0.8738301  0.12616992]]
Not Sensitive
