In [41]:
import pandas as pd
import numpy as np


# 1. Data Loading

In [2]:
df = pd.read_csv("clean_transaction_dataset.csv")

# 2. Data Preprocessing

- The dataset is poorly labelled, relabel it

In [11]:
not_senstive_df = df.iloc[0:99]



In [15]:
not_senstive_df['Sensitive'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_senstive_df['Sensitive'] = 1


In [13]:
senstive_df = df.iloc[100:-1]

In [17]:
senstive_df['Sensitive'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senstive_df['Sensitive'] = 0


In [20]:
df = pd.concat([not_senstive_df, senstive_df], axis=0)

In [21]:
df

Unnamed: 0,UserId,ItemDescription,Remarks,Sensitive
0,278166,FAMILY ALBUM WHITE PICTURE FRAME,Not Sensitive,1
1,337701,LONDON BUS COFFEE MUG,Not Sensitive,1
2,267099,SET 12 COLOUR PENCILS DOLLY GIRL,Not Sensitive,1
3,380478,UNION JACK FLAG LUGGAGE TAG,Not Sensitive,1
4,285957,CUT GLASS T-LIGHT HOLDER OCTAGON,Not Sensitive,1
...,...,...,...,...
145,46,CORRUPT GOVERNMENT CONTRACTS,Unknown,0
146,47,PAYMENT FOR HACKED ACCOUNTS,sensitive,0
147,48,UNLAWFUL WEAPON SALES,sensitive,0
148,49,DANGEROUS MATERIALS PROCUREMENT,sensitive,0


In [23]:
df['Sensitive'].value_counts()

Sensitive
1    99
0    50
Name: count, dtype: int64

# 3. Text Feature Engineering

In [24]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="nomic-embed-text:v1.5",
)

In [27]:
len(embeddings.embed_query("I bought a gun"))

768

In [39]:
from tqdm import tqdm

text_embedding_data = []

for idx, row in tqdm(df.iterrows(), desc="Getting text embedding", total=len(df)):
    item_desc = row['ItemDescription']
    text_embedding = embeddings.embed_query(item_desc)
    text_embedding_data.append(text_embedding)


Getting text embedding: 100%|██████████| 149/149 [00:04<00:00, 32.75it/s]


In [43]:
text_embedding_data = np.array(text_embedding_data)

In [None]:
text_embedding_data

# 3. Model Training

In [89]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    text_embedding_data, df['Sensitive'], test_size=0.2, random_state=42)

In [90]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming text_embedding_data is your feature matrix and df['Sensitive'] is the target label

# Train an XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_classifier.fit(X_train, y_train)




Parameters: { "use_label_encoder" } are not used.



In [91]:
# Predict on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Precision: 1.00
Recall: 1.00
F1 Score: 1.00


In [92]:
import joblib


joblib.dump(value=xgb_classifier, filename="xgboost.pkl")

['xgboost.pkl']

# 4. Test the model

In [93]:
import joblib
import xgboost as xgb
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="nomic-embed-text:v1.5",
)

xgb_classifier = joblib.load(filename="xgboost.pkl")

In [106]:
def predict_sensitive(
    text: str
) -> str:
    text_embedding = embeddings.embed_query(text)
    prediction = xgb_classifier.predict([text_embedding])[0]
    probability = xgb_classifier.predict_proba([text_embedding])
    print(probability)
    if prediction == 1:
        print("Sensitive")
    else:
        print("Not Sensitive")

In [107]:
text = "Sending money for your birthday"
predict_sensitive(text)

[[0.66388786 0.3361121 ]]
Not Sensitive


In [108]:
text = "Hope this money is sufficient for you to buy a plastic gun for your school project"
predict_sensitive(text)

[[0.5716244  0.42837557]]
Not Sensitive


In [109]:
text = "Donate money for religion and prayer\’s purpose" 
predict_sensitive(text) 

[[0.53296506 0.4670349 ]]
Not Sensitive


In [110]:
text = "For Family Support" 
predict_sensitive(text)

[[0.2315728 0.7684272]]
Sensitive


In [111]:
text = "GUNS MATERIALS PROCUREMENT" 
predict_sensitive(text)

[[0.9057289  0.09427114]]
Not Sensitive


In [112]:
text = "AK-47 MATERIALS PROCUREMENT" 
predict_sensitive(text)

[[0.9379443  0.06205573]]
Not Sensitive


In [113]:
text = "drug for friend" 
predict_sensitive(text)

[[0.22121793 0.77878207]]
Sensitive


In [114]:
text = "Grenade and gun for friend" 
predict_sensitive(text)

[[0.17214227 0.82785773]]
Sensitive


In [115]:
text = "Nuclear war for my islam !!!!" 
predict_sensitive(text)

[[0.12616998 0.87383   ]]
Sensitive
