In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/model-financial-fraud-subcategories/model_3.csv")

In [2]:
df.columns

Index(['Target Label', 'crimeaditionalinfo'], dtype='object')

In [3]:
subcategory_counts = df.groupby("Target Label").size()
print(subcategory_counts)

Target Label
Business Email CompromiseEmail Takeover      267
DebitCredit Card FraudSim Swap Fraud        7871
DematDepository Fraud                        665
EWallet Related Fraud                       3647
Fraud CallVishing                           5447
Internet Banking Related Fraud              6457
UPI Related Frauds                         16941
dtype: int64


In [4]:
!pip install imbalanced-learn




In [6]:
import pandas as pd

# Load the dataset from CSV (update the filename accordingly)
 

# Ensure text column is string
train_texts = df["crimeaditionalinfo"].astype(str).tolist()  # Text data

# Convert labels to numeric format
train_labels = df["Target Label"].astype("category").cat.codes.tolist()

In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


# ✅ Step 2: Convert Text Data to TF-IDF Features
vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocab size to 5000
X = vectorizer.fit_transform(df["crimeaditionalinfo"].astype(str))  # Convert text to numerical format
y = df["Target Label"].astype("category").cat.codes  # Convert categorical labels to numeric

# ✅ Step 3: Apply SMOTE to Balance the Dataset
smote = SMOTE(sampling_strategy="auto", random_state=42)  # Auto balances all classes
X_resampled, y_resampled = smote.fit_resample(X, y)

# ✅ Step 4: Split into Train-Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print("SMOTE applied successfully! New class distribution:")
print(pd.Series(y_resampled).value_counts())  # Show new balanced class counts


SMOTE applied successfully! New class distribution:
4    16941
6    16941
1    16941
5    16941
3    16941
0    16941
2    16941
Name: count, dtype: int64


In [10]:
df.head

<bound method NDFrame.head of                                Target Label  \
0                         Fraud CallVishing   
1                         Fraud CallVishing   
2                        UPI Related Frauds   
3                         Fraud CallVishing   
4      DebitCredit Card FraudSim Swap Fraud   
...                                     ...   
41290  DebitCredit Card FraudSim Swap Fraud   
41291        Internet Banking Related Fraud   
41292                 EWallet Related Fraud   
41293                    UPI Related Frauds   
41294                     Fraud CallVishing   

                                      crimeaditionalinfo  
0      the above fraudster is continuously messaging ...  
1      i received a call from lady stating that she w...  
2      fraud upi paytm bank punjab national bank incl...  
3      sir i am prabhat singh jat an app on playstore...  
4      fraud through debitcredit card fraudsim swap f...  
...                                                

In [14]:
df.to_csv("smote-data.csv", index = False)

In [21]:
ewallet_fraud_df = df[df["Target Label"] == "EWallet Related Fraud"]

# ✅ Display the first few rows
ewallet_fraud_df.head(100)

Unnamed: 0,Target Label,crimeaditionalinfo
12,EWallet Related Fraud,hello please help me and more like me who are suffering financial frauds from these kind of applications i only downloaded the app without my consent they gave me loan of and credited amount in my account for only days they are asking for payment of and even now i had done my payment still no update is done in application still showing for payment and the customer who was earlier taalking to me on whatsapp now has switched off her phone and they have accessed all my contacts and are threatning me to call them i am too much frustated please help this is the last hope from you all and i tried everywhere but i did not get any response from any source since the payment is not showing in qpplicatun they will ask for bounce charge and threaten me to callplease look into the matter not only me is trapped in this there could be number of poor persons like me so at last please help
17,EWallet Related Fraud,citizen details address name bhagchand saini mobile no dist dausa ps sikandra email id bhagchandmaligmailcom pincode bank state bank of india ac no branch bhandarange citizen ke sath total fraud amount rs date jan fraud by phone pay no transction id t utr amount rs time pm frauder detals suchna karta ne bataya ki unke pass unhone no se call aaya or jankar bankar ki baat ki or paise transfer karva liye
35,EWallet Related Fraud,i had raised query to the flipkart and they had told me to figure this out but now they are taking more than expected time and also since they paid the money through flipkart pay later now i am recieving the payment reminder worth rs of which i have not purchased anything
64,EWallet Related Fraud,district jaipur ps jhotwara add phul nager name puspendra singh shekhawat pincode furad by phone pe transaction id t utr suchnkarta ke phone pe no suchnkarta ke ac no bank of baroda frauder ke phone no date time pm pssgmailcom total fraud amount
73,EWallet Related Fraud,the caller mobile number and before and after the transaction was donehe understood i had blocked an reported his number the app used was sms forwarding appdidnt realise it could read my messagesbefore i understood he had engaged me on callmultiple otps came before i could cut or block the sms i rcvd the transaction of on accoun tof rent was made transaction sms and app number
85,EWallet Related Fraud,the amount has been debitted from zest money and amazon pay gift cards has been generated fraudulently gift card code gift card pin aqtkz gift card amt gift card code gift card pin tvutvjtys gift card amt sir i am a student please help me
103,EWallet Related Fraud,dear sir please refer the above information these we have and with these my belowed team members also loosed much money so please take action on this and also take necessary action on unauthorised these type frod actives it may not be happen for any others in feature also please support to release the lossed money thanks regards thota sreekanth
104,EWallet Related Fraud,name subesingh mno paytm no bank ac no sbi ifsc sbin add gav dabri tes bhadra dist hanumangarh pincode ps bsirani fraud by paytm a reference no date time pm b reference no date time pm total fraud fraud no suchna karta ne btaya hai ki jankaar ban kar fraud kiya gaya
109,EWallet Related Fraud,i received a call saying they are from lic office and they have a maturity maount of rs from my father and they want to send it to me as policy had my name they asked me to open phonepe and accept the mony request for rs after i accepted money request i received in my account they showed it to me again and rd time i got money request for rs rd time that money was deducted from my account and i never got back my money
110,EWallet Related Fraud,my investment was at phonepe on th dec the total amount was rs and i withdrew rs on th dec the next day i got the final amount so my rs was got fraud kindly help me regards sameer


In [23]:
ewallet_fraud_df.to_csv("ewallet-fraud.csv", index =False)

In [11]:
df.head(41290)

Unnamed: 0,Target Label,crimeaditionalinfo
0,Fraud CallVishing,the above fraudster is continuously messaging ...
1,Fraud CallVishing,i received a call from lady stating that she w...
2,UPI Related Frauds,fraud upi paytm bank punjab national bank incl...
3,Fraud CallVishing,sir i am prabhat singh jat an app on playstore...
4,DebitCredit Card FraudSim Swap Fraud,fraud through debitcredit card fraudsim swap f...
...,...,...
41285,Internet Banking Related Fraud,fraud through internet banking bank punjab nat...
41286,UPI Related Frauds,we heard about gaffar tent house from justdial...
41287,EWallet Related Fraud,name tara chand email id tarachandkarwagmailco...
41288,UPI Related Frauds,i went to sell my furniture over olx some one ...


In [9]:

inverted_texts = vectorizer.inverse_transform(X_resampled)

# Show first 5 resampled vectors as text
for i, words in enumerate(inverted_texts[:5]):
    print(f"Text {i+1}: {' '.join(words)}")

Text 1: help please phone from list recieved has how unaware am whatsapp through contacts other my of photos nude fake send will he or money him pay to asking and me messaging continuously is fraudster above the
Text 2: mobile any not have where office head february th on post parcel new she that stating lady call received phone from through of send will to and
Text 3: action necessary take complaint in reverse hold amount total number account india united commerce oriental including national punjab bank paytm upi fraud please of and the
Text 4: commit closed if able suicide do forced defaming strict harrasing mentally abusing relatives calling are repayment contact copied they this loan for applied never been name which playstore app an jat singh sir action take mobile not have on please phone from list am contacts my of to asking and me continuously is
Text 5: no payments swap fraudsim card debitcredit action necessary take complaint in reverse hold amount total account bank fraud pl

In [14]:
X_resampled = X_resampled.toarray() if hasattr(X_resampled, "toarray") else X_resampled

# ---- Convert Back to DataFrame ----
df_resampled = pd.DataFrame({'crimeaditionalinfo': X_resampled.flatten(), 'category_encoded': y_resampled})

# ---- Decode Labels ----
df_resampled['category'] = label_encoder.inverse_transform(df_resampled['category_encoded'])

# ---- Save Oversampled Data ----
df_resampled.to_csv('smote_data.csv', index=False)

# ---- Check Class Distribution After SMOTE ----
print("\nClass Distribution After SMOTE:")
print(df_resampled['category'].value_counts())

ValueError: array length 592935000 does not match index length 118587

In [62]:
!pip install transformers datasets torch scikit-learn accelerate




In [2]:
import pandas as pd
train_df = pd.read_csv("/kaggle/input/cleaned-data-csv/cleaned_data.csv")

In [1]:
 ✅ Convert Vectors Back to Text Representation
inverted_texts = vectorizer.inverse_transform(X_resampled)

# Show first 5 resampled vectors as text
for i, words in enumerate(inverted_texts[:5]):
    print(f"Text {i+1}: {' '.join(words)}")

SyntaxError: invalid character '✅' (U+2705) (<ipython-input-1-b78c4c71f54b>, line 1)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
import numpy as np
from typing import List
import torch.optim as optim


# ---- Basic Tokenizer ----
class CustomTokenizer:
    def __init__(self, texts, vocab_size=5000):
        self.vocab_size = vocab_size
        self.word_counts = Counter()
        self.build_vocab(texts)
    
    def build_vocab(self, texts):
        all_tokens = []
        for text in texts:
            tokens = self.tokenize(text)
            all_tokens.extend(tokens)
        self.word_counts = Counter(all_tokens)
        
        # Reserve special tokens: [PAD], [UNK]
        most_common = self.word_counts.most_common(self.vocab_size - 2)
        self.word2idx = {"[PAD]": 0, "[UNK]": 1}
        self.word2idx.update({word: i + 2 for i, (word, _) in enumerate(most_common)})
        self.idx2word = {i: word for word, i in self.word2idx.items()}
    
    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())  # Simple word tokenization
    
    def encode(self, text, max_length=512):
        tokens = self.tokenize(text)
        token_ids = [self.word2idx.get(token, 1) for token in tokens]  # 1 = [UNK]
        token_ids = token_ids[:max_length] + [0] * (max_length - len(token_ids))  # Pad to max_length
        return token_ids
    
    def decode(self, token_ids):
        return " ".join([self.idx2word.get(idx, "[UNK]") for idx in token_ids])

# ---- Transformer Components (Same as Before) ----
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, num_heads: int):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def forward(self, Q, K, V, mask=None):
        attention = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_k)
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e9)
        attention = F.softmax(attention, dim=-1)
        return self.W_o(torch.matmul(attention, V))

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        x = self.norm1(x + self.dropout(self.self_attn(x, x, x, mask)))
        return self.norm2(x + self.dropout(self.feed_forward(x)))

class CyberCrimeBERT(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, num_heads: int = 8, 
                 num_layers: int = 6, d_ff: int = 512, max_seq_length: int = 512,
                 num_categories: int = 10):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Parameter(torch.zeros(1, max_seq_length, d_model))
        self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.classifier = nn.Sequential(nn.Linear(d_model, d_model), nn.Tanh(), nn.Linear(d_model, num_categories))
        
    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        x = self.embedding(input_ids) + self.position_embedding[:, :seq_length, :]
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)
        return self.classifier(x.mean(dim=1))

# ---- Custom Dataset ----
class CyberCrimeDataset(Dataset):
    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int = 512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.tokenizer.encode(self.texts[idx], self.max_length))
        label = torch.tensor(self.labels[idx])
        return {'input_ids': input_ids, 'label': label}

# ---- Training Function ----
def train_model(model, train_loader, optimizer, device, epochs=5):
    model.to(device)
    model.train()
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)  
            labels = batch['label'].to(device)  
            
            optimizer.zero_grad()
            outputs = model(input_ids)  
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

        print(f"Epoch {epoch+1} Completed, Average Loss: {total_loss/len(train_loader):.4f}")

# ---- Load Dataset and Train Model ----
def main():
    # Load dataset (Replace with actual file path)
    df = pd.read_csv('/kaggle/input/cleaned-data-csv/cleaned_data.csv')

    texts = df['crimeaditionalinfo'].astype(str).tolist()  # Ensure all are strings
    labels = df['category'].tolist()

# Map category labels to integers
    label_mapping = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    num_categories = len(label_mapping)  

# Convert labels to integer indices
    labels = [label_mapping[label] for label in labels]

# Debugging: Print labels and check the range
    print(f"Label mapping: {label_mapping}")
    print(f"Unique label indices: {set(labels)}")

    tokenizer = CustomTokenizer(texts)
# Ensure labels are in the valid range
    assert all(0 <= lbl < num_categories for lbl in labels), "Label out of range!"
    dataset = CyberCrimeDataset(texts, labels, tokenizer)
    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CyberCrimeBERT(vocab_size=len(tokenizer.word2idx), num_categories=num_categories).to(device)

    # Print GPU info
    if torch.cuda.is_available():
        print("Using:", torch.cuda.get_device_name(0))
        print("CUDA Version:", torch.version.cuda)
    else:
        print("No GPU found. Running on CPU.")

    # Train model
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    train_model(model, train_loader, optimizer, device, epochs=5)

if __name__ == "__main__":
    main()

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
import numpy as np
from typing import List
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ---- WordPiece Tokenizer ----
class WordPieceTokenizer:
    def __init__(self, texts, vocab_size=5000):
        self.vocab_size = vocab_size
        self.special_tokens = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4}
        self.word2idx, self.idx2word = self.build_vocab(texts)
    
    def build_vocab(self, texts):
        word_freq = Counter()
        for text in texts:
            words = re.findall(r'\b\w+\b', text.lower())
            word_freq.update(words)
        
        subwords = {word: freq for word, freq in word_freq.items()}
        while len(subwords) < self.vocab_size - len(self.special_tokens):
            pairs = Counter()
            for word, freq in subwords.items():
                tokens = list(word)
                for i in range(len(tokens) - 1):
                    pairs[(tokens[i], tokens[i + 1])] += freq
            
            if not pairs:
                break
            best_pair = max(pairs, key=pairs.get)
            merged_token = best_pair[0] + best_pair[1]
            subwords[merged_token] = pairs[best_pair]
            
        word2idx = {**self.special_tokens, **{word: i + len(self.special_tokens) for i, word in enumerate(subwords.keys())}}
        idx2word = {i: word for word, i in word2idx.items()}
        return word2idx, idx2word
    
    def tokenize(self, text):
        words = re.findall(r'\b\w+\b', text.lower())
        tokens = []
        for word in words:
            if word in self.word2idx:
                tokens.append(word)
            else:
                sub_tokens = self.split_into_subwords(word)
                tokens.extend(sub_tokens)
        return tokens
    
    def split_into_subwords(self, word):
        subwords = []
        for i in range(len(word)):
            subword = word[i:]
            if subword in self.word2idx:
                subwords.append(subword)
        return subwords if subwords else ["[UNK]"]
    
    def encode(self, text, max_length=512):
        tokens = self.tokenize(text)
        token_ids = [self.word2idx.get(token, 1) for token in tokens]
        token_ids = token_ids[:max_length] + [0] * (max_length - len(token_ids))
        return token_ids
    
    def decode(self, token_ids):
        return " ".join([self.idx2word.get(idx, "[UNK]") for idx in token_ids])

# ---- Custom Dataset ----
class CyberCrimeDataset(Dataset):
    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int = 512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.tokenizer.encode(self.texts[idx], self.max_length))
        label = torch.tensor(self.labels[idx])
        return {'input_ids': input_ids, 'label': label}

# ---- Model Definition ----
class CyberCrimeBERT(nn.Module):
    def __init__(self, vocab_size, num_categories, hidden_dim=128):
        super(CyberCrimeBERT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_categories)
    
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        _, (h_n, _) = self.lstm(x)
        logits = self.fc(h_n[-1])
        return logits

# ---- Training Function ----
def train_model(model, train_loader, optimizer, device, epochs=5):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# ---- Evaluation Metrics ----
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    return accuracy, precision, recall, f1

# ---- Load Dataset and Train Model ----
def main():
    df = pd.read_csv('/kaggle/input/cleaned-data-csv/cleaned_data.csv')
    texts = df['crimeaditionalinfo'].astype(str).tolist()
    labels = df['category'].tolist()
    label_mapping = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    num_categories = len(label_mapping)
    labels = [label_mapping[label] for label in labels]
    tokenizer = WordPieceTokenizer(texts)
    dataset = CyberCrimeDataset(texts, labels, tokenizer)
    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CyberCrimeBERT(vocab_size=len(tokenizer.word2idx), num_categories=num_categories).to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    train_model(model, train_loader, optimizer, device, epochs=5)
    evaluate_model(model, train_loader, device)

if __name__ == "__main__":
    main()


Epoch 1/5, Loss: 1.3802
Epoch 2/5, Loss: 1.3727
Epoch 3/5, Loss: 1.3726
Epoch 4/5, Loss: 1.3725
Epoch 5/5, Loss: 1.3727
Accuracy: 0.5916
Precision: 0.3500, Recall: 0.5916, F1-score: 0.4398


  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
print("Before SMOTE:", pd.Series(y).value_counts())
print("After SMOTE:", pd.Series(y_resampled).value_counts())
print("Training labels:", pd.Series(y_train).value_counts())


Before SMOTE: 6    16941
1     7871
5     6457
4     5447
3     3647
2      665
0      267
Name: count, dtype: int64
After SMOTE: 4    16941
6    16941
1    16941
5    16941
3    16941
0    16941
2    16941
Name: count, dtype: int64
Training labels: 4    13663
2    13591
5    13571
1    13547
6    13511
3    13509
0    13477
Name: count, dtype: int64


In [53]:
torch.save(model.state_dict(), "cybercrime_model.pth")


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.optim as optim

# ---- Custom Dataset ----
class CyberCrimeDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features.toarray(), dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {'input_ids': self.features[idx], 'label': self.labels[idx]}

# ---- Model Definition ----
class CyberCrimeClassifier(nn.Module):
    def __init__(self, input_dim, num_categories, hidden_dim=128):
        super(CyberCrimeClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_categories)
    
    def forward(self, input_ids):
        x = F.relu(self.fc1(input_ids))
        logits = self.fc2(x)
        return logits

# ---- Training Function ----
def train_model(model, train_loader, optimizer, device, epochs=5):
    model.train()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# ---- Evaluation Metrics ----
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    return accuracy, precision, recall, f1

# ---- Load Dataset, Apply TF-IDF & SMOTE, and Train Model ----
def main():
    df = pd.read_csv('/kaggle/input/model-financial-fraud-subcategories/model_3.csv')
    
    # ✅ Convert Text Data to TF-IDF Features
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df["crimeaditionalinfo"].astype(str))
    y = df["Target Label"].astype("category").cat.codes

    # ✅ Apply SMOTE to Balance the Dataset
    smote = SMOTE(sampling_strategy="auto", random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # ✅ Split into Train-Test Sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    
    print("SMOTE applied successfully! New class distribution:")
    print(pd.Series(y_resampled).value_counts())

    # ✅ Prepare DataLoader
    train_dataset = CyberCrimeDataset(X_train, y_train.to_numpy())
    test_dataset = CyberCrimeDataset(X_test, y_test.to_numpy())

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # ✅ Initialize Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CyberCrimeClassifier(input_dim=X.shape[1], num_categories=len(set(y_resampled))).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=0.001)

    # ✅ Train & Evaluate
    train_model(model, train_loader, optimizer, device, epochs=7)
    evaluate_model(model, test_loader, device)

if __name__ == "__main__":
    main()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = CyberCrimeClassifier(input_dim=X.shape[1], num_categories=len(set(y_resampled))).to(device)
    model.load_state_dict(torch.load("cybercrime_model.pth", map_location=device))
    model.eval()


def predict_example(model, vectorizer, text, device):
        model.eval()
        with torch.no_grad():
        # Convert text to TF-IDF features
            input_features = vectorizer.transform([text])
            input_tensor = torch.tensor(input_features.toarray(), dtype=torch.float32).to(device)
        
        # Forward pass
            output = model(input_tensor)
            predicted_label = torch.argmax(output, dim=1).cpu().item()
          
            print(f"Predicted Category: {predicted_label}")

# ✅ Check if the model is trained
if "model" in globals():
    test_text = """toh pata chala ki mera SIM swap ho chuka hai. Yeh fraud hai, aur mujhe turant madad chahiye!"""

    predict_example(model, vectorizer, test_text, device)
else:
    print("❌ Model is not defined! Train the model first.")

In [54]:
import torch  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CyberCrimeClassifier(input_dim=X.shape[1], num_categories=len(set(y_resampled))).to(device)
model.load_state_dict(torch.load("cybercrime_model.pth", map_location=device))
model.eval()


def predict_example(model, vectorizer, text, device):
        model.eval()
        with torch.no_grad():
        # Convert text to TF-IDF features
            input_features = vectorizer.transform([text])
            input_tensor = torch.tensor(input_features.toarray(), dtype=torch.float32).to(device)
        
        # Forward pass
            output = model(input_tensor)
            predicted_label = torch.argmax(output, dim=1).cpu().item()
          
            print(f"Predicted Category: {predicted_label}")

# ✅ Check if the model is trained
if "model" in globals():
    test_text = """Mujhe apne Demat account se related ek badi samasya ka samna karna pad raha hai. Maine dekha ki mere shares bina meri ijazat ke bech diye gaye hain."""

    predict_example(model, vectorizer, test_text, device)
else:
    print("❌ Model is not defined! Train the model first.")

Predicted Category: 6


  model.load_state_dict(torch.load("cybercrime_model.pth", map_location=device))


In [38]:
df.columns

Index(['Target Label', 'crimeaditionalinfo'], dtype='object')

In [32]:
df["Target Label"].value_counts()


Target Label
UPI Related Frauds                         16941
DebitCredit Card FraudSim Swap Fraud        7871
Internet Banking Related Fraud              6457
Fraud CallVishing                           5447
EWallet Related Fraud                       3647
DematDepository Fraud                        665
Business Email CompromiseEmail Takeover      267
Name: count, dtype: int64

In [33]:
category_mapping = dict(enumerate(df["Target Label"].astype("category").cat.categories))
print(category_mapping)


{0: 'Business Email CompromiseEmail Takeover', 1: 'DebitCredit Card FraudSim Swap Fraud', 2: 'DematDepository Fraud', 3: 'EWallet Related Fraud', 4: 'Fraud CallVishing', 5: 'Internet Banking Related Fraud', 6: 'UPI Related Frauds'}


In [2]:
import numpy as np

In [3]:
with open("/kaggle/input/pa-txt/pa", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs


ValueError: could not convert string to float: 'ਦੇ'