Problem Statement : Development of an NLP Model for Text Analytics and
Classification
• Objective:
o To develop an NLP model that categorizes complaint based on victim, type 
of fraud and other relevant parameters used for text classification and 
preparing the final model

In [1]:
#Download the required libraries
import nltk
from nltk.probability import FreqDist
import pandas as pd
from collections import defaultdict, Counter
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
train_fraud=pd.read_csv("train.csv")
test_fraud=pd.read_csv("test.csv")

In [2]:
#Convert additionalinformation columns each word small.
train_fraud['crimeaditionalinfo']=train_fraud['crimeaditionalinfo'].str.lower()

In [3]:
#Drop the nan or missing words
train_fraud= train_fraud.dropna()

In [4]:
# Apply the function to each row in the 'crimeaditionalinfo' column
#split it into words or tokens
train_fraud['word_frequencies'] = train_fraud['crimeaditionalinfo'].dropna().apply((lambda x: x.split()))

print(train_fraud[['crimeaditionalinfo', 'word_frequencies']])

                                      crimeaditionalinfo  \
0      i had continue received random calls and abusi...   
1      the above fraudster is continuously messaging ...   
2      he is acting like a police and demanding for m...   
3      in apna job i have applied for job interview f...   
4      i received a call from lady stating that she w...   
...                                                  ...   
93681  identity theft   smishing sms fraud  creditdeb...   
93682  received call from  number asking about phone ...   
93683  cyber stalking   blackmailing   phonesmsvoip c...   
93684  call karke bola ki aapka lotary laga ha aru ac...   
93685  there is app name koko loan app they send the ...   

                                        word_frequencies  
0      [i, had, continue, received, random, calls, an...  
1      [the, above, fraudster, is, continuously, mess...  
2      [he, is, acting, like, a, police, and, demandi...  
3      [in, apna, job, i, have, applied, fo

In [5]:
train_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo,word_frequencies
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[i, had, continue, received, random, calls, an..."
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[the, above, fraudster, is, continuously, mess..."
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[he, is, acting, like, a, police, and, demandi..."
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[in, apna, job, i, have, applied, for, job, in..."
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[i, received, a, call, from, lady, stating, th..."
...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, from, number, asking, about, ..."
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h..."


In [6]:

#Find the unique words in category.
unique_words = train_fraud['category'].apply(lambda x: set(re.findall(r'\b\w+\b', x.lower())))
unique_words

0        {media, social, related, crime, and, online}
1                          {fraud, financial, online}
2                         {gambling, online, betting}
3        {media, social, related, crime, and, online}
4                          {fraud, financial, online}
                             ...                     
93681                      {fraud, financial, online}
93682                      {fraud, financial, online}
93683                      {fraud, financial, online}
93684    {media, social, related, crime, and, online}
93685                      {fraud, financial, online}
Name: category, Length: 87074, dtype: object

In [7]:
## additional category and subcategory
crime_keywords = {
    "Women/Child-Related Crime": {
        "Child Pornography/Child Sexual Abuse Material (CSAM)": {
            "keywords": {"child pornography", "csam", "child exploitation", "child sexual abuse", "child porn","Online and social media related crime"}
        },
        "Rape/Gang Rape-Sexually Abusive Content": {
            "keywords": {"rape", "gang rape", "sexual abuse", "sexual assault", "forced sex"}
        },
        "Sale, Publishing, and Transmitting Obscene Material/Sexually Explicit Material": {
            "keywords": {"obscene material", "explicit content", "pornography", "sexually explicit", "adult content","Online and social media related crime"}
        }
    },
    "Financial Fraud Crimes": {
        "Debit/Credit Card Fraud": {
            "keywords": {"credit ","card fraud", "debit"," card fraud", "card scam", "card theft","online financial fraud"}
        },
        "SIM Swap Fraud": {
            "keywords": {"sim swap fraud", "sim hacking", "sim card fraud","sim"}
        },
        "Internet Banking-Related Fraud": {
            "keywords": {"internet banking fraud", "online banking fraud", "banking scam","bank","money","loan","lending"}
        },
        "Business Email Compromise/Email Takeover": {
            "keywords": {"email takeover", "email compromise", "business email hack","Online and social media related crime"}
        },
        "E-Wallet Related Frauds": {
            "keywords": {"e-wallet fraud", "digital wallet fraud", "mobile wallet scam"}
        },
        "Fraud Call/Vishing": {
            "keywords": {"vishing", "fraud call", "fake call", "phone scam"}
        },
        "Demat/Depository Fraud": {
            "keywords": {"demat fraud", "depository fraud", "investment account fraud","deposit","investment", "depository", "fraud"}
        },
        "UPI-Related Frauds": {
            "keywords": {"upi fraud", "upi scam", "online payment fraud"," online", "upi"}
        },
        "Aadhaar Enabled Payment System (AEPS) Fraud": {
            "keywords": {"aeps fraud", "aadhaar payment fraud", "aadhaar scam","aadhar"}
        }
    },
    "Cyber Crimes": {
        "Email Phishing": {
            "keywords": {"email phishing", "phishing scam", "phishing email", "email fraud", "Online and social media created crime","email"}
        },
        "Cheating by Impersonation": {
            "keywords": {"impersonation fraud", "fake identity", "identity impersonation", "Online and social media created crime"}
        },
        "Fake/Impersonating Profile": {
            "keywords": {"fake profile", "impersonation profile", "fake social profile","Online and social media created crime","fake"}
        },
        "Profile Hacking/Identity Theft": {
            "keywords": {"identity theft", "profile hacking", "identity fraud","fraudster"}
        },
        "Provocative Speech of Unlawful Acts": {
            "keywords": {"provocative speech", "unlawful acts", "incitement", "hate speech", "Online and social media created crime"}
        },
        "Intimidating Email": {
            "keywords": {"intimidating email", "threatening email", "email harassment"}
        },
        "Online Job Fraud": {
            "keywords": {"job fraud", "employment scam", "fake job offer","job", "Online and social media created crime"}
        },
        "Online Matrimonial Fraud": {
            "keywords": {"matrimonial fraud", "marriage scam", "dating scam","marriage"}
        },
        "Cyber Bullying/Stalking/Sexting": {
            "keywords": {"cyberbullying", "stalking", "sexting", "online harassment"}
        }
    },
    "Cyber Attacks": {
        "Defacement/Hacking": {
            "keywords": {"website defacement", "hacking", "web hacking", "site defacement","money","app"}
        },
        "Unauthorized Access/Data Breach": {
            "keywords": {"unauthorized access", "data breach", "data leak", "breach of privacy"}
        },
        "Ransomware": {
            "keywords": {"ransomware", "ransom virus", "ransom malware"}
        },
        "Cryptocurrency Crime": {
            "keywords": {"cryptocurrency scam", "bitcoin fraud", "crypto theft"}
        },
        "Cyber Terrorism": {
            "keywords": {"cyber terrorism", "cyber attack", "online terrorism"}
        }
    },
    "Network and Infrastructure Attacks": {
        "Denial of Service (DoS) and Distributed Denial of Service (DDoS)": {
            "keywords": {"dos attack", "ddos attack", "denial of service"}
        },
        "Attacks on Critical Infrastructure, SCADA, Operational Technology Systems": {
            "keywords": {"critical infrastructure attack", "scada attack", "ot systems hack"}
        }
    }
}

print(crime_keywords)




{'Women/Child-Related Crime': {'Child Pornography/Child Sexual Abuse Material (CSAM)': {'keywords': {'Online and social media related crime', 'csam', 'child pornography', 'child exploitation', 'child porn', 'child sexual abuse'}}, 'Rape/Gang Rape-Sexually Abusive Content': {'keywords': {'gang rape', 'sexual assault', 'rape', 'forced sex', 'sexual abuse'}}, 'Sale, Publishing, and Transmitting Obscene Material/Sexually Explicit Material': {'keywords': {'adult content', 'Online and social media related crime', 'obscene material', 'sexually explicit', 'explicit content', 'pornography'}}}, 'Financial Fraud Crimes': {'Debit/Credit Card Fraud': {'keywords': {'card fraud', 'debit', ' card fraud', 'credit ', 'online financial fraud', 'card theft', 'card scam'}}, 'SIM Swap Fraud': {'keywords': {'sim hacking', 'sim', 'sim card fraud', 'sim swap fraud'}}, 'Internet Banking-Related Fraud': {'keywords': {'lending', 'online banking fraud', 'bank', 'banking scam', 'money', 'loan', 'internet banking fr

In [8]:

# Define a function to classify the text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')).union({"i", "have", "has", "the", "and","aapka","ki","apna"})
def split_and_filter(sentence):
    if isinstance(sentence, str):  # Ensure the input is a string
        # Tokenize, remove non-English characters, convert to lowercase
        words = re.findall(r'\b[a-z]+\b', sentence.lower())  # Only words with a-z characters
        # Filter out stopwords
        filtered_words = [word for word in words if word not in stop_words]
        
        matches = defaultdict(int)
        for category, subcategories in crime_keywords.items():
            for subcategory, keywords in subcategories.items():
                for word in filtered_words:
                    if word in keywords:
                        matches[(category, subcategory)] += 1  # Count occurrence in subcategory
                        
    
        return dict(matches), filtered_words
    else:
        # Return an empty dictionary if the sentence is not a string
        return {}, []

# Apply classification to each row in 'additional_info'
train_fraud[['frequency', 'filtered_words']] = train_fraud['crimeaditionalinfo'].apply(
    lambda x: pd.Series(split_and_filter(x))
)


In [9]:
train_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo,word_frequencies,frequency,filtered_words
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[i, had, continue, received, random, calls, an...",{},"[continue, received, random, calls, abusive, m..."
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[the, above, fraudster, is, continuously, mess...",{},"[fraudster, continuously, messaging, asking, p..."
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[he, is, acting, like, a, police, and, demandi...",{},"[acting, like, police, demanding, money, addin..."
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[in, apna, job, i, have, applied, for, job, in...",{},"[job, applied, job, interview, telecalling, re..."
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[i, received, a, call, from, lady, stating, th...",{},"[received, call, lady, stating, send, new, pho..."
...,...,...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit...",{},"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, from, number, asking, about, ...",{},"[received, call, number, asking, phone, pay, c..."
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ...",{},"[cyber, stalking, blackmailing, phonesmsvoip, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h...",{},"[call, karke, bola, lotary, laga, ha, aru, ac,..."


In [10]:
train_fraud.columns

Index(['category', 'sub_category', 'crimeaditionalinfo', 'word_frequencies',
       'frequency', 'filtered_words'],
      dtype='object')

In [11]:
train= train_fraud[['category', 'sub_category', 'crimeaditionalinfo',
        'filtered_words']]

In [12]:

#Another approach to get tokens or words by using vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
train_fraud['crimeaditionalinfo']=  train_fraud['crimeaditionalinfo'].fillna(' ').astype(str)
def preprocess_text(text):
   if isinstance(text, str):
       
        # Apply some text processing, such as removing non-alphabet characters
        text= re.sub(r'[^a-zA-Z\s]', '', text.lower())
    
    # Tokenize, remove stop words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    # Join back to string
        return ' '.join(tokens)

# Apply preprocessing
train_fraud['processed_description'] = train_fraud['crimeaditionalinfo'].apply(preprocess_text)
print(train_fraud[['crimeaditionalinfo', 'processed_description']])
# Vectorize the processed text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_fraud['processed_description'])

                                      crimeaditionalinfo  \
0      i had continue received random calls and abusi...   
1      the above fraudster is continuously messaging ...   
2      he is acting like a police and demanding for m...   
3      in apna job i have applied for job interview f...   
4      i received a call from lady stating that she w...   
...                                                  ...   
93681  identity theft   smishing sms fraud  creditdeb...   
93682  received call from  number asking about phone ...   
93683  cyber stalking   blackmailing   phonesmsvoip c...   
93684  call karke bola ki aapka lotary laga ha aru ac...   
93685  there is app name koko loan app they send the ...   

                                   processed_description  
0      continue received random call abusive message ...  
1      fraudster continuously messaging asking pay mo...  
2      acting like police demanding money adding sect...  
3      apna job applied job interview telec

Filtered_words same as processed_description

In [13]:
import psutil
print(psutil.__version__)  # Check version
print(dir(psutil)) 

5.9.4
['ABOVE_NORMAL_PRIORITY_CLASS', 'AF_LINK', 'AIX', 'AccessDenied', 'BELOW_NORMAL_PRIORITY_CLASS', 'BSD', 'CONN_CLOSE', 'CONN_CLOSE_WAIT', 'CONN_CLOSING', 'CONN_DELETE_TCB', 'CONN_ESTABLISHED', 'CONN_FIN_WAIT1', 'CONN_FIN_WAIT2', 'CONN_LAST_ACK', 'CONN_LISTEN', 'CONN_NONE', 'CONN_SYN_RECV', 'CONN_SYN_SENT', 'CONN_TIME_WAIT', 'Error', 'FREEBSD', 'HIGH_PRIORITY_CLASS', 'IDLE_PRIORITY_CLASS', 'IOPRIO_HIGH', 'IOPRIO_LOW', 'IOPRIO_NORMAL', 'IOPRIO_VERYLOW', 'LINUX', 'MACOS', 'NETBSD', 'NIC_DUPLEX_FULL', 'NIC_DUPLEX_HALF', 'NIC_DUPLEX_UNKNOWN', 'NORMAL_PRIORITY_CLASS', 'NoSuchProcess', 'OPENBSD', 'OSX', 'POSIX', 'POWER_TIME_UNKNOWN', 'POWER_TIME_UNLIMITED', 'PermissionError', 'Popen', 'Process', 'ProcessLookupError', 'REALTIME_PRIORITY_CLASS', 'STATUS_DEAD', 'STATUS_DISK_SLEEP', 'STATUS_IDLE', 'STATUS_LOCKED', 'STATUS_PARKED', 'STATUS_RUNNING', 'STATUS_SLEEPING', 'STATUS_STOPPED', 'STATUS_TRACING_STOP', 'STATUS_WAITING', 'STATUS_WAKING', 'STATUS_ZOMBIE', 'SUNOS', 'TimeoutExpired', 'WINDO

In [14]:
test_fraud= pd.read_csv('test.csv')

In [15]:
test_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...
...,...,...,...
31224,Online and Social Media Related Crime,Online Matrimonial Fraud,A lady named Rashmi probably a fake name had c...
31225,Online Financial Fraud,Internet Banking Related Fraud,I am Mr Chokhe Ram Two pers mobile number wer...
31226,Any Other Cyber Crime,Other,Mai Bibekbraj maine pahle ki complain kar chuk...
31227,Online Financial Fraud,Internet Banking Related Fraud,received URL link for updating KYC from mobile...


In [16]:
test_fraud=test_fraud.fillna("")
train_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo,word_frequencies,frequency,filtered_words,processed_description
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[i, had, continue, received, random, calls, an...",{},"[continue, received, random, calls, abusive, m...",continue received random call abusive message ...
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[the, above, fraudster, is, continuously, mess...",{},"[fraudster, continuously, messaging, asking, p...",fraudster continuously messaging asking pay mo...
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[he, is, acting, like, a, police, and, demandi...",{},"[acting, like, police, demanding, money, addin...",acting like police demanding money adding sect...
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[in, apna, job, i, have, applied, for, job, in...",{},"[job, applied, job, interview, telecalling, re...",apna job applied job interview telecalling res...
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[i, received, a, call, from, lady, stating, th...",{},"[received, call, lady, stating, send, new, pho...",received call lady stating send new phone vivo...
...,...,...,...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit...",{},"[identity, theft, smishing, sms, fraud, credit...",identity theft smishing sm fraud creditdebit c...
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, from, number, asking, about, ...",{},"[received, call, number, asking, phone, pay, c...",received call number asking phone pay cash bac...
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ...",{},"[cyber, stalking, blackmailing, phonesmsvoip, ...",cyber stalking blackmailing phonesmsvoip call ...
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h...",{},"[call, karke, bola, lotary, laga, ha, aru, ac,...",call karke bola ki aapka lotary laga ha aru ac...


In [17]:
train_fraud= train_fraud[['category', 'sub_category', 'filtered_words']]

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Sample data loading (replace this with your dataset)
# Assuming 'fraud' is the target variable
# train_fraud = pd.read_csv('your_fraud_data.csv')
X_fraud = train_fraud.drop('filtered_words', axis=1)  # Replace 'fraud' with your target column
y_fraud = train_fraud['filtered_words']

# Example data for demonstration
from sklearn.datasets import make_classification
X_fraud, y_fraud = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=0)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier()
}

# Train and evaluate each model
results = []

for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    
    # Store results
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    })

# Convert results to DataFrame for easy comparison
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="F1 Score", ascending=False))


                    Model  Accuracy  Precision    Recall  F1 Score   ROC AUC
2           Random Forest  0.906667   0.949640  0.862745  0.904110  0.964052
3       Gradient Boosting  0.880000   0.939850  0.816993  0.874126  0.961229
1           Decision Tree  0.860000   0.877551  0.843137  0.860000  0.860344
0     Logistic Regression  0.856667   0.871622  0.843137  0.857143  0.942110
4  Support Vector Machine  0.856667   0.898551  0.810458  0.852234  0.926748
6          Neural Network  0.846667   0.890511  0.797386  0.841379  0.939398
5     K-Nearest Neighbors  0.813333   0.881890  0.732026  0.800000  0.899093




In [19]:
#Second approach

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

train_fraud = train_fraud.copy()
train_fraud['category'] = train_fraud['category'].astype('category')
train_fraud['category_encoded'] = train_fraud['category'].cat.codes

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, train_fraud['category_encoded'], test_size=0.2, random_state=42)


# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7745621590582831
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.06      0.10      2206
           1       0.83      0.06      0.11        88
           2       1.00      1.00      1.00       702
           3       0.50      0.02      0.04        43
           4       1.00      0.03      0.06       347
           5       0.00      0.00      0.00        36
           6       0.76      0.99      0.86     11528
           7       0.00      0.00      0.00        82
           8       0.77      0.51      0.61      2374
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00         1

    accuracy                           0.77     17415
   macro avg       0.51      0.24      0.25     17415
weighted avg       0.78      0.77      0.71     17415



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
y_pred

array([6, 6, 6, ..., 6, 6, 6], dtype=int8)