Problem Statement : Development of an NLP Model for Text Analytics and
Classification
• Objective:
o To develop an NLP model that categorizes complaint based on victim, type 
of fraud and other relevant parameters used for text classification and 
preparing the final model

In [1]:
import nltk
from nltk.probability import FreqDist
import pandas as pd
from collections import defaultdict, Counter
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
train_fraud=pd.read_csv("train.csv")
test_fraud=pd.read_csv("test.csv")

In [2]:
train_fraud['crimeaditionalinfo']=train_fraud['crimeaditionalinfo'].str.lower()

In [3]:
train_fraud= train_fraud.dropna()

In [4]:
# Apply the function to each row in the 'crimeaditionalinfo' column
train_fraud['word_frequencies'] = train_fraud['crimeaditionalinfo'].dropna().apply((lambda x: x.split()))

print(train_fraud[['crimeaditionalinfo', 'word_frequencies']])

                                      crimeaditionalinfo  \
0      i had continue received random calls and abusi...   
1      the above fraudster is continuously messaging ...   
2      he is acting like a police and demanding for m...   
3      in apna job i have applied for job interview f...   
4      i received a call from lady stating that she w...   
...                                                  ...   
93681  identity theft   smishing sms fraud  creditdeb...   
93682  received call from  number asking about phone ...   
93683  cyber stalking   blackmailing   phonesmsvoip c...   
93684  call karke bola ki aapka lotary laga ha aru ac...   
93685  there is app name koko loan app they send the ...   

                                        word_frequencies  
0      [i, had, continue, received, random, calls, an...  
1      [the, above, fraudster, is, continuously, mess...  
2      [he, is, acting, like, a, police, and, demandi...  
3      [in, apna, job, i, have, applied, fo

In [5]:
train_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo,word_frequencies
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[i, had, continue, received, random, calls, an..."
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[the, above, fraudster, is, continuously, mess..."
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[he, is, acting, like, a, police, and, demandi..."
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[in, apna, job, i, have, applied, for, job, in..."
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[i, received, a, call, from, lady, stating, th..."
...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, from, number, asking, about, ..."
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h..."


In [6]:
unique_words = train_fraud['category'].apply(lambda x: set(re.findall(r'\b\w+\b', x.lower())))
unique_words

0        {online, crime, social, related, and, media}
1                          {online, financial, fraud}
2                         {online, gambling, betting}
3        {online, crime, social, related, and, media}
4                          {online, financial, fraud}
                             ...                     
93681                      {online, financial, fraud}
93682                      {online, financial, fraud}
93683                      {online, financial, fraud}
93684    {online, crime, social, related, and, media}
93685                      {online, financial, fraud}
Name: category, Length: 87074, dtype: object

In [7]:
## import pandas as pd

crime_keywords = {
    "Women/Child-Related Crime": {
        "Child Pornography/Child Sexual Abuse Material (CSAM)": {
            "keywords": {"child pornography", "csam", "child exploitation", "child sexual abuse", "child porn","Online and social media related crime"}
        },
        "Rape/Gang Rape-Sexually Abusive Content": {
            "keywords": {"rape", "gang rape", "sexual abuse", "sexual assault", "forced sex"}
        },
        "Sale, Publishing, and Transmitting Obscene Material/Sexually Explicit Material": {
            "keywords": {"obscene material", "explicit content", "pornography", "sexually explicit", "adult content","Online and social media related crime"}
        }
    },
    "Financial Fraud Crimes": {
        "Debit/Credit Card Fraud": {
            "keywords": {"credit ","card fraud", "debit"," card fraud", "card scam", "card theft","online financial fraud"}
        },
        "SIM Swap Fraud": {
            "keywords": {"sim swap fraud", "sim hacking", "sim card fraud","sim"}
        },
        "Internet Banking-Related Fraud": {
            "keywords": {"internet banking fraud", "online banking fraud", "banking scam","bank","money","loan","lending"}
        },
        "Business Email Compromise/Email Takeover": {
            "keywords": {"email takeover", "email compromise", "business email hack","Online and social media related crime"}
        },
        "E-Wallet Related Frauds": {
            "keywords": {"e-wallet fraud", "digital wallet fraud", "mobile wallet scam"}
        },
        "Fraud Call/Vishing": {
            "keywords": {"vishing", "fraud call", "fake call", "phone scam"}
        },
        "Demat/Depository Fraud": {
            "keywords": {"demat fraud", "depository fraud", "investment account fraud","deposit","investment", "depository", "fraud"}
        },
        "UPI-Related Frauds": {
            "keywords": {"upi fraud", "upi scam", "online payment fraud"," online", "upi"}
        },
        "Aadhaar Enabled Payment System (AEPS) Fraud": {
            "keywords": {"aeps fraud", "aadhaar payment fraud", "aadhaar scam","aadhar"}
        }
    },
    "Cyber Crimes": {
        "Email Phishing": {
            "keywords": {"email phishing", "phishing scam", "phishing email", "email fraud", "Online and social media created crime","email"}
        },
        "Cheating by Impersonation": {
            "keywords": {"impersonation fraud", "fake identity", "identity impersonation", "Online and social media created crime"}
        },
        "Fake/Impersonating Profile": {
            "keywords": {"fake profile", "impersonation profile", "fake social profile","Online and social media created crime","fake"}
        },
        "Profile Hacking/Identity Theft": {
            "keywords": {"identity theft", "profile hacking", "identity fraud","fraudster"}
        },
        "Provocative Speech of Unlawful Acts": {
            "keywords": {"provocative speech", "unlawful acts", "incitement", "hate speech", "Online and social media created crime"}
        },
        "Intimidating Email": {
            "keywords": {"intimidating email", "threatening email", "email harassment"}
        },
        "Online Job Fraud": {
            "keywords": {"job fraud", "employment scam", "fake job offer","job", "Online and social media created crime"}
        },
        "Online Matrimonial Fraud": {
            "keywords": {"matrimonial fraud", "marriage scam", "dating scam","marriage"}
        },
        "Cyber Bullying/Stalking/Sexting": {
            "keywords": {"cyberbullying", "stalking", "sexting", "online harassment"}
        }
    },
    "Cyber Attacks": {
        "Defacement/Hacking": {
            "keywords": {"website defacement", "hacking", "web hacking", "site defacement","money","app"}
        },
        "Unauthorized Access/Data Breach": {
            "keywords": {"unauthorized access", "data breach", "data leak", "breach of privacy"}
        },
        "Ransomware": {
            "keywords": {"ransomware", "ransom virus", "ransom malware"}
        },
        "Cryptocurrency Crime": {
            "keywords": {"cryptocurrency scam", "bitcoin fraud", "crypto theft"}
        },
        "Cyber Terrorism": {
            "keywords": {"cyber terrorism", "cyber attack", "online terrorism"}
        }
    },
    "Network and Infrastructure Attacks": {
        "Denial of Service (DoS) and Distributed Denial of Service (DDoS)": {
            "keywords": {"dos attack", "ddos attack", "denial of service"}
        },
        "Attacks on Critical Infrastructure, SCADA, Operational Technology Systems": {
            "keywords": {"critical infrastructure attack", "scada attack", "ot systems hack"}
        }
    }
}

print(crime_keywords)




{'Women/Child-Related Crime': {'Child Pornography/Child Sexual Abuse Material (CSAM)': {'keywords': {'child exploitation', 'child sexual abuse', 'child pornography', 'Online and social media related crime', 'csam', 'child porn'}}, 'Rape/Gang Rape-Sexually Abusive Content': {'keywords': {'sexual assault', 'forced sex', 'rape', 'gang rape', 'sexual abuse'}}, 'Sale, Publishing, and Transmitting Obscene Material/Sexually Explicit Material': {'keywords': {'sexually explicit', 'adult content', 'explicit content', 'obscene material', 'Online and social media related crime', 'pornography'}}}, 'Financial Fraud Crimes': {'Debit/Credit Card Fraud': {'keywords': {'card fraud', 'debit', 'card scam', ' card fraud', 'card theft', 'credit ', 'online financial fraud'}}, 'SIM Swap Fraud': {'keywords': {'sim swap fraud', 'sim', 'sim card fraud', 'sim hacking'}}, 'Internet Banking-Related Fraud': {'keywords': {'banking scam', 'internet banking fraud', 'money', 'lending', 'loan', 'online banking fraud', 'b

In [8]:
# Define a function to classify the text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')).union({"i", "have", "has", "the", "and","aapka","ki","apna"})
def split_and_filter(sentence):
    if isinstance(sentence, str):  # Ensure the input is a string
        # Tokenize, remove non-English characters, convert to lowercase
        words = re.findall(r'\b[a-z]+\b', sentence.lower())  # Only words with a-z characters
        # Filter out stopwords
        filtered_words = [word for word in words if word not in stop_words]
        
        matches = defaultdict(int)
        for category, subcategories in crime_keywords.items():
            for subcategory, keywords in subcategories.items():
                for word in filtered_words:
                    if word in keywords:
                        matches[(category, subcategory)] += 1  # Count occurrence in subcategory
                        
    
        return dict(matches), filtered_words
    else:
        # Return an empty dictionary if the sentence is not a string
        return {}, []

# Apply classification to each row in 'additional_info'
train_fraud[['frequency', 'filtered_words']] = train_fraud['crimeaditionalinfo'].apply(
    lambda x: pd.Series(split_and_filter(x))
)


In [9]:
train_fraud

Unnamed: 0,category,sub_category,crimeaditionalinfo,word_frequencies,frequency,filtered_words
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[i, had, continue, received, random, calls, an...",{},"[continue, received, random, calls, abusive, m..."
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[the, above, fraudster, is, continuously, mess...",{},"[fraudster, continuously, messaging, asking, p..."
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[he, is, acting, like, a, police, and, demandi...",{},"[acting, like, police, demanding, money, addin..."
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[in, apna, job, i, have, applied, for, job, in...",{},"[job, applied, job, interview, telecalling, re..."
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[i, received, a, call, from, lady, stating, th...",{},"[received, call, lady, stating, send, new, pho..."
...,...,...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit...",{},"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, from, number, asking, about, ...",{},"[received, call, number, asking, phone, pay, c..."
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ...",{},"[cyber, stalking, blackmailing, phonesmsvoip, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h...",{},"[call, karke, bola, lotary, laga, ha, aru, ac,..."


In [10]:
train_fraud.columns

Index(['category', 'sub_category', 'crimeaditionalinfo', 'word_frequencies',
       'frequency', 'filtered_words'],
      dtype='object')

In [11]:
train= train_fraud[['category', 'sub_category', 'crimeaditionalinfo',
        'filtered_words']]

In [12]:
train


Unnamed: 0,category,sub_category,crimeaditionalinfo,filtered_words
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,i had continue received random calls and abusi...,"[continue, received, random, calls, abusive, m..."
1,Online Financial Fraud,Fraud CallVishing,the above fraudster is continuously messaging ...,"[fraudster, continuously, messaging, asking, p..."
2,Online Gambling Betting,Online Gambling Betting,he is acting like a police and demanding for m...,"[acting, like, police, demanding, money, addin..."
3,Online and Social Media Related Crime,Online Job Fraud,in apna job i have applied for job interview f...,"[job, applied, job, interview, telecalling, re..."
4,Online Financial Fraud,Fraud CallVishing,i received a call from lady stating that she w...,"[received, call, lady, stating, send, new, pho..."
...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,received call from number asking about phone ...,"[received, call, number, asking, phone, pay, c..."
93683,Online Financial Fraud,UPI Related Frauds,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalking, blackmailing, phonesmsvoip, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, lotary, laga, ha, aru, ac,..."


In [None]:

train_fraud['filtered_words'] =  ' '.join(train_fraud['category'])
# Vectorize the processed text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_fraud['filtered_words'])

In [None]:

train_fraud['category'] = train_fraud['category'].astype('category')
train_fraud['category_encoded'] = train_fraud['category'].cat.codes
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, train_fraud['category_encoded'], test_size=0.3, random_state=42)

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)


# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
test_fraud= pd.read_csv('test.csv')

In [None]:
test_fraud

In [None]:
test_fraud.isna().sum()

In [None]:
test_fraud=test_fraud.dropna()

In [None]:
test_fraud

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Sample data loading (replace this with your dataset)
# Assuming 'fraud' is the target variable
# train_fraud = pd.read_csv('your_fraud_data.csv')
X_fraud = train_fraud.drop('filtered_words', axis=1)  # Replace 'fraud' with your target column
y_fraud = train_fraud['filtered_words']

# Example data for demonstration
from sklearn.datasets import make_classification
X_fraud, y_fraud = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=0)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier()
}

# Train and evaluate each model
results = []

for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    
    # Store results
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    })

# Convert results to DataFrame for easy comparison
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="F1 Score", ascending=False))


In [None]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return list(synonyms)

# Expand each keyword with synonyms
for category, subcats in categories.items():
    for subcategory, keywords in subcats.items():
        synonyms = set()
        for keyword in keywords:
            for word in keyword.split():
                synonyms.update(get_synonyms(word))
        categories[category][subcategory].extend(synonyms)

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabet characters
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return tokens

# Function to classify text based on keywords
def classify_text(text):
    tokens = preprocess_text(text)
    matches = defaultdict(list)
    
    # Search for category and subcategory keywords in tokens
    for category, subcats in categories.items():
        for subcategory, keywords in subcats.items():
            if any(keyword in tokens for keyword in keywords):
                matches[category].append(subcategory)
    
    return dict(matches)

# Apply classification to each description
train_fraud['classification'] = train_fraud['crime_keywords'].apply(classify_text)
print(train_fraud[['crime_keywords', 'classification']])