<a href="https://colab.research.google.com/github/ashwinder1/ml-projects/blob/main/text_classification_hackathon_ver3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/content/drive/MyDrive/IndiaAI-Hackathon/dataset/train.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# read the csv file using pandas and see the header rows
df = pd.read_csv('/content/drive/MyDrive/IndiaAI-Hackathon/dataset/train.csv')
df.head()

### EDA

In [None]:
print(len(df['crimeaditionalinfo']))
print(len(df['sub_category']))

# 1. Text Preprocessing:

- convert to lowercase
- remove extra spaces
- remove Nan values
- convert to string

In [None]:
import pandas as pd

# ... (other imports and code)

cases_data = df['crimeaditionalinfo'] # get the description data or features

# Handle NaN values and convert to string
cases_data = cases_data.fillna("").astype(str)

# Convert to lowercase
cases_data = cases_data.str.lower()
# Remove extra spaces
cases_data = cases_data.str.split().str.join(' ') # Assign the result back to cases_data
cases_data.head()

print(len(cases_data))

93686


### Apply tokenisation

In [None]:
# install libraries for tokenisation
!pip install transformers

In [26]:
from transformers import AutoTokenizer
import pandas as pd

# Initialize the tokenizer once
tokenizer = AutoTokenizer.from_pretrained("obaidtambo/hinglish_bert_tokenizer")

# Apply tokenization
def apply_tokenisation(text):
    if isinstance(text, pd.Series):
        text = text.astype(str).tolist()  # Convert to a list of strings
    elif isinstance(text, str):
        text = [text]  # Convert single text to list for consistency

    # Perform tokenization in a single batch
    encoded_output = tokenizer(
        text,
        return_tensors=None,
        truncation=True,
        max_length=512,
        padding='max_length'
    )

    # Extract tokens and token IDs for all sentences in the batch
    all_token_ids = encoded_output['input_ids']
    return all_token_ids

# Tokenize the entire dataset as a batch
tokens = apply_tokenisation(cases_data)

# Convert to DataFrame if you want an easy view
tokens_df = pd.DataFrame(tokens)
print("Tokens DataFrame shape:", tokens_df.shape)


Tokens DataFrame shape: (93686, 512)


### Remove Stopwords: English and Hinglish

In [None]:
# Install used libraries
!pip install nltk

In [29]:
# import libraries for removing stopwords
import nltk
from nltk.corpus import stopwords

Import a file containing Hinglish stop words

In [28]:
import requests
from typing import Set, Optional
import pandas as pd

class HinglishStopwordsLoader:
    def __init__(self):
        self.github_raw_url = "https://raw.githubusercontent.com/TrigonaMinima/HinglishNLP/master/data/assets/stop_hinglish"
        self.stopwords: Set[str] = set()

    def load_stopwords(self, cache: bool = True) -> Set[str]:
        """
        Load Hinglish stopwords from GitHub

        Args:
            cache: Whether to cache the stopwords after first load

        Returns:
            Set of Hinglish stopwords
        """
        # Return cached stopwords if available
        if cache and self.stopwords:
            return self.stopwords

        try:
            # Fetch content from GitHub
            response = requests.get(self.github_raw_url)
            response.raise_for_status()  # Raise exception for bad status codes

            # Process the content
            stopwords = set(
                word.strip()
                for word in response.text.split('\n')
                if word.strip()  # Remove empty lines
            )

            if cache:
                self.stopwords = stopwords

            return stopwords

        except requests.RequestException as e:
            print(f"Error fetching stopwords: {e}")
            return set()

    def remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from given text

        Args:
            text: Input text

        Returns:
            Text with stopwords removed
        """
        if not self.stopwords:
            self.load_stopwords()

        words = text.lower().split()
        return ' '.join(word for word in words if word not in self.stopwords)

    def get_stopwords_df(self) -> pd.DataFrame:
        """
        Return stopwords as a DataFrame for easy viewing
        """
        stopwords = self.load_stopwords()
        return pd.DataFrame(sorted(list(stopwords)), columns=['Stopwords'])

In [40]:
# Download NLTK resources if not already downloaded
nltk.download('stopwords')

# Load Hinglish Stop words
loader = HinglishStopwordsLoader()

# Initialize stopwords
english_stopwords = set(stopwords.words("english"))
hindi_stopwords = loader.load_stopwords()
# custom_stopwords = {"maine", "ke", "par", "ka", "ki", "hai"}  # Add common Hinglish stopwords

# Merge stopwords into a single set
all_stopwords = english_stopwords | hindi_stopwords  # | custom_stopwords

# Function for removing stop words
def remove_stopwords(token_ids_list):
    # Decoding all tokens in batches
    decoded_tokens = [tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True) for token_ids in token_ids_list]

    # Vectorized stopword filtering using a list comprehension
    filtered_token_ids = [
        tokenizer.convert_tokens_to_ids([token for token in tokens if token not in all_stopwords])
        for tokens in decoded_tokens
    ]
    return filtered_token_ids

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Preprocess and print result
filtered_tokens = remove_stopwords(tokens)

# Convert to DataFrame if needed
filtered_tokens_df = pd.DataFrame(filtered_tokens)
print("Filtered Tokens DataFrame shape:", filtered_tokens_df.shape)

Filtered Tokens DataFrame shape: (93686, 489)


# Step 2: Feature Extraction


For textual data, we will convert the data into numerical form and use TFIDF to extract the most important features

In [45]:
# Use TF-IDF to convert tokens into numerical features or matrices
from sklearn.feature_extraction.text import TfidfVectorizer

# Assume filtered_tokens is the output from the stopword removal process
# Step 1: Reconstruct the text from the filtered tokens
reconstructed_texts = [' '.join(tokenizer.convert_ids_to_tokens(token_ids)) for token_ids in filtered_tokens]

# Step 2: Use TF-IDF to convert the reconstructed texts into numerical features
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_features = tfidf_vectorizer.fit_transform(reconstructed_texts)

# Check the shape of the resulting TF-IDF features
print("TF-IDF Feature Matrix shape:", tfidf_features.shape)

TF-IDF Feature Matrix shape: (93686, 15999)


For labelled or categorical data, we will convert it into encodings or numerical values that our classifier can interpret

In [43]:
from sklearn.preprocessing import LabelEncoder
# Example categories
categories = df['sub_category']
print("Categories shape:", len(categories))
# ["UPI Related Frauds", "Internet Banking Related Fraud", "DebitCredit Card FraudSim Swap Fraud", "Cyber Bullying Stalking Sexting"]

# Label encoding the categories
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(categories)
print(encoded_labels.shape)

Categories shape: 93686
(93686,)


# Step 3: Model Training


To train the model, first we will split our textual and categorical data into training and testing data

In [46]:
from sklearn.model_selection import train_test_split
# Step 4: Split the data
# Assuming `tfidf_features` is the feature matrix from text and `encoded_labels` are the encoded categories
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, encoded_labels, test_size=0.2, random_state=42)

print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (74948, 15999) (74948,)
Test set shape: (18738, 15999) (18738,)


#### Initialize and Train the LinearSVC Model
Now that you have both text features and encoded labels, train the LinearSVC model.

In [47]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, KFold
# Initialize LinearSVC
clf = LinearSVC(C=0.1, max_iter=1000)

# Train the model
clf.fit(X_train, y_train)

# Test model accuracy
accuracy = clf.score(X_test, y_test)
print("Model accuracy:", accuracy)

kf = KFold(n_splits=4)
scores = cross_val_score(clf, tfidf_features, encoded_labels, cv=kf)
print("Cross-validated accuracy scores:", scores)
print("Average accuracy:", scores.mean())

Model accuracy: 0.5484043120930729
Cross-validated accuracy scores: [0.55046537 0.54427461 0.55142821 0.55194057]
Average accuracy: 0.5495271899151611


Example to predict the category of text with the model

In [54]:
# Sample text to classify
sample_text = """
I would like to complain against EASY BORROW app on Google Playstore
These apps collect data from their users phone like contacts photos etc and
then they call on those numbers for payment They use foul languages and harass them
This is a serious cfrime of stealing data from mobile phones and using it to bully the
users I request you to look into
this and take strict action against such apps and companies supporting them Thank You
"""

# Step 1: Preprocess the sample text
def preprocess_sample_text(text):
    # Tokenization (using the same tokenizer as before)
    encoded_output = tokenizer.encode_plus(
        text,
        return_tensors=None,
        truncation=True,
        max_length=512,
        padding='max_length'
    )
    token_ids = encoded_output.get('input_ids')

    # Remove stopwords from token_ids
    tokens = tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)
    filtered_tokens = [token for token in tokens if token not in all_stopwords]

    # Reconstruct the text for TF-IDF vectorization
    reconstructed_text = ' '.join(filtered_tokens)

    # Convert to TF-IDF features
    tfidf_vector = tfidf_vectorizer.transform([reconstructed_text])

    return tfidf_vector

# Step 2: Process the sample text and get TF-IDF features
sample_tfidf = preprocess_sample_text(sample_text)

# Step 3: Predict the category
predicted_label = clf.predict(sample_tfidf)

# Step 4: Decode the predicted label back to original category
predicted_category = label_encoder.inverse_transform(predicted_label)

print("Predicted Category:", predicted_category[0])

Predicted Category: Cyber Bullying  Stalking  Sexting
