In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Location of the complaints CSV.
# NOTE(review): this is a machine-specific absolute Windows path; the notebook
# will not run elsewhere until DATA_PATH is pointed at a local copy of the file.
DATA_PATH = r"C:\Users\amit\resize_data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Fetch the NLTK resources used by the preprocessing helpers below:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer models for word_tokenize.
#     Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt'
#     models, so both are requested to keep the notebook working across versions.
#   - 'wordnet': dictionary backing WordNetLemmatizer.
#   - 'stopwords': the English stop-word list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
# Kept as the last bare expression so the cell still displays the download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def clean_text(text):
    """Normalize a raw text value to lowercase ASCII letters separated by single spaces.

    Args:
        text: The value to clean. Normally a ``str``. ``None`` and float ``NaN``
            (how pandas represents an empty cell in an object column) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The lower-cased text with every character that is not an ASCII
        letter or whitespace removed, whitespace runs collapsed to one space,
        and leading/trailing whitespace stripped. Empty string for missing input.
    """
    # Missing values would otherwise fail with AttributeError on .lower().
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits. Characters are deleted, not replaced
    # by a space, so "a,b" becomes "ab".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Module-level NLTK helpers, built once and shared by every call below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens):
    """Drop English stop words from ``tokens`` and lemmatize what remains.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: Lemmatized tokens, in their original order, excluding any token
        present in ``stop_words``.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

In [6]:
# Build the model input from the free-text column 'crimeaditionalinfo' of `df`:
# clean -> tokenize -> drop stop words and lemmatize. Each stage is stored in
# its own column so intermediate results can be inspected.
raw_descriptions = df['crimeaditionalinfo']
df['cleaned_text'] = raw_descriptions.apply(clean_text)
df['tokens'] = df['cleaned_text'].apply(tokenize_text)
df['processed_tokens'] = df['tokens'].apply(remove_stopwords_and_lemmatize)

# Collapse each token list back into one space-separated string
df['processed_text'] = df['processed_tokens'].apply(' '.join)

In [7]:
# Show the raw description next to its processed form for the first rows
preview_columns = ['crimeaditionalinfo', 'processed_text']
print(df[preview_columns].head())

                                  crimeaditionalinfo  \
0  The issue actually started when I got this ema...   
1  The lady in the attached videos goes by the na...   
2  This all happened a few days after I accidenta...   
3  i am thiyagaraj I have issue with my facebook ...   
4  My email got hacked without any clue I didnt c...   

                                      processed_text  
0  issue actually started got email first glance ...  
1  lady attached video go name swathi iyer social...  
2  happened day accidentally clicked strange link...  
3  thiyagaraj issue facebook today using facebook...  
4  email got hacked without clue didnt click down...  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the whole processed corpus, vocabulary capped at 50,000 terms.
# NOTE(review): `vectorizer` is rebound to a new TfidfVectorizer in the
# subcategory-training cell, and `X_tfidf` is not referenced again in the
# visible cells - confirm whether this cell is still needed.
processed_corpus = df['processed_text']
vectorizer = TfidfVectorizer(max_features=50000)
X_tfidf = vectorizer.fit_transform(processed_corpus)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Inputs: the processed complaint strings; targets: the 'sub_category' column of `df`
X = df['processed_text']
y_sub = df['sub_category']

# Map the string subcategory labels to integer class ids
le = LabelEncoder()
y_sub_encoded = le.fit_transform(y_sub)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train_sub, y_test_sub = train_test_split(
    X,
    y_sub_encoded,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary on the training split only, then reuse it for the test split
vectorizer = TfidfVectorizer()
X_tfidf_train = vectorizer.fit_transform(X_train)
X_tfidf_test = vectorizer.transform(X_test)

# Fit the XGBoost subcategory classifier
xgb_sub = XGBClassifier(random_state=42)
xgb_sub.fit(X_tfidf_train, y_train_sub)

# Score on the hold-out split (macro F1 weights every subcategory equally)
y_pred_sub = xgb_sub.predict(X_tfidf_test)
sub_accuracy = accuracy_score(y_test_sub, y_pred_sub)
sub_f1 = f1_score(y_test_sub, y_pred_sub, average='macro')

print(
    f"Subcategory Accuracy: {sub_accuracy:.2f}",
    f"Subcategory F1-score: {sub_f1:.2f}",
    sep="\n",
)

# Integer predictions mapped back to their original string labels
y_pred_sub_labels = le.inverse_transform(y_pred_sub)

# Next step: feed the subcategory predictions into the higher-level category
# classification, either with a separate category-level classifier or with rules.

Subcategory Accuracy: 0.85
Subcategory F1-score: 0.82


In [12]:
import joblib

# Persist the fitted subcategory classifier and its TF-IDF vectorizer
for fitted_object, target_file in (
    (xgb_sub, 'subcategory_classifier.joblib'),
    (vectorizer, 'tfidf_vectorizer.joblib'),
):
    joblib.dump(fitted_object, target_file)

# Persist the subcategory LabelEncoder (last expression, so its path list is displayed)
joblib.dump(le, 'label_encoder.joblib')

['label_encoder.joblib']

# Category classification

In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
from scipy.sparse import hstack

# Second stage: predict the coarse 'category' column of `df` from the TF-IDF
# features plus the subcategory predicted by the first-stage model.
y_category = df['category']

# Encode the category labels
le_category = LabelEncoder()
y_category_encoded = le_category.fit_transform(y_category)

# Get subcategory predictions for the entire dataset, using the vectorizer and
# XGBoost model fitted in the subcategory cell.
# NOTE(review): rows that were in the subcategory training split receive
# in-sample predictions here, so this feature is likely more accurate on the
# training rows than it will be on unseen text - consider out-of-fold predictions.
X_tfidf_full = vectorizer.transform(X)
subcategory_predictions = xgb_sub.predict(X_tfidf_full)

# Combine TF-IDF features with subcategory predictions (one extra column
# holding the integer-encoded subcategory id)
X_combined = hstack((X_tfidf_full, subcategory_predictions.reshape(-1, 1)))

# Split text, combined features and labels in a single call so that all three
# are partitioned with the same row indices by construction, rather than relying
# on two separate calls happening to share test_size and random_state.
(
    _,
    X_test_cat,
    X_train_combined,
    X_test_combined,
    y_train_cat,
    y_test_cat,
) = train_test_split(
    X, X_combined, y_category_encoded, test_size=0.2, random_state=42
)

# Train the category-level classifier
rf_category = RandomForestClassifier(random_state=42)
rf_category.fit(X_train_combined, y_train_cat)

# Predict categories for the test set
y_pred_cat = rf_category.predict(X_test_combined)

# Evaluate category classification performance
cat_accuracy = accuracy_score(y_test_cat, y_pred_cat)
cat_f1 = f1_score(y_test_cat, y_pred_cat, average='weighted')

print(f"Category Accuracy: {cat_accuracy:.2f}")
print(f"Category F1-score: {cat_f1:.2f}")

# Save the category-level classifier
joblib.dump(rf_category, 'category_classifier.joblib')

# Save the category label encoder
joblib.dump(le_category, 'category_label_encoder.joblib')

# Function to predict both subcategory and category for new text
def predict_categories(text, preprocess=True):
    """Predict the subcategory and main category for a piece of text.

    The vectorizer and both classifiers were fitted on ``df['processed_text']``,
    i.e. text that had already gone through ``clean_text``, ``tokenize_text``
    and ``remove_stopwords_and_lemmatize``. Raw input must go through the same
    steps, otherwise the features seen at prediction time differ from the ones
    seen in training.

    Args:
        text (str): The input text to be classified.
        preprocess (bool): When True (default) apply the training-time
            preprocessing pipeline to ``text`` first. Pass False only if
            ``text`` is already in processed form.

    Returns:
        Tuple[str, str]: The predicted subcategory and main category labels.
    """
    if preprocess:
        # Same steps, in the same order, that produced df['processed_text']
        text = ' '.join(
            remove_stopwords_and_lemmatize(tokenize_text(clean_text(text)))
        )

    # Vectorize with the TF-IDF vocabulary learned on the training split
    X_new = vectorizer.transform([text])

    # Predict subcategory
    subcategory_pred = xgb_sub.predict(X_new)
    subcategory_label = le.inverse_transform(subcategory_pred)[0]

    # Combine features for category prediction: TF-IDF columns plus the
    # integer-encoded subcategory id, matching the layout used in training
    X_combined_new = hstack((X_new, subcategory_pred.reshape(-1, 1)))

    # Predict category
    # NOTE(review): nothing constrains the category to be the parent of the
    # predicted subcategory, so the two labels can disagree.
    category_pred = rf_category.predict(X_combined_new)
    category_label = le_category.inverse_transform(category_pred)[0]

    return subcategory_label, category_label

# Smoke-test the two-stage predictor on one hand-written sentence
test_text = "This is a test text for prediction"
sub_cat, main_cat = predict_categories(test_text)
print(
    f"Predicted Subcategory: {sub_cat}",
    f"Predicted Main Category: {main_cat}",
    sep="\n",
)

Category Accuracy: 0.96
Category F1-score: 0.96
Predicted Subcategory: Sexually Explicit Act
Predicted Main Category: Financial Fraud Crimes


# Load the pretrained models and run prediction

In [36]:
import joblib
import numpy as np
from scipy.sparse import hstack

# Restore the persisted models and encoders from the working directory.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
(
    xgb_sub,
    rf_category,
    vectorizer,
    le,
    le_category,
) = (
    joblib.load(artifact_file)
    for artifact_file in (
        'subcategory_classifier.joblib',
        'category_classifier.joblib',
        'tfidf_vectorizer.joblib',
        'label_encoder.joblib',
        'category_label_encoder.joblib',
    )
)

def predict_categories(text, preprocess=True):
    """
    Predict the subcategory and main category for a given text.

    The loaded vectorizer and classifiers were fitted on ``df['processed_text']``,
    i.e. text that had already gone through ``clean_text``, ``tokenize_text``
    and ``remove_stopwords_and_lemmatize``. Raw input must go through the same
    steps, otherwise the features seen at prediction time differ from the ones
    seen in training. With ``preprocess=True`` this function therefore needs
    those three helpers from the preprocessing cell to be defined.

    Args:
        text (str): The input text to be classified.
        preprocess (bool): When True (default) apply the training-time
            preprocessing pipeline to ``text`` first. Pass False only if
            ``text`` is already in processed form.

    Returns:
        Tuple[str, str]: The predicted subcategory and main category labels.
    """
    if preprocess:
        # Same steps, in the same order, that produced df['processed_text']
        text = ' '.join(
            remove_stopwords_and_lemmatize(tokenize_text(clean_text(text)))
        )

    # Vectorize with the TF-IDF vocabulary stored in the loaded vectorizer
    X_new = vectorizer.transform([text])

    # Predict subcategory
    subcategory_pred = xgb_sub.predict(X_new)
    subcategory_label = le.inverse_transform(subcategory_pred)[0]

    # Combine features for category prediction: TF-IDF columns plus the
    # integer-encoded subcategory id, matching the layout used in training
    X_combined_new = hstack((X_new, subcategory_pred.reshape(-1, 1)))

    # Predict category
    # NOTE(review): nothing constrains the category to be the parent of the
    # predicted subcategory, so the two labels can disagree.
    category_pred = rf_category.predict(X_combined_new)
    category_label = le_category.inverse_transform(category_pred)[0]

    return subcategory_label, category_label

# Example usage on a single complaint.
# NOTE(review): this sample looks like it is already in processed form
# (lower-case, no punctuation) - confirm where it was taken from.
new_text = "respected sir serious matter want inform person running involve shamefull activity using woman many place kolkata gariahat ballygunge ruby hospital behind area kasba anandapur quest mall metropolis mall area acropolismall area park circus forum mall elgin rd parkstreet maidan area esplanade garia jadavpur behalasakherbazar joka bansdroni rabindrasadanexide area haridevpur triangular park lake mall area rashbehari area tollygunge thakurpukur bbd bag dalhousie area saltlake name subhro saha amlan datta izaz ahmed anirban officially work insurance office hdfclife hindustanpark ab gariahat shopping mall st floor beside reliance trend building near gariahat outside several place involve shamefull activity last year sir person subhro saha forcefully involve woman employee ex female employee shamefull afternoon evening evening night time shamefull activity many innocent people trapped suffering dirty issue long time even lodge complain threatening blackmailing issue address place mobile whatsapp easily trace track activity need investigate last year detail please help"

subcategory, main_category = predict_categories(new_text)
print(
    f"Predicted Subcategory: {subcategory}",
    f"Predicted Main Category: {main_category}",
    sep="\n",
)

Predicted Subcategory: RapeGang Rape RGRSexually Abusive Content
Predicted Main Category: Women/Child Related Crime
