In [1]:
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [2]:
import pandas as pd

In [3]:
# Location of the input CSV. Set the CRIME_DATA_CSV environment variable to
# point at your own copy of the file; when it is unset, the path used by the
# original author is kept as the fallback so existing runs behave as before.
DATA_CSV_PATH = os.environ.get(
    "CRIME_DATA_CSV",
    r"C:\Users\Ronak Gohil\Desktop\Hakathon\updatedd_data_updated.csv",
)
data = pd.read_csv(DATA_CSV_PATH)

In [4]:
# Train and evaluate a subcategory classifier: TF-IDF features built from the
# text column of `data`, fed to an XGBoost model that predicts the encoded
# 'sub_category' label.
TEXT_COLUMN = 'crimeaditionalinfo'
LABEL_COLUMN = 'sub_category'
TEST_FRACTION = 0.2  # share of rows held out for evaluation
SEED = 42            # shared by the split and the model for reproducibility

# TfidfVectorizer cannot process missing values, so rows without text are
# dropped up front instead of crashing inside fit_transform. The result goes
# into a new frame; `data` is left untouched so this cell can be re-run safely.
data_with_text = data.dropna(subset=[TEXT_COLUMN])
X = data_with_text[TEXT_COLUMN]
y_sub = data_with_text[LABEL_COLUMN]

# Encode the string subcategory labels as integers (XGBoost needs numeric targets)
le = LabelEncoder()
y_sub_encoded = le.fit_transform(y_sub)

# Split data into training and testing sets
X_train, X_test, y_train_sub, y_test_sub = train_test_split(
    X, y_sub_encoded, test_size=TEST_FRACTION, random_state=SEED
)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the
# test text so no information from the held-out rows leaks into the features
vectorizer = TfidfVectorizer()
X_tfidf_train = vectorizer.fit_transform(X_train)
X_tfidf_test = vectorizer.transform(X_test)

# Use XGBoost for the subcategory classifier
xgb_sub = XGBClassifier(random_state=SEED)
xgb_sub.fit(X_tfidf_train, y_train_sub)

# Predict subcategories for the test set
y_pred_sub = xgb_sub.predict(X_tfidf_test)

# Evaluate subcategory classification performance; macro averaging weights
# every subcategory equally regardless of how many rows it has
sub_accuracy = accuracy_score(y_test_sub, y_pred_sub)
sub_f1 = f1_score(y_test_sub, y_pred_sub, average='macro')

print(f"Subcategory Accuracy: {sub_accuracy:.2f}")
print(f"Subcategory F1-score: {sub_f1:.2f}")

# Convert the numerical predictions back to their original string labels
y_pred_sub_labels = le.inverse_transform(y_pred_sub)

# Next step (not implemented here): map the subcategory predictions onto the
# higher-level categories, either by training a separate category-level
# classifier or by using a rule-based subcategory -> category lookup.

Subcategory Accuracy: 0.92
Subcategory F1-score: 0.93


In [5]:
import joblib

In [7]:
# Persist the fitted classifier and TF-IDF vectorizer next to the notebook
# (relative paths resolve against the current working directory).
for fitted_object, target_file in (
    (xgb_sub, "xgb_model.joblib"),
    (vectorizer, "vectorizer.joblib"),
):
    joblib.dump(fitted_object, target_file)

# The label encoder is saved last and kept as the cell's final expression so
# the notebook still echoes joblib's return value for it.
joblib.dump(le, "le.joblib")

['le.joblib']