In [1]:

import xgboost
import joblib
import pandas as pd
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the previously saved subcategory artifacts: the fitted classifier,
# the fitted text vectorizer and the subcategory label encoder.
# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# files from a trusted source, and load them with the same library versions
# that were used to save them.
xgb_sub = joblib.load("xgb_model.joblib")
vectorizer = joblib.load("vectorizer.joblib")
le_sub = joblib.load("le.joblib")  # not used by the statements in this cell

data = pd.read_csv("abbreviated_file.csv")

# The model input is the 'crimeaditionalinfo' text column and the target is
# the 'category' column. Missing or non-string cells are coerced to (possibly
# empty) strings so that vectorizer.transform does not fail on NaN/float values.
# NOTE(review): presumably the vectorizer was fitted on this same column with
# the same preprocessing - confirm against the notebook that produced it.
X = data['crimeaditionalinfo'].fillna("").astype(str)
y_category = data['category']

# Encode the category labels as integers; le_category is kept so the integer
# codes can be mapped back to class names later.
le_category = LabelEncoder()
y_category_encoded = le_category.fit_transform(y_category)

# Transform text using the loaded vectorizer (transform only, no re-fitting).
X_tfidf_full = vectorizer.transform(X)

# Get subcategory predictions for the entire dataset.
# NOTE(review): if xgb_sub was trained on rows of this same file, its
# predictions on those rows are optimistic and the test metrics computed
# downstream may be inflated - confirm how the subcategory model was trained.
subcategory_predictions = xgb_sub.predict(X_tfidf_full)

# Stack the predicted subcategory as one extra column next to the text features.
# The prediction enters as a single numeric column, i.e. the downstream model
# sees it as an ordinal value.
# NOTE(review): consider one-hot encoding it if the codes carry no order.
# format="csr" yields a row-indexable matrix directly instead of the default COO.
X_combined = hstack(
    (X_tfidf_full, subcategory_predictions.reshape(-1, 1)),
    format="csr",
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train_combined, X_test_combined, y_train_cat, y_test_cat = train_test_split(
    X_combined,
    y_category_encoded,
    test_size=0.2,
    random_state=42,
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# classification_report is used at the end of this cell; without this import
# the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# Build a pipeline that scales the features and then fits Logistic Regression.
pipeline = Pipeline([
    # with_mean=False: centering would densify the sparse feature matrix,
    # so only the variance scaling is applied.
    ('scaler', StandardScaler(with_mean=False)),
    ('log_reg', LogisticRegression(max_iter=1200, solver='saga', random_state=42))
])

# Train the pipeline on the training set
pipeline.fit(X_train_combined, y_train_cat)

# Predict categories for the held-out test set
y_pred_cat = pipeline.predict(X_test_combined)

# Evaluate main category classification performance.
# The F1 score is averaged over classes weighted by their support.
cat_accuracy = accuracy_score(y_test_cat, y_pred_cat)
cat_f1 = f1_score(y_test_cat, y_pred_cat, average='weighted')

print(f"Category Accuracy: {cat_accuracy:.2f}")
print(f"Category F1-score: {cat_f1:.2f}")
# Per-class precision/recall/F1, labelled with the original category names.
print(classification_report(y_test_cat, y_pred_cat, target_names=le_category.classes_))




Category Accuracy: 0.98
Category F1-score: 0.98
                           precision    recall  f1-score   support

   Financial Fraud Crimes       0.98      0.98      0.98     47530
        Other Cyber Crime       0.99      0.99      0.99    113173
Women/Child Related Crime       0.98      0.97      0.98     19901

                 accuracy                           0.98    180604
                macro avg       0.98      0.98      0.98    180604
             weighted avg       0.98      0.98      0.98    180604



In [5]:
 # Save the trained pipeline
# Persist both fitted objects: the scaler + classifier pipeline, and the label
# encoder needed to turn predicted integer codes back into category names.
for _artifact, _filename in (
    (pipeline, "log_reg_pipeline_model.joblib"),
    (le_category, "le_cat.joblib"),
):
    joblib.dump(_artifact, _filename)
print("Model pipeline saved as 'log_reg_pipeline_model.joblib'")


Model pipeline saved as 'log_reg_pipeline_model.joblib'


In [None]:
new_text = 
