In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


# Load the labelled training set from the Kaggle competition input directory.
data_train = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/train.csv")

# Expand the whitespace-separated Modus_Operandi string into one column per token.
# The frame is reindexed to exactly ten positional columns *before* renaming so that
# m1..m10 always exist (absent positions become NaN) and any tokens beyond the tenth
# are dropped instead of leaking in as integer-named columns. Without this, a file
# whose longest entry has fewer than ten tokens would make the later m1..m10 column
# selection fail with a KeyError.
split_modus = (
    data_train["Modus_Operandi"]
    .str.split(expand=True)
    .reindex(columns=range(10))
    .rename(columns={i: f"m{i+1}" for i in range(10)})
    .fillna(0)  # missing token positions are encoded as 0
)
data_train = pd.concat([data_train, split_modus], axis=1)

# Columns used as model inputs: six columns taken directly from the CSV,
# followed by the ten token columns m1..m10.
features_to_keep = [
    "Status",
    "Victim_Sex",
    'Part 1-2',
    'Victim_Age',
    'Premise_Code',
    'Weapon_Used_Code',
    *(f"m{position}" for position in range(1, 11)),
]

# Feature matrix: only the selected input columns of the training frame.
X = data_train.loc[:, features_to_keep]

# Integer-encode the Crime_Category labels. The fitted encoder is kept in `le`
# so integer predictions can be mapped back to the original label strings.
le = LabelEncoder()
y = le.fit_transform(data_train['Crime_Category'])

# Columns sent through the categorical preprocessing branch.
# NOTE(review): Victim_Age is treated as categorical here while the code-like
# columns (Premise_Code, Weapon_Used_Code, m1..m10) fall into the numerical
# list below — confirm this routing is intentional.
categorical_features = ['Victim_Age', "Victim_Sex", "Status"]

# Every remaining column of X, in X's own column order, is treated as numerical.
numerical_features = X.columns[~X.columns.isin(categorical_features)].tolist()

# Numerical branch: impute gaps from the 5 nearest neighbours, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer; categories not seen during fit are encoded as -1.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# End-to-end estimator: preprocessing followed by an XGBoost classifier.
# NOTE(review): `use_label_encoder` may be deprecated/ignored depending on the
# installed xgboost version, and 'logloss' is xgboost's binary metric
# ('mlogloss' is the multiclass one) — confirm both against the environment.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
])





In [2]:
# Discrete candidate values sampled by the randomized search. The
# `classifier__` prefix addresses the 'classifier' step of the pipeline.
param_dist = dict(
    # ensemble size and tree shape
    classifier__n_estimators=[275, 300, 325, 350, 375],
    classifier__max_depth=[8, 9, 10, 11],
    # row / column subsampling
    classifier__subsample=[0.85, 0.87, 0.89, 0.91, 0.93],
    classifier__colsample_bytree=[0.23, 0.25, 0.27, 0.29, 0.31],
    # split control and step size
    classifier__gamma=[0.18, 0.2, 0.22, 0.24, 0.26],
    classifier__learning_rate=[0.13, 0.15, 0.17, 0.19, 0.21],
    classifier__min_child_weight=[1, 2, 3],
    # L1 / L2 regularisation
    classifier__reg_alpha=[0, 0.05, 0.1, 0.15],
    classifier__reg_lambda=[0.8, 0.9, 1, 1.1, 1.2],
)


from sklearn.model_selection import RepeatedStratifiedKFold

# 5 stratified folds repeated 3 times -> 15 validation splits per candidate.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=42)

# Sample 300 parameter combinations from param_dist, score each by accuracy
# across all splits, and fit on the full training data (fit returns the search
# object itself).
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=300,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
).fit(X, y)

# Report the winning configuration and its mean cross-validated accuracy.
for label, value in (
    ("Best parameters:", random_search.best_params_),
    ("Best cross-validation score:", random_search.best_score_),
):
    print(label, value)

# Pipeline refitted with the best parameters found by the search.
best_model = random_search.best_estimator_


# Load the unlabelled competition test set.
data_test = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/test.csv")

# Expand Modus_Operandi into token columns m1..m10.
# The number of columns produced by str.split depends on the longest entry in
# *this* file, so it is pinned to exactly ten positional columns before renaming:
# absent positions become NaN (then 0) and tokens beyond the tenth are dropped.
# Without this, a test file whose longest entry has fewer than ten tokens would
# make the feature selection below fail with a KeyError.
split_modus = (
    data_test["Modus_Operandi"]
    .str.split(expand=True)
    .reindex(columns=range(10))
    .rename(columns={i: f"m{i+1}" for i in range(10)})
    .fillna(0)  # missing token positions are encoded as 0
)
data_test = pd.concat([data_test, split_modus], axis=1)

# Keep only the model input columns, in the order listed in features_to_keep.
data_test = data_test[features_to_keep]

# Predict integer class ids and map them back to the original label strings.
test_predictions = le.inverse_transform(best_model.predict(data_test))

Fitting 15 folds for each of 300 candidates, totalling 4500 fits
Best parameters: {'classifier__subsample': 0.93, 'classifier__reg_lambda': 0.8, 'classifier__reg_alpha': 0, 'classifier__n_estimators': 350, 'classifier__min_child_weight': 2, 'classifier__max_depth': 8, 'classifier__learning_rate': 0.13, 'classifier__gamma': 0.18, 'classifier__colsample_bytree': 0.27}
Best cross-validation score: 0.9469666666666668


In [3]:
# Assemble the submission table: sequential 1-based IDs paired with the
# predicted category for each test row, then write it out without the index.
row_ids = range(1, len(test_predictions) + 1)
submission = pd.DataFrame(
    {'ID': row_ids, 'Crime_Category': test_predictions},
    columns=['ID', 'Crime_Category'],
)
submission.to_csv(path_or_buf='submissionnn2.csv', index=False)