In [59]:
import pandas as pd
import numpy as np
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [49]:
# Columns tokenised as text; each one gets its own tokenizer and embedding branch.
text_cols = [
    'Location',
    'Modus_Operandi',
    'Premise_Description',
    'Weapon_Description',
]

# Columns passed to the model together as a single numeric input.
num_cols = [
    'Latitude',
    'Longitude',
    'Victim_Age',
]

# Column holding the label that the model is trained to predict.
target_col = 'Crime_Category'

In [67]:
# Length every tokenised text column is padded to by pad_sequences.
max_length = 100
# Vocabulary cap passed to each Tokenizer (num_words); the model's Embedding
# layers use the same value as input_dim.
word_count = 1000

tokenizers = {}    # column name -> Tokenizer fitted on the training data
padded_texts = []  # one padded integer matrix per text column, in text_cols order

for column in text_cols:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection: the chained in-place form raises a FutureWarning on
    # pandas 2.x and no longer updates the frame under Copy-on-Write.
    train_data[column] = train_data[column].fillna('')
    test_data[column] = test_data[column].fillna('')

    # Fit on training text only; the fitted tokenizer is kept so the test set
    # can be encoded with the same vocabulary later.
    tokenizer = Tokenizer(num_words=word_count)
    tokenizer.fit_on_texts(train_data[column])
    tokenizers[column] = tokenizer

    train_sequences = tokenizer.texts_to_sequences(train_data[column])
    padded_texts.append(pad_sequences(train_sequences, maxlen=max_length))

In [64]:
# Map the target labels to integer class ids, then one-hot encode them to match
# the categorical_crossentropy loss used at training time.
label_encoder = LabelEncoder()
label_encoder.fit(train_data[target_col])
y_encoded = label_encoder.transform(train_data[target_col])
y_category = to_categorical(y_encoded)

In [77]:
# Number of distinct target classes seen by the encoder; the same expression
# sizes the model's softmax output layer.
len(label_encoder.classes_)

6

In [120]:
# Build a multi-input classifier: one Embedding -> Flatten branch per text
# column, concatenated with the numeric features and fed through a dense stack.
inputs = []      # model inputs: one per text column, then the numeric input
embeddings = []  # flattened embedding output of each text branch

for column in text_cols:
    # Named text_input rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    text_input = keras.layers.Input(shape=(max_length,))
    inputs.append(text_input)

    embedding = keras.layers.Embedding(input_dim=word_count, output_dim=32)(text_input)
    flatten = keras.layers.Flatten()(embedding)
    embeddings.append(flatten)

# Single input carrying all numeric columns; appended last, so data passed to
# fit/predict must list the text matrices first and the numeric block last.
input_num = keras.layers.Input(shape=(len(num_cols), ))
inputs.append(input_num)

concatenated = keras.layers.Concatenate()(embeddings + [input_num])
dense = keras.layers.Dense(64, activation='relu')(concatenated)
dense = keras.layers.Dense(64, activation='relu')(dense)
dense = keras.layers.Dense(128, activation='relu')(dense)
dense = keras.layers.Dense(128, activation='tanh')(dense)
dropout = keras.layers.Dropout(0.2)(dense)
# One softmax unit per class known to the label encoder.
output = keras.layers.Dense(len(label_encoder.classes_), activation='softmax')(dropout)

model = keras.Model(inputs=inputs, outputs=output)

In [121]:
# Print the model's layer-by-layer summary as a sanity check before training.
model.summary()

In [122]:
# Compile for multi-class classification against the one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# In the recorded run, validation loss bottomed out around epoch 4 and then
# rose while training loss kept falling. Stop once val_loss has not improved
# for a few epochs and restore the best weights, so predictions do not come
# from the overfit final epoch. `epochs` below remains the upper bound.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Inputs follow the model's input order: one padded matrix per text column,
# then the numeric columns. Keeping the History object allows the training
# curves to be inspected later and avoids leaking its repr as cell output.
history = model.fit(
    padded_texts + [train_data[num_cols]],
    y_category,
    epochs=20,
    batch_size=128,
    validation_split=0.15,
    callbacks=[early_stopping],
)

Epoch 1/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.6742 - loss: 0.9255 - val_accuracy: 0.8893 - val_loss: 0.3777
Epoch 2/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8947 - loss: 0.3604 - val_accuracy: 0.9137 - val_loss: 0.2984
Epoch 3/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9161 - loss: 0.2797 - val_accuracy: 0.9210 - val_loss: 0.2764
Epoch 4/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9350 - loss: 0.2144 - val_accuracy: 0.9270 - val_loss: 0.2711
Epoch 5/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9451 - loss: 0.1892 - val_accuracy: 0.9210 - val_loss: 0.2854
Epoch 6/20
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9509 - loss: 0.1663 - val_accuracy: 0.9227 - val_loss: 0.2804
Epoch 7/20
[1m133/133[0m 

<keras.src.callbacks.history.History at 0x78c4e0649930>

In [123]:
# Encode the test text with the tokenizers fitted on the training data so both
# sets share the same vocabulary and padded length.
output_padded_texts = []  # one padded integer matrix per text column, in text_cols order

for column in text_cols:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection: the chained in-place form raises a FutureWarning on
    # pandas 2.x and no longer updates the frame under Copy-on-Write.
    test_data[column] = test_data[column].fillna('')

    test_sequences = tokenizers[column].texts_to_sequences(test_data[column])
    output_padded_texts.append(pad_sequences(test_sequences, maxlen=max_length))

In [124]:
# Run inference on the test set; inputs are the padded text matrices followed
# by the numeric columns, the same ordering used when fitting.
predict = model.predict([*output_padded_texts, test_data[num_cols]])

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [125]:
# Pick the highest-scoring class for every row, translate the class ids back to
# the original labels, and pair them with 1-based sequential row ids.
predicted_ids = predict.argmax(axis=1)
output = pd.DataFrame(
    {
        "ID": range(1, len(predict) + 1),
        "Crime_Category": label_encoder.inverse_transform(predicted_ids),
    }
)

In [126]:
# Write the predictions to output.csv, omitting the DataFrame index column.
output.to_csv('output.csv', index=False)