In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# hstack (below) is used later to combine the per-column feature matrices
from scipy.sparse import hstack

[0 2 1]


In [11]:
# Toy dataset: one free-text column, one categorical column and one
# numeric column, assembled column by column (insertion order is the
# resulting column order).
data = {}
data['text'] = ['Crime event 1', 'Crime event 2', 'Crime event 3']
data['category'] = ['Type A', 'Type B', 'Type A']
data['numerical'] = [10, 20, 15]

df = pd.DataFrame(data=data)
df

Unnamed: 0,text,category,numerical
0,Crime event 1,Type A,10
1,Crime event 2,Type B,20
2,Crime event 3,Type A,15


In [9]:
# One transformer per column type: TF-IDF for the free text, one-hot
# encoding for the category, standardisation for the numeric column.
vectorizer = TfidfVectorizer()
encoder = OneHotEncoder()
scaler = StandardScaler()

# Fit each transformer on its own column and keep the transformed block.
# NOTE(review): with the default TfidfVectorizer tokenizer, single-character
# tokens such as the trailing "1"/"2"/"3" appear to be dropped, which would
# leave every row with the same text vector -- confirm this is intended.
text_features = vectorizer.fit_transform(df.loc[:, 'text'])
category_features = encoder.fit_transform(df.loc[:, ['category']])
numerical_features = scaler.fit_transform(df.loc[:, ['numerical']])

# Place the blocks side by side: text | category | numerical.
feature_blocks = (text_features, category_features, numerical_features)
combined_features = hstack(feature_blocks)

combined_features.toarray()

array([[ 0.70710678,  0.70710678,  1.        ,  0.        , -1.22474487],
       [ 0.70710678,  0.70710678,  0.        ,  1.        ,  1.22474487],
       [ 0.70710678,  0.70710678,  1.        ,  0.        ,  0.        ]])

In [10]:
# Pairwise cosine similarity of every row of the combined feature matrix
# against every other row (entry [i, j] compares row i with row j).
similarities = cosine_similarity(X=combined_features)
similarities

array([[ 1.        , -0.14285714,  0.75592895],
       [-0.14285714,  1.        ,  0.37796447],
       [ 0.75592895,  0.37796447,  1.        ]])

In [12]:

# Retrieve the top-N most similar rows for a given query row
def get_top_similar_rows(similarities, query_index, top_n=10, exclude_query=False):
    """Rank row positions by descending similarity to one query row.

    Parameters
    ----------
    similarities : array-like
        Similarity matrix; ``similarities[query_index]`` must yield the
        scores of every candidate row against the query row.
    query_index : int
        Position of the query row inside ``similarities``.
    top_n : int, default 10
        Maximum number of positions to return. Fewer are returned when the
        matrix has fewer candidate rows.
    exclude_query : bool, default False
        When True, the query row's own position is removed from the ranking
        before truncating to ``top_n``. This assumes ``similarities`` is a
        square self-similarity matrix, so that column ``query_index`` is the
        query row itself. The default keeps the original behaviour, where
        the query row is part of the ranking.

    Returns
    -------
    numpy.ndarray
        Integer positions ordered from most to least similar. Tied scores
        keep ascending position order.

    Raises
    ------
    ValueError
        If ``top_n`` is negative (a negative slice bound would silently
        drop rows from the end instead of limiting the result).
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Flatten so that np.matrix rows or nested lists are handled as a
    # plain 1-D score vector.
    similarity_scores = np.asarray(similarities[query_index]).ravel()

    # A stable sort makes the ordering of tied scores deterministic.
    ranked = np.argsort(-similarity_scores, kind="stable")

    if exclude_query and similarity_scores.size:
        # Normalise negative indices so the comparison matches a position.
        query_pos = query_index % similarity_scores.size
        ranked = ranked[ranked != query_pos]

    return ranked[:top_n]

# Demo: rank all rows against the first row of the dataset.
query_index = 0  # position of the row used as the query
top_similar_rows = get_top_similar_rows(similarities, query_index=query_index)
print(top_similar_rows)

[0 2 1]


In [14]:
# Look up the ranked positions in the original frame, keeping rank order.
top_similar_df = df.iloc[top_similar_rows, :]
top_similar_df

Unnamed: 0,text,category,numerical
0,Crime event 1,Type A,10
2,Crime event 3,Type A,15
1,Crime event 2,Type B,20
