In [1]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class AdvancedCrimeEmbedding:

    def __init__(self, text_model='all-MiniLM-L6-v2'):
        """
        Advanced Crime Event Embedding Generator
        
        Args:
            text_model: Pretrained sentence transformer model
        """
        # Text embedding model
        self.text_embedding_model = SentenceTransformer(text_model)
        
        # Categorical feature encoder
        self.categorical_encoder = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(handle_unknown='ignore'), 
                    ['location_type', 'crime_type', 'time_of_day']),
                ('num', StandardScaler(), 
                    ['severity_score', 'economic_impact'])
            ])
    
    def generate_embedding(self, crime_event):
        """
        Generate comprehensive embedding for a crime event
        
        Args:
            crime_event: Dictionary or DataFrame row containing crime event details
        
        Returns:
            Comprehensive embedding combining text and categorical features
        """
        # Generate text embedding
        narrative_embedding = self._get_text_embedding(crime_event['narrative'])
        
        # Prepare categorical features
        categorical_features = crime_event[
            ['location_type', 'crime_type', 'time_of_day', 
             'severity_score', 'economic_impact']
        ].to_frame().T
        
        
        # Transform categorical features
        categorical_features_transformed = self.categorical_encoder.fit_transform(categorical_features)
        
        
        # Convert to torch tensor
        categorical_tensor = torch.tensor(
            categorical_features_transformed,#.toarray(), 
            dtype=torch.float32
        )
        
        # Combine text and categorical embeddings
        combined_embedding = torch.cat([
            torch.tensor(narrative_embedding, dtype=torch.float32),
            categorical_tensor.flatten()
        ])
        
        return combined_embedding
    
    def _get_text_embedding(self, text):
        """
        Generate embedding for textual narrative
        
        Args:
            text: Crime event narrative text
        
        Returns:
            Embedding vector for the text
        """
        return self.text_embedding_model.encode(text)
    
    def calculate_similarity(self, embedding1, embedding2, method='cosine'):
        """
        Calculate similarity between two event embeddings
        
        Args:
            embedding1: First event embedding
            embedding2: Second event embedding
            method: Similarity calculation method
        
        Returns:
            Similarity score
        """
        if method == 'cosine':
            return torch.nn.functional.cosine_similarity(
                embedding1.unsqueeze(0), 
                embedding2.unsqueeze(0)
            ).item()
        elif method == 'euclidean':
            return -torch.dist(embedding1, embedding2).item()
        else:
            raise ValueError("Unsupported similarity method")
    
    def find_similar_events(self, target_embedding, event_embeddings, top_n=5, threshold=0.7):
        """
        Find most similar events to a target event
        
        Args:
            target_embedding: Embedding of the target event
            event_embeddings: List of event embeddings to compare
            top_n: Number of top similar events to return
            threshold: Minimum similarity threshold
        
        Returns:
            List of most similar events with their similarity scores
        """
        # Calculate similarities
        similarities = [
            (i, self.calculate_similarity(target_embedding, emb)) 
            for i, emb in enumerate(event_embeddings)
        ]
        
        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Filter by threshold and return top N
        return [
            (idx, sim) for idx, sim in similarities[:top_n] 
            if sim > threshold
        ]

def generate_synthetic_crime_dataset(n_samples=500):
    """
    Generate a synthetic crime dataset with rich narrative
    
    Args:
        n_samples: Number of crime events to generate
    
    Returns:
        Pandas DataFrame with detailed crime event features
    """
    np.random.seed(42)
    
    # Possible categorical values
    location_types = ['Residential', 'Commercial', 'Public Space', 'Industrial', 'Rural']
    crime_types = ['Theft', 'Assault', 'Burglary', 'Fraud', 'Vandalism']
    time_of_day = ['Morning', 'Afternoon', 'Evening', 'Night']
    
    # Generate synthetic narratives
    narratives = [
        f"A {np.random.choice(crime_types).lower()} occurred in a {np.random.choice(location_types).lower()} area. " +
        f"The incident took place during the {np.random.choice(time_of_day).lower()}. " +
        "Witnesses reported suspicious activity before the crime."
        for _ in range(n_samples)
    ]
    
    # Generate synthetic data
    data = {
        'location_type': np.random.choice(location_types, n_samples),
        'crime_type': np.random.choice(crime_types, n_samples),
        'time_of_day': np.random.choice(time_of_day, n_samples),
        'severity_score': np.random.uniform(1, 10, n_samples),
        'economic_impact': np.random.uniform(100, 10000, n_samples),
        'narrative': narratives
    }
    
    return pd.DataFrame(data)
    

In [2]:
crime_df = generate_synthetic_crime_dataset(n_samples=500)
crime_df

Unnamed: 0,location_type,crime_type,time_of_day,severity_score,economic_impact,narrative
0,Public Space,Fraud,Night,7.948112,6513.392715,A fraud occurred in a rural area. The incident...
1,Commercial,Assault,Evening,1.244507,1316.914717,A burglary occurred in a rural area. The incid...
2,Commercial,Assault,Night,1.586841,8897.724895,A assault occurred in a public space area. The...
3,Public Space,Fraud,Afternoon,5.175382,5080.531113,A burglary occurred in a public space area. Th...
4,Rural,Assault,Afternoon,9.182982,4548.562446,A vandalism occurred in a industrial area. The...
...,...,...,...,...,...,...
495,Public Space,Assault,Morning,3.034261,6016.915999,A assault occurred in a industrial area. The i...
496,Commercial,Assault,Morning,6.764284,8674.346773,A theft occurred in a industrial area. The inc...
497,Industrial,Assault,Night,9.810830,9472.663916,A theft occurred in a rural area. The incident...
498,Industrial,Vandalism,Afternoon,6.431438,1148.467464,A theft occurred in a rural area. The incident...


In [3]:
# Initialize advanced embedding analyzer
embedding_analyzer = AdvancedCrimeEmbedding()

# Generate embeddings for all events
event_embeddings = [
    embedding_analyzer.generate_embedding(row) 
    for _, row in crime_df.iterrows()
]

In [4]:
event_embeddings[0].shape

torch.Size([389])

In [5]:
# Select a target event for similarity analysis
target_event_idx = 42  # Example target event index
target_event = event_embeddings[target_event_idx]

# Find similar events
similar_events = embedding_analyzer.find_similar_events(
    target_event, 
    event_embeddings, 
    top_n=5, 
    threshold=0.6
)

# Print analysis results
print("\nTarget Event Details:")
print(crime_df.iloc[target_event_idx])

print("\nSimilar Events:")
for idx, similarity in similar_events:
    print(f"\nEvent Index: {idx}")
    print(f"Similarity Score: {similarity:.4f}")
    print(crime_df.iloc[idx])


Target Event Details:
location_type                                                  Rural
crime_type                                                  Burglary
time_of_day                                                  Evening
severity_score                                              5.546861
economic_impact                                          8929.534234
narrative          A assault occurred in a public space area. The...
Name: 42, dtype: object

Similar Events:

Event Index: 42
Similarity Score: 1.0000
location_type                                                  Rural
crime_type                                                  Burglary
time_of_day                                                  Evening
severity_score                                              5.546861
economic_impact                                          8929.534234
narrative          A assault occurred in a public space area. The...
Name: 42, dtype: object

Event Index: 65
Similarity Score: 1.0000


In [6]:
import joblib

# Assuming `embedding_analyzer` is an instance of AdvancedCrimeEmbedding
# Save the categorical encoder
joblib.dump(embedding_analyzer.categorical_encoder, 'categorical_encoder.pkl')

['categorical_encoder.pkl']

In [12]:
import pandas as pd
import torch

# Load the saved categorical encoder
loaded_encoder = joblib.load('categorical_encoder.pkl')

# Example new data point
new_crime_event = {
    'location_type': 'Commercial',
    'crime_type': 'Theft',
    'time_of_day': 'Night',
    'severity_score': 7.5,
    'economic_impact': 5000.0,
    'narrative': 'A theft occurred in a commercial area during the night.'
}

# Convert the new data point to a DataFrame
new_data_df = pd.DataFrame([new_crime_event])

# Transform the categorical features using the loaded encoder
categorical_features = new_data_df[['location_type', 'crime_type', 'time_of_day', 'severity_score', 'economic_impact']]
categorical_features_transformed = loaded_encoder.transform(categorical_features)

# Convert to torch tensor
categorical_tensor = torch.tensor(categorical_features_transformed, dtype=torch.float32)

# Generate text embedding using the existing method
narrative_embedding = embedding_analyzer._get_text_embedding(new_crime_event['narrative'])

# Combine text and categorical embeddings
combined_embedding = torch.cat([
    torch.tensor(narrative_embedding, dtype=torch.float32),
    categorical_tensor.flatten()
])

print("Generated Embedding for New Data Point:", combined_embedding)

Generated Embedding for New Data Point: tensor([-1.3883e-02,  9.1414e-02, -7.1680e-03,  3.3354e-03,  8.5866e-02,
         3.3370e-02,  8.1727e-02,  3.4538e-02,  9.2770e-02, -4.1747e-02,
         4.7878e-02,  3.2282e-02,  4.2338e-02,  1.0754e-03,  2.0571e-02,
        -1.1711e-01,  6.3691e-02,  1.8742e-02,  1.6135e-02,  1.7136e-02,
         5.9727e-02, -5.0332e-03,  2.3877e-02,  3.2599e-02,  2.8707e-02,
         6.7478e-03,  4.1970e-02,  4.2005e-02, -1.1298e-02, -4.7583e-02,
        -5.4207e-02,  5.2447e-02,  8.1979e-02,  4.1740e-02,  4.7029e-02,
        -6.8248e-02, -9.5442e-04, -1.9317e-02,  4.6392e-02,  2.3006e-02,
         3.5535e-02, -1.0715e-01, -1.4805e-02, -8.5666e-02, -7.7656e-02,
         4.2707e-02,  6.0438e-02,  5.6861e-02,  5.0609e-02,  5.0129e-03,
         8.2626e-03,  5.8175e-02, -4.7588e-02,  4.2784e-02, -5.7733e-02,
        -4.9842e-02,  9.6686e-02,  1.1533e-02,  9.9899e-02,  1.4733e-02,
         8.6500e-02,  6.6589e-02,  3.0043e-02,  8.0688e-02,  6.5337e-02,
         5.

In [13]:
categorical_tensor

tensor([[0.0000e+00, 0.0000e+00, 1.0000e+00, 3.2797e+00, 3.3672e+03]])