In [34]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

class CrimeEventEmbeddingAnalyzer(nn.Module):
    
    def __init__(self, embedding_dim=50):
        """
        Comprehensive Crime Event Embedding Analyzer
        
        Args:
            embedding_dim: Dimensionality of embedding vector
        """
        super(CrimeEventEmbeddingAnalyzer, self).__init__() 
        
        # Feature encoding dictionaries
        self.encoders = {}
        self.embedding_dim = embedding_dim
        
        # Embedding layers for different feature types
        self.embeddings = nn.ModuleDict({
            'location_type': nn.Embedding(100, embedding_dim),
            'crime_type': nn.Embedding(100, embedding_dim),
            'time_category': nn.Embedding(24, embedding_dim),
            'weapon_type': nn.Embedding(50, embedding_dim),
            'suspect_age_group': nn.Embedding(10, embedding_dim)
        })
        
        # Weights for different feature importance
        self.feature_weights = {
            'location_type': 1.0,
            'crime_type': 1.5,
            'time_category': 0.7,
            'weapon_type': 1.2,
            'suspect_age_group': 0.5
        }
    
    def prepare_dataset(self, df):
        """
        Prepare the dataset by encoding categorical features
        
        Args:
            df: Input DataFrame with crime event features
        
        Returns:
            Processed DataFrame with encoded features
        """
        # Create label encoders for categorical features
        encoded_df = df.copy()
        
        # Encode categorical features
        categorical_features = [
            'location_type', 'crime_type', 'time_category', 
            'weapon_type', 'suspect_age_group'
        ]
        
        for feature in categorical_features:
            # Create or retrieve label encoder
            if feature not in self.encoders:
                self.encoders[feature] = LabelEncoder()
            
            # Fit and transform the feature
            encoded_df[f'{feature}_encoded'] = self.encoders[feature].fit_transform(
                encoded_df[feature].fillna('Unknown')
            )
        
        return encoded_df
    
    def create_event_embedding(self, event_row):
        """
        Create embedding for a single crime event
        
        Args:
            event_row: Encoded event features
        
        Returns:
            Combined event embedding vector
        """
        feature_embeddings = []
        
        for feature, embedding_layer in self.embeddings.items():
            # Get encoded feature value
            encoded_feature = event_row[f'{feature}_encoded']
            
            # Create embedding
            feature_emb = embedding_layer(torch.tensor(encoded_feature))
            
            # Apply feature weight
            weighted_emb = feature_emb * self.feature_weights[feature]
            feature_embeddings.append(weighted_emb)
        
        # Combine embeddings
        return torch.mean(torch.stack(feature_embeddings), dim=0)
    
    def calculate_similarity(self, embedding1, embedding2, method='cosine'):
        """
        Calculate similarity between two event embeddings
        
        Args:
            embedding1: First event embedding
            embedding2: Second event embedding
            method: Similarity calculation method
        
        Returns:
            Similarity score
        """
        if method == 'cosine':
            return torch.nn.functional.cosine_similarity(
                embedding1.unsqueeze(0), 
                embedding2.unsqueeze(0)
            ).item()
        elif method == 'euclidean':
            return -torch.dist(embedding1, embedding2).item()
        else:
            raise ValueError("Unsupported similarity method")
    
    def find_similar_events(self, target_event, event_embeddings, top_n=5, threshold=0.7):
        """
        Find most similar events to a target event
        
        Args:
            target_event: Embedding of the target event
            event_embeddings: List of event embeddings to compare
            top_n: Number of top similar events to return
            threshold: Minimum similarity threshold
        
        Returns:
            List of most similar events
        """
        # Calculate similarities
        similarities = [
            (i, self.calculate_similarity(target_event, emb)) 
            for i, emb in enumerate(event_embeddings)
        ]
        
        # Sort by similarity (descending)
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Filter by threshold and return top N
        return [
            (idx, sim) for idx, sim in similarities[:top_n] 
            if sim > threshold
        ]

    def prepare_dataset2(self, df):
        """
        Prepare the dataset by encoding categorical features
        
        Args:
            df: Input DataFrame with crime event features
        
        Returns:
            Processed DataFrame with encoded features
        """
        # Create a copy of the DataFrame to avoid modifying the original
        encoded_df = df.copy()
        
        # List of categorical features
        categorical_features = [
            'location_type', 'crime_type', 'time_category', 
            'weapon_type', 'suspect_age_group'
        ]
        
        for feature in categorical_features:
            # Use existing label encoder
            if feature in self.encoders:
                # Transform the feature
                encoded_df[f'{feature}_encoded'] = self.encoders[feature].transform(
                    encoded_df[feature].fillna('Unknown')
                )
            else:
                raise ValueError(f"Encoder for feature '{feature}' not found.")
        
        return encoded_df


def generate_synthetic_crime_dataset(n_samples=1000):
    """
    Generate a synthetic crime dataset
    
    Args:
        n_samples: Number of crime events to generate
    
    Returns:
        Pandas DataFrame with crime event features
    """
    np.random.seed(42)
    
    # Possible categorical values
    location_types = ['Residential', 'Commercial', 'Public Space', 'Industrial', 'Rural']
    crime_types = ['Theft', 'Assault', 'Burglary', 'Fraud', 'Vandalism']
    time_categories = [f'{h:02d}:00' for h in range(24)]
    weapon_types = ['Firearm', 'Knife', 'Blunt Object', 'None']
    age_groups = ['Juvenile', 'Young Adult', 'Adult', 'Senior']
    
    # Generate synthetic data
    data = {
        'location_type': np.random.choice(location_types, n_samples),
        'crime_type': np.random.choice(crime_types, n_samples),
        'time_category': np.random.choice(time_categories, n_samples),
        'weapon_type': np.random.choice(weapon_types, n_samples),
        'suspect_age_group': np.random.choice(age_groups, n_samples),
        'severity_score': np.random.uniform(1, 10, n_samples)
    }
    
    return pd.DataFrame(data)



# def main():
#     # Generate synthetic crime dataset
#     crime_df = generate_synthetic_crime_dataset(n_samples=1000)
    
#     # Initialize embedding analyzer
#     analyzer = CrimeEventEmbeddingAnalyzer()
    
#     # Prepare dataset
#     encoded_df = analyzer.prepare_dataset(crime_df)
    
#     # Create embeddings for all events
#     event_embeddings = [
#         analyzer.create_event_embedding(row) 
#         for _, row in encoded_df.iterrows()
#     ]
    

In [35]:
crime_df = generate_synthetic_crime_dataset(n_samples=1000)
crime_df

Unnamed: 0,location_type,crime_type,time_category,weapon_type,suspect_age_group,severity_score
0,Industrial,Fraud,06:00,Firearm,Juvenile,5.920167
1,Rural,Fraud,10:00,Blunt Object,Juvenile,6.178684
2,Public Space,Fraud,09:00,,Senior,1.199273
3,Rural,Fraud,08:00,Firearm,Senior,9.506028
4,Rural,Vandalism,13:00,Firearm,Senior,7.269645
...,...,...,...,...,...,...
995,Commercial,Theft,20:00,Knife,Juvenile,7.084945
996,Residential,Vandalism,20:00,Blunt Object,Juvenile,3.765151
997,Residential,Theft,04:00,Firearm,Young Adult,4.330221
998,Industrial,Vandalism,15:00,Blunt Object,Young Adult,5.954246


In [36]:
# Initialize embedding analyzer
analyzer = CrimeEventEmbeddingAnalyzer()

# Prepare dataset
encoded_df = analyzer.prepare_dataset(crime_df)

encoded_df

Unnamed: 0,location_type,crime_type,time_category,weapon_type,suspect_age_group,severity_score,location_type_encoded,crime_type_encoded,time_category_encoded,weapon_type_encoded,suspect_age_group_encoded
0,Industrial,Fraud,06:00,Firearm,Juvenile,5.920167,1,2,6,1,1
1,Rural,Fraud,10:00,Blunt Object,Juvenile,6.178684,4,2,10,0,1
2,Public Space,Fraud,09:00,,Senior,1.199273,2,2,9,3,2
3,Rural,Fraud,08:00,Firearm,Senior,9.506028,4,2,8,1,2
4,Rural,Vandalism,13:00,Firearm,Senior,7.269645,4,4,13,1,2
...,...,...,...,...,...,...,...,...,...,...,...
995,Commercial,Theft,20:00,Knife,Juvenile,7.084945,0,3,20,2,1
996,Residential,Vandalism,20:00,Blunt Object,Juvenile,3.765151,3,4,20,0,1
997,Residential,Theft,04:00,Firearm,Young Adult,4.330221,3,3,4,1,3
998,Industrial,Vandalism,15:00,Blunt Object,Young Adult,5.954246,1,4,15,0,3


In [37]:
import pickle

# Save the encoders
with open('encoders.pkl', 'wb') as f:
    pickle.dump(analyzer.encoders, f)

In [38]:
event_embeddings = [
    analyzer.create_event_embedding(row) 
    for _, row in encoded_df.iterrows()
]

event_embeddings

[tensor([ 0.9021,  0.8101,  0.0451, -0.1630,  0.1863,  0.2542,  1.1546,  1.0854,
         -0.3779, -0.0551,  1.0901, -0.5634, -0.0315,  0.7961,  0.1685,  0.2091,
          1.0128, -0.5644,  0.9004, -0.0483,  0.0407, -0.2658,  0.0312,  0.0220,
          0.7397,  1.0580, -0.0248,  0.1590,  0.7005, -0.0071,  0.4448,  0.4533,
         -0.2761, -0.2910,  0.2247,  0.3046,  0.2789, -0.2557,  0.6244, -0.8657,
         -0.5080,  0.3003, -1.0882, -0.0333, -0.1860, -0.2028, -0.0507,  0.4722,
         -0.4845, -1.0096], grad_fn=<MeanBackward1>),
 tensor([ 0.4994,  0.2940, -0.0811, -0.1730,  0.4535,  0.4744,  0.4125,  0.9637,
          0.2276, -0.3479,  0.1850, -0.0771,  0.0044,  1.1011,  0.1031, -0.0637,
          0.4217, -0.2755,  0.2346,  0.6881, -0.2455, -0.3760, -0.2813,  0.1488,
          0.0396,  0.8378, -0.0331, -0.1416,  1.2996,  0.1187,  0.0751, -0.1720,
          0.1930, -0.4864, -0.1440,  0.7692,  0.1095, -0.1993,  0.8002, -1.0886,
         -0.1045,  0.4317, -0.6427,  0.0807,  0.0401, -

In [43]:
new_data = pd.DataFrame({
    'location_type': ['Industrial'],
    'crime_type': ['Theft'],
    'time_category': ['14:00'],
    'weapon_type': ['None'],
    'suspect_age_group': ['Young Adult']
})

# Preprocess the new data
encoded_new_data = analyzer.prepare_dataset2(new_data)
display(encoded_new_data)

# Generate embeddings for the new data
new_event_embedding = analyzer.create_event_embedding(encoded_new_data.iloc[0])

print("New Event Embedding:", new_event_embedding)


Unnamed: 0,location_type,crime_type,time_category,weapon_type,suspect_age_group,location_type_encoded,crime_type_encoded,time_category_encoded,weapon_type_encoded,suspect_age_group_encoded
0,Industrial,Theft,14:00,,Young Adult,1,3,14,3,3


New Event Embedding: tensor([ 0.2792,  0.5321, -0.1854, -0.0759, -0.3394, -0.9840,  0.1008, -0.1406,
         0.5775,  0.0816,  0.7535,  0.2499, -0.3340,  0.3945, -0.2237, -0.3272,
        -0.2527, -0.6540,  0.1481,  0.3388, -0.3825, -0.0306, -0.8725, -0.5161,
         0.3664,  0.2707,  0.3822,  0.6548, -0.0777, -0.8108,  0.3822, -0.5612,
        -0.0167,  0.2017, -0.2629,  0.8099,  0.2348,  0.2037,  0.8904,  0.4970,
        -0.2693, -0.4066, -0.9623,  0.1483,  0.9510,  0.7209,  1.0128,  0.4150,
         0.1992, -0.2561], grad_fn=<MeanBackward1>)


In [44]:
# Select a target event for similarity analysis
target_event_idx = 42  # Example target event index
target_event = event_embeddings[target_event_idx]

# Find similar events
similar_events = analyzer.find_similar_events(
    target_event, 
    event_embeddings, 
    top_n=5, 
    threshold=0.6
)

# Print analysis results
print("\nTarget Event Details:")
print(crime_df.iloc[target_event_idx])

print("\nSimilar Events:")
for idx, similarity in similar_events:
    print(f"\nEvent Index: {idx}")
    print(f"Similarity Score: {similarity:.4f}")
    print(crime_df.iloc[idx])


Target Event Details:
location_type          Industrial
crime_type               Burglary
time_category               16:00
weapon_type          Blunt Object
suspect_age_group     Young Adult
severity_score           2.871607
Name: 42, dtype: object

Similar Events:

Event Index: 42
Similarity Score: 1.0000
location_type          Industrial
crime_type               Burglary
time_category               16:00
weapon_type          Blunt Object
suspect_age_group     Young Adult
severity_score           2.871607
Name: 42, dtype: object

Event Index: 880
Similarity Score: 0.9476
location_type          Industrial
crime_type               Burglary
time_category               02:00
weapon_type          Blunt Object
suspect_age_group     Young Adult
severity_score           6.230759
Name: 880, dtype: object

Event Index: 765
Similarity Score: 0.9275
location_type          Industrial
crime_type               Burglary
time_category               17:00
weapon_type          Blunt Object
suspect_age

In [45]:
# Save the model's state_dict
torch.save(analyzer.state_dict(), 'crime_event_embedding_analyzer.pth')

In [46]:
# Load the model's state_dict
analyzer = CrimeEventEmbeddingAnalyzer()
analyzer.load_state_dict(torch.load('crime_event_embedding_analyzer.pth'))
analyzer.eval()  # Set the model to evaluation mode

  analyzer.load_state_dict(torch.load('crime_event_embedding_analyzer.pth'))


CrimeEventEmbeddingAnalyzer(
  (embeddings): ModuleDict(
    (location_type): Embedding(100, 50)
    (crime_type): Embedding(100, 50)
    (time_category): Embedding(24, 50)
    (weapon_type): Embedding(50, 50)
    (suspect_age_group): Embedding(10, 50)
  )
)

In [47]:
# Load the encoders
with open('encoders.pkl', 'rb') as f:
    analyzer.encoders = pickle.load(f)

In [48]:
new_data = pd.DataFrame({
    'location_type': ['Industrial'],
    'crime_type': ['Theft'],
    'time_category': ['14:00'],
    'weapon_type': ['None'],
    'suspect_age_group': ['Young Adult']
})

# Preprocess the new data
encoded_new_data = analyzer.prepare_dataset2(new_data)
encoded_new_data

# Generate embeddings for the new data
new_event_embedding = analyzer.create_event_embedding(encoded_new_data.iloc[0])

print("New Event Embedding:", new_event_embedding)


New Event Embedding: tensor([ 0.2792,  0.5321, -0.1854, -0.0759, -0.3394, -0.9840,  0.1008, -0.1406,
         0.5775,  0.0816,  0.7535,  0.2499, -0.3340,  0.3945, -0.2237, -0.3272,
        -0.2527, -0.6540,  0.1481,  0.3388, -0.3825, -0.0306, -0.8725, -0.5161,
         0.3664,  0.2707,  0.3822,  0.6548, -0.0777, -0.8108,  0.3822, -0.5612,
        -0.0167,  0.2017, -0.2629,  0.8099,  0.2348,  0.2037,  0.8904,  0.4970,
        -0.2693, -0.4066, -0.9623,  0.1483,  0.9510,  0.7209,  1.0128,  0.4150,
         0.1992, -0.2561], grad_fn=<MeanBackward1>)
