# Example: Security Data

This notebook demonstrates the keyword extraction and topic modeling functions of tactic. Please note that the similarity-based designator matching is still in beta and subject to further improvement.
This notebook applies clustering to a synthetic dataset of mock security incident reports from a company. 

## Import Section

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
from clustering_pipeline import ClusteringPipeline
from importlib import resources


## Load Data

In [None]:

# Location of the example dataset that ships inside the package.
PACKAGE_NAME = 'tactic.example_data'
RESOURCE_NAME = 'mock_security_incidents.csv'
# Encoding used to decode the CSV. It is applied when the resource is opened:
# the handle given to pandas is already a text stream, so the decoding has to
# happen at open() time rather than inside read_csv().
RESOURCE_ENCODING = 'latin1'

# Read the packaged CSV through importlib.resources so the notebook does not
# depend on the current working directory.
resource = resources.files(PACKAGE_NAME).joinpath(RESOURCE_NAME)
with resource.open('r', encoding=RESOURCE_ENCODING) as f:
    df = pd.read_csv(f)

print(f"Loaded {len(df)} records")
print(f"Columns: {df.columns.tolist()}")

# Cluster



### Overview

The workflow performs the following steps:
 
### Initial data exploration
   - Inspect incident type distribution
   - Inspect designator distribution
   - View sample incidents



In [None]:
# Frequency table for each categorical label column, printed under its heading.
distribution_sections = (
    ("\nIncident Type Distribution:", 'IncidentType'),
    ("\n\nDesignator Distribution:", 'Designator'),
)
for heading, column in distribution_sections:
    print(heading)
    print(df[column].value_counts())

# First ten rows, restricted to the identifying and text columns.
preview_columns = ['IncidentID', 'IncidentType', 'Designator', 'Narrative']
print("\n\nSample Incidents:")
print(df[preview_columns].head(10))



# Build the clustering pipeline on the free-text 'Narrative' column.
# ClusteringPipeline is already imported in the Import Section at the top of
# the notebook, so it is not re-imported here.
pipeline = ClusteringPipeline(df, text_column='Narrative')
print("✓ Pipeline initialized")


### Preprocessing
   - Clean text
   - Apply custom stopwords
   - Optionally remove low-IDF terms



In [None]:

print("Preprocessing text data...")

# Options forwarded to ClusteringPipeline.preprocess_data().
# NOTE(review): low_idf / idf_threshold presumably drop terms whose IDF falls
# below the threshold - confirm against the pipeline implementation.
preprocess_options = {
    'custom_stopwords': ['incident', 'reported', 'employee', 'found'],
    'low_idf': True,
    'idf_threshold': 1.4,
}
pipeline.preprocess_data(**preprocess_options)

print("✓ Preprocessing complete")


### Clustering and Keyword Extraction
   - Cluster incidents with UMAP + HDBSCAN using fixed parameters
   - Extract top keywords per cluster (TF-IDF and YAKE)


In [None]:


section_rule = "=" * 60
print("\nMethod 1: Clustering with Keyword Extraction")
print(section_rule)

# Fixed clustering and keyword-extraction parameters; random_state is pinned
# so repeated runs use the same seed.
keyword_clustering_options = {
    'min_cluster_size': 15,
    'n_neighbors': 20,
    'n_components': 2,
    'tf_top_n': 5,
    'yake_top_n': 10,
    'yake_final_n': 5,
    'random_state': 42,
}
results = pipeline.cluster_and_extract_keywords(**keyword_clustering_options)

print("\nCluster Distribution:")
print(results['cluster_summary'])

# Only the first 15 keyword rows are shown to keep the output short.
print("\n\nKeywords by Cluster:")
top_keyword_rows = results['keywords'].head(15)
print(top_keyword_rows)



### Cluster Composition Analysis
   - Inspect incident types per cluster
   - Calculate percentages per cluster


In [None]:

df_with_clusters = results['dataframe']

print("\nCluster Composition (Incident Types per Cluster):")
print("="*60)

# Label -1 is excluded from the report (presumably the HDBSCAN noise label).
reported_clusters = [
    label
    for label in sorted(df_with_clusters['Clusters'].unique())
    if label != -1
]

for cluster_id in reported_clusters:
    in_cluster = df_with_clusters['Clusters'] == cluster_id
    members = df_with_clusters[in_cluster]
    cluster_size = len(members)
    print(f"\nCluster {cluster_id} ({cluster_size} incidents):")

    # Share of each incident type within this cluster, most frequent first.
    for incident_type, count in members['IncidentType'].value_counts().items():
        share = (count / cluster_size) * 100
        print(f"  {incident_type}: {count} ({share:.1f}%)")




### LDA Topic Modeling
   - Initialize a separate pipeline for LDA analysis
   - Define a dictionary of designators for topic mapping
   - Cluster and analyze topics




In [None]:

# Separate pipeline instance for the LDA analysis, preprocessed with custom
# stopwords only (no low-IDF options are passed here).
lda_stopwords = ['incident', 'reported', 'employee', 'found']
pipeline_lda = ClusteringPipeline(df, text_column='Narrative')
pipeline_lda.preprocess_data(custom_stopwords=lda_stopwords)

print("✓ New pipeline initialized for LDA analysis")

# Reference vocabulary per designator: each value is a space-separated bag of
# words that is later handed to the topic analysis for designator matching.
designators = dict(
    THEFT='stolen value valued room approximately suspect item missing laptop loss breakin multiple total warehouse phone took incident equipment shoplifting location apprehended retail personal estimated worth',
    VANDALISM='floor cleanup repair building estimated camera damage elevator buttons damaged smashed required covered area hallway extinguisher powder discharged shattered glass broken office crew windows dispatched',
    ACCESS='access area badge employee unauthorized security building denied attempted visitor individual restricted main lost identified wandering entry deactivated notified used floor vehicle supervisor team open',
    SUSPICIOUS='individual security suspicious parking lot person near left floor observed package activity reported building questions behavior asking concerning lobby systems detailed monitored loading dock minutes',
    VIOLENCE='security employee threatening employees parties physical separated break room meeting altercation supervisor statements escorted premises hr notified individual threat verbal broke suspended individuals investigation workplace',
    SAFETY='building area hazard power electrical shut evacuated called leak discovered electrician notified gas floor triggered team hazmat chemical alarm kitchen spill laboratory given clear smoke',
    POLICY='employee policy violation company written area sent used unauthorized vehicle personal logged fleet use issued near parties entrance prohibited detected control sharing smoking building badge',
    CYBER='employee phishing reported security suspicious provided detected password fell reset forced scam credentials team attempt engineering social request phone passwords quarantined workstation computer malware department',
    MEDICAL='employee care urgent supervisor drove dizziness administered reported severe called headache treated room epipen medical meeting reaction having fainted onsite stable paramedics arrived conference injured',
    FRAUD='control documents reports inspection forged discovered falsified quality charges fraud',
)

# Echo the mapping so the reader can see what the topics are compared against.
print("Designators defined:")
for designator_name, vocabulary in designators.items():
    print(f"  {designator_name}: {vocabulary}")

print("\nMethod 2: Clustering with LDA Topic Analysis")
print("=" * 60)

# Settings for the combined clustering + LDA run; the designator vocabulary is
# passed along so topics can be matched against it.
lda_run_options = {
    'min_cluster_size': 15,
    'num_topics': 10,
    'passes': 15,
    'designators': designators,
}
topic_results = pipeline_lda.cluster_and_analyze_topics(**lda_run_options)

print("\nCluster Summary:")
print(topic_results['cluster_summary'])




### View Topics and Cluster-to-Topic Mappings
   - Display top words for each topic
   - Map clusters to dominant topics
   - Match topics to designators

In [None]:

lda_model = topic_results['topics']['model']

# Number of topic ids to walk through. Keep this equal to the `num_topics`
# value passed to cluster_and_analyze_topics() in the LDA cell.
NUM_TOPICS = 10
SECTION_RULE = "=" * 60

# --- Top words per topic ----------------------------------------------------
print("\nLDA Topics and Top Words:")
print(SECTION_RULE)

for topic_id in range(NUM_TOPICS):
    try:
        topic_words = lda_model.lda_model.show_topic(topic_id, topn=10)
        # Only the seven highest-ranked words are displayed.
        words = ", ".join(f"{word}({prob:.2f})" for word, prob in topic_words[:7])
    except Exception as exc:
        # Best effort: a topic that cannot be rendered is reported and skipped
        # instead of being silently dropped. Catching Exception (not a bare
        # except) still lets KeyboardInterrupt stop the cell.
        print(f"\nTopic {topic_id}: skipped ({type(exc).__name__}: {exc})")
        continue
    print(f"\nTopic {topic_id}:")
    print(f"  {words}")


# --- Dominant topics per cluster --------------------------------------------
print("\nCluster to Topic Mappings:")
print(SECTION_RULE)

for cluster_id, topic_ids in topic_results['topics']['cluster_topics'].items():
    # A single topic may arrive as a bare integer. NumPy integers are not
    # instances of `int`, so they are checked separately; either way the value
    # is wrapped in a list so the slice below works.
    if isinstance(topic_ids, int) or pd.api.types.is_integer(topic_ids):
        topic_ids = [topic_ids]

    print(f"\nCluster {cluster_id}:")
    for topic_id in topic_ids[:2]:  # Show top 2 topics
        try:
            topic_words = lda_model.lda_model.show_topic(topic_id, topn=5)
            words = ", ".join(word for word, _ in topic_words)
        except Exception as exc:
            print(f"  Topic {topic_id}: skipped ({type(exc).__name__}: {exc})")
            continue
        print(f"  Topic {topic_id}: {words}")


# --- Topic-to-designator similarity -----------------------------------------
print("\nTopic to Designator Similarity Scores:")
print(SECTION_RULE)

topic_designators = topic_results['topics']['topic_designators']
for topic_id in range(NUM_TOPICS):
    # Topics without any recorded match are omitted from the listing.
    matches = topic_designators.get(topic_id, [])
    if matches:
        print(f"\nTopic {topic_id}:")
        for designator, similarity in matches[:3]:  # Show top 3 matches
            print(f"  {designator}: {similarity:.3f}")



### Visualization
   - Plot cluster distributions across incidents

In [None]:
# Cluster sizes, largest first; the index supplies the tick labels.
cluster_counts = topic_results['cluster_summary']['Size'].sort_values(ascending=False)
bar_positions = range(len(cluster_counts))

# Same bar chart as before, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bar_positions, cluster_counts.values)
ax.set_xlabel('Cluster ID')
ax.set_ylabel('Number of Incidents')
ax.set_title('Incident Distribution Across Clusters')
ax.set_xticks(bar_positions)
ax.set_xticklabels(cluster_counts.index, rotation=45)
fig.tight_layout()
plt.show()

print("✓ Cluster distribution plotted")


### Validation
   - Check cluster purity against known designators

In [None]:
df_final = topic_results['dataframe']

print("\nValidation: Cluster Purity Check")
print("="*60)

# For every cluster except label -1, report the most frequent designator and
# the share of the cluster's rows that carry it ("purity").
for cluster_id in sorted(df_final['Clusters'].unique()):
    if cluster_id == -1:
        continue

    cluster_data = df_final[df_final['Clusters'] == cluster_id]

    # value_counts() sorts by frequency (descending) and drops missing values.
    designator_counts = cluster_data['Designator'].value_counts()

    print(f"\nCluster {cluster_id}:")
    if designator_counts.empty:
        # No labelled rows in this cluster: there is no dominant designator to
        # rank, and indexing position 0 would raise IndexError.
        print("  No designator labels available")
        continue

    # Find dominant designator
    dominant_designator = designator_counts.index[0]
    dominant_count = int(designator_counts.iloc[0])
    purity = (dominant_count / len(cluster_data)) * 100

    # Convert counts to plain Python ints so the printed dict shows bare
    # numbers rather than NumPy scalar reprs.
    all_designators = {name: int(count) for name, count in designator_counts.items()}

    print(f"  Dominant Designator: {dominant_designator}")
    print(f"  Purity: {purity:.1f}% ({dominant_count}/{len(cluster_data)})")
    print(f"  All Designators: {all_designators}")


### Export Results
   - Save clustered dataset
   - Save cluster summary

In [None]:
print("\nExporting Results...")

# Each entry: (frame to write, target file name, extra to_csv options).
# The clustered data is written without its index; the cluster summary keeps
# its index column.
exports = (
    (df_final, 'incidents_with_clusters.csv', {'index': False}),
    (topic_results['cluster_summary'], 'cluster_summary.csv', {}),
)
for frame, filename, csv_options in exports:
    frame.to_csv(filename, **csv_options)
    print(f"✓ Saved: {filename}")

