In [1]:
# Install required packages and the small English spaCy model.
# The leading "!" runs each line as a shell command from the notebook; the
# captured output further down notes that a runtime restart may be needed
# before newly installed dependencies can be loaded.
!pip install spacy PyPDF2 scikit-learn folium
!python -m spacy download en_core_web_sm

import zlib
from collections import defaultdict
from html import escape
from io import BytesIO

import folium
import ipywidgets as widgets
import numpy as np
import spacy
from IPython.display import display, HTML
from PyPDF2 import PdfReader
from sklearn.cluster import KMeans

# Load the spaCy model (the "en_core_web_sm" pipeline downloaded above).
# analyze_report() runs every report through it and reads the resulting
# named entities from doc.ents.
nlp = spacy.load("en_core_web_sm")

# Keywords used to label a report with a crime type.
# Order matters: analyze_report() reports the first entry of this list that
# occurs in the lower-cased report text.
crime_keywords = [
    "robbery",
    "assault",
    "burglary",
    "theft",
    "homicide",
    "murder",
    "shooting",
    "stabbing",
    "kidnapping",
    "vandalism",
    "fraud",
    "drug",
    "arrest",
    "violence",
    "domestic",
    "abuse",
]

# Upload widget: the file picker is filtered to .pdf and several reports may
# be selected in one go. process_reports() later reads the files from
# uploader.value.
uploader = widgets.FileUpload(accept=".pdf", multiple=True, description="Upload PDF Reports")
display(uploader)

def process_pdf(file_bytes):
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: Raw PDF content as a bytes-like object.

    Returns:
        str: The text of all pages, joined with newlines. A page for which
        no text could be extracted contributes an empty string.
    """
    with BytesIO(file_bytes) as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Pages are read while the buffer is still open. A falsy result
        # (no text on the page) is normalised to "" so it cannot break the
        # join below.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Separate pages with a newline: plain concatenation would fuse the last
    # word of one page with the first word of the next and confuse the
    # downstream entity recognition.
    return "\n".join(page_texts)

def analyze_report(report_text):
    """Run the spaCy pipeline over one report and collect the fields of interest.

    Args:
        report_text: Full text of a single report.

    Returns:
        dict with the keys:
            "entities": list of (text, label) pairs for every entity found,
            "crime_type": first entry of ``crime_keywords`` that occurs as a
                substring of the lower-cased text, or None if none does,
            "locations": texts of entities labelled GPE, LOC or FAC,
            "suspects": texts of entities labelled PERSON,
            "dates": texts of entities labelled DATE,
            "original_text": the unmodified input text.
    """
    doc = nlp(report_text)

    def texts_labelled(*wanted):
        # Entity texts, in document order, whose label is one of `wanted`.
        return [ent.text for ent in doc.ents if ent.label_ in wanted]

    # First keyword (in list order, not text order) present in the report.
    lowered = report_text.lower()
    crime_type = None
    for keyword in crime_keywords:
        if keyword in lowered:
            crime_type = keyword
            break

    return {
        "entities": [(ent.text, ent.label_) for ent in doc.ents],
        "crime_type": crime_type,
        "locations": texts_labelled("GPE", "LOC", "FAC"),
        "suspects": texts_labelled("PERSON"),
        "dates": texts_labelled("DATE"),
        "original_text": report_text,
    }

def geocode_location(location_name):
    """Mock geocoder: map a place name to a repeatable point near New York City.

    This is a stand-in only; a real implementation would query a geocoding
    service (Google Maps API or similar).

    Args:
        location_name: Name of the place to "geocode".

    Returns:
        tuple: (latitude, longitude). The point is the NYC base coordinate
        shifted by an offset in [0, 0.01) derived from the name, so the same
        name always yields the same point.
    """
    base_lat, base_lon = 40.7128, -74.0060  # Default to NYC coordinates

    # The built-in hash() of a str is salted per interpreter process, so it
    # would place the same location at a different spot in every session.
    # CRC32 of the UTF-8 bytes is stable across runs and machines.
    digest = zlib.crc32(str(location_name).encode("utf-8"))
    variation = (digest % 1000) / 100000
    return (base_lat + variation, base_lon - variation)

def process_reports(uploader):
    """Process all uploaded reports.

    Args:
        uploader: Upload widget whose ``value`` holds the uploaded files.
            Both layouts are accepted: a dict mapping file name to a file
            record (ipywidgets 7) and a sequence of file records
            (ipywidgets 8). Each record must provide the raw bytes under
            the ``'content'`` key.

    Returns:
        tuple: ``(extracted_data, location_coordinates)`` where
        ``extracted_data`` is a list with one analyze_report() result per
        file, in upload order, and ``location_coordinates`` maps each
        distinct location name to its geocoded (lat, lon) pair.
    """
    extracted_data = []
    location_coordinates = {}

    uploaded = uploader.value
    # A dict has no per-file records at the top level, a tuple has no
    # .items(); normalise both to a plain iterable of file records.
    file_infos = uploaded.values() if isinstance(uploaded, dict) else uploaded

    for file_info in file_infos:
        report_text = process_pdf(file_info['content'])
        analysis = analyze_report(report_text)
        extracted_data.append(analysis)

        # Geocode each location once, even if it appears in several reports.
        for location in analysis['locations']:
            if location not in location_coordinates:
                location_coordinates[location] = geocode_location(location)

    return extracted_data, location_coordinates

def _render_reports_html(extracted_data):
    """Build the HTML summary (stylesheet plus one card per report)."""

    def listing(values):
        # Escape after joining; show 'None' when the list is empty.
        return escape(", ".join(values)) or "None"

    parts = ["""
    <style>
        .report { border: 1px solid #ddd; padding: 10px; margin-bottom: 20px; border-radius: 5px; }
        .entities { display: flex; flex-wrap: wrap; gap: 10px; margin: 10px 0; }
        .entity { background: #f0f0f0; padding: 5px 10px; border-radius: 3px; }
        .crime-type { font-weight: bold; color: #d32f2f; }
        .location { color: #1976d2; }
        .suspect { color: #388e3c; }
        .date { color: #f57c00; }
        table { border-collapse: collapse; width: 100%; margin: 10px 0; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
    </style>
    <h2>Crime Report Analysis Results</h2>
    """]

    for index, report in enumerate(extracted_data, 1):
        crime_type = escape(report['crime_type'] or 'Unknown')
        locations = listing(report['locations'])
        suspects = listing(report['suspects'])
        dates = listing(report['dates'])

        parts.append(f"""
        <div class="report">
            <h3>Report #{index}</h3>
            <div><strong>Crime Type:</strong> <span class="crime-type">{crime_type}</span></div>

            <h4>Entities:</h4>
            <div class="entities">
                <div class="entity location"><strong>Locations:</strong> {locations}</div>
                <div class="entity suspect"><strong>Suspects:</strong> {suspects}</div>
                <div class="entity date"><strong>Dates:</strong> {dates}</div>
            </div>

            <h4>All Entities:</h4>
            <table>
                <tr><th>Entity</th><th>Label</th></tr>
        """)

        # Entity strings come straight from the uploaded PDFs, so they are
        # escaped before being placed into markup.
        parts.extend(
            f"<tr><td>{escape(entity)}</td><td>{escape(label)}</td></tr>"
            for entity, label in report['entities']
        )

        parts.append("""
            </table>

            <details>
                <summary>View Original Text</summary>
                <div style="white-space: pre-wrap; background: #f9f9f9; padding: 10px; margin-top: 10px;">{}</div>
            </details>
        </div>
        """.format(escape(report['original_text'])))

    return "".join(parts)


def _build_hotspot_map(location_coordinates):
    """Create a folium map with a circle per location and a marker per K-means cluster centre."""
    coords_array = np.array(list(location_coordinates.values()))

    # Roughly one cluster per three locations, at least 1 and at most 5.
    n_clusters = min(5, max(1, len(coords_array) // 3))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(coords_array)

    # Centre the map on the mean of all points.
    map_center = [np.mean(coords_array[:, 0]), np.mean(coords_array[:, 1])]
    crime_map = folium.Map(location=map_center, zoom_start=14)

    # Add crime locations. Rows of coords_array follow the dict's own order,
    # so zipping with the keys pairs every point with its name.
    for (lat, lon), loc_name in zip(coords_array, location_coordinates):
        folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            color='blue',
            fill=True,
            # NOTE(review): folium appears to insert popup strings as HTML by
            # default, hence the escaping of PDF-derived names; confirm.
            popup=escape(loc_name)
        ).add_to(crime_map)

    # Add hotspots (cluster centres).
    for index, center in enumerate(kmeans.cluster_centers_, 1):
        folium.Marker(
            location=center,
            icon=folium.Icon(color='red', icon='fire'),
            popup=f'Hotspot #{index}'
        ).add_to(crime_map)

    return crime_map


def display_results(extracted_data, location_coordinates):
    """Display analysis results: an HTML summary followed by a hotspot map.

    Args:
        extracted_data: List of report dicts as returned by analyze_report()
            (keys 'crime_type', 'locations', 'suspects', 'dates', 'entities',
            'original_text').
        location_coordinates: Mapping of location name to a (lat, lon) pair.

    Returns:
        None. The summary and map are shown with display(); when at least
        one location exists the map is also written to
        "crime_hotspots.html" in the working directory, otherwise a notice
        is printed instead.
    """
    display(HTML(_render_reports_html(extracted_data)))

    # Generate hotspot map only if we have locations to cluster.
    if not location_coordinates:
        print("No location coordinates were found for hotspot prediction.")
        return

    crime_map = _build_hotspot_map(location_coordinates)

    # Display map in the notebook, then keep a standalone copy on disk.
    display(crime_map)
    crime_map.save("crime_hotspots.html")
    print("Crime hotspot map saved as 'crime_hotspots.html'")

# Button to process reports, plus an Output widget: the click handler runs
# inside `with output:` so its prints and displays are shown there.
process_btn = widgets.Button(description="Analyze Reports")
output = widgets.Output()

def on_process_click(b):
    """Click handler for the analyze button.

    Clears the output area, then either asks the user to upload files (when
    nothing has been uploaded yet) or processes the uploaded reports and
    displays the results. Everything is rendered inside ``output``.

    Args:
        b: The clicked button, as passed by the widget callback; unused.
    """
    with output:
        output.clear_output()

        if uploader.value:
            print("Processing reports...")
            reports, coordinates = process_reports(uploader)
            display_results(reports, coordinates)
        else:
            print("Please upload PDF files first")

# Register the click handler, then render the button followed by the output
# area that receives the handler's results.
process_btn.on_click(on_process_click)
display(process_btn)
display(output)

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 3.6 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 76.2 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


FileUpload(value={}, accept='.pdf', description='Upload PDF Reports', multiple=True)

Button(description='Analyze Reports', style=ButtonStyle())

Output()