In [1]:
pip install transformers torch scikit-learn pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Read the crime records from 'crime_data.csv' (resolved against the working directory).
# NOTE(review): the text-processing cell that follows looks for a 'description' column;
# check the preview printed below to see which columns the file really provides.
crime_data = pd.read_csv(
    'crime_data.csv',
)

# Plain-text preview of the first five rows
print(crime_data.head(5))


   Year  Population  Murder  Rape  Robbery  Assault  Burglary  CarTheft
0  1965    18073000     836  2320    28182    27464    183443     58452
1  1966    18258000     882  2439    30098    29142    196127     64368
2  1967    18336000     996  2665    40202    31261    219157     83775
3  1968    18113000    1185  2527    59857    34946    250918    104877
4  1969    18321000    1324  2902    64754    36890    248477    115400


In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Basic text cleaning: lowercase, remove special characters
def clean_text(text):
    """Lowercase ``text`` and delete every character except ``a``-``z`` and whitespace.

    Args:
        text: Raw text to normalise. Anything that is not a ``str`` (for
            example ``None`` or the float ``NaN`` of a missing cell) is
            treated as empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Missing cells reach ``Series.apply`` as None/NaN, which have no ``.lower()``.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Characters are deleted (not replaced by a space), so "don't" becomes "dont".
    return re.sub(r'[^a-z\s]', '', text)

# Text preprocessing only makes sense when the frame carries a 'description' column,
# so handle the missing-column case first and report what is available instead.
if 'description' not in crime_data.columns:
    print("Column 'description' does not exist in crime_data. Available columns are:", crime_data.columns)
else:
    # Normalised copy of every description
    crime_data['clean_description'] = crime_data['description'].apply(clean_text)

    # English stopwords, kept in a set for fast membership tests
    stop_words = set(stopwords.words('english'))

    def _without_stopwords(sentence):
        """Tokenize ``sentence`` and keep only the tokens that are not stopwords."""
        kept = []
        for token in word_tokenize(sentence):
            if token not in stop_words:
                kept.append(token)
        return kept

    crime_data['tokens'] = crime_data['clean_description'].apply(_without_stopwords)

    # Show raw and cleaned text side by side for the first rows
    print(crime_data[['description', 'clean_description']].head())

Column 'description' does not exist in crime_data. Available columns are: Index(['Year', 'Population', 'Murder', 'Rape', 'Robbery', 'Assault',
       'Burglary', 'CarTheft'],
      dtype='object')


In [4]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install tenacity




In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from huggingface_hub import login
from tenacity import retry, stop_after_attempt, wait_fixed

# Correct model identifier (use a known working model)
model_identifier = 'gpt2'

# Hugging Face access token.
# SECURITY: never commit a token in notebook source. It is read from the
# HF_TOKEN environment variable instead; any token that was previously pasted
# into this notebook should be revoked and replaced.
import os

access_token = os.environ.get('HF_TOKEN')

# Log in to Hugging Face only when a token was supplied; `access_token` stays
# None otherwise and is passed on unchanged to the loader below.
if access_token:
    login(token=access_token, add_to_git_credential=True)

@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def load_model_and_tokenizer(model_identifier, access_token):
    """Load the tokenizer and causal-LM model for ``model_identifier``.

    The ``@retry`` decorator calls this function again (up to five attempts,
    with a fixed wait of 2 between them) whenever it raises.

    Args:
        model_identifier: Model name or path handed to ``from_pretrained``.
        access_token: Hugging Face token forwarded to ``from_pretrained``;
            may be ``None``.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a
        padding token: when none is defined, the end-of-sequence token is reused.

    Raises:
        Exception: Any error from loading is printed and re-raised so that the
            retry decorator can see it.
    """
    try:
        # `token=` is the current keyword; `use_auth_token=` is deprecated.
        tokenizer = AutoTokenizer.from_pretrained(model_identifier, token=access_token)
        model = AutoModelForCausalLM.from_pretrained(model_identifier, token=access_token)

        # Add padding token if not present (needed for `padding=True` batches)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        return tokenizer, model
    except Exception as e:
        print(f"Error loading model and tokenizer: {e}")
        raise

# Load the tokenizer and model with retry logic
tokenizer, model = load_model_and_tokenizer(model_identifier, access_token)

# Example crime_data DataFrame (replaces any frame of the same name loaded earlier)
crime_data = pd.DataFrame({
    'clean_description': ["description1", "description2"]
})

# Tokenize the cleaned crime descriptions as one padded batch of PyTorch tensors
inputs = tokenizer(crime_data['clean_description'].tolist(), return_tensors='pt', padding=True, truncation=True)

# Run the model without gradient tracking (inference only)
with torch.no_grad():
    outputs = model(**inputs)
    # NOTE(review): these are the causal LM's output logits, used here as a
    # stand-in for text embeddings.
    embeddings = outputs.logits

# Store the embeddings.
# `embeddings` has one vector per token per description (3-D), which cannot be
# assigned to a single DataFrame column. Average over the token axis and store
# each description's vector as one cell.
crime_data['embeddings'] = list(embeddings.mean(dim=1).cpu().numpy())

  from .autonotebook import tqdm as notebook_tqdm


Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\sairaj\.cache\huggingface\token
Login successful




In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import numpy as np
from huggingface_hub import login
from tenacity import retry, stop_after_attempt, wait_fixed

# Correct model identifier (use a known working model)
model_identifier = 'gpt2'

# Hugging Face access token.
# SECURITY: never commit a token in notebook source. It is read from the
# HF_TOKEN environment variable instead; any token that was previously pasted
# into this notebook should be revoked and replaced.
import os

access_token = os.environ.get('HF_TOKEN')

# Log in to Hugging Face only when a token was supplied; `access_token` stays
# None otherwise and is passed on unchanged to the loader below.
if access_token:
    login(token=access_token, add_to_git_credential=True)

@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def load_model_and_tokenizer(model_identifier, access_token):
    """Load the tokenizer and causal-LM model for ``model_identifier``.

    The ``@retry`` decorator calls this function again (up to five attempts,
    with a fixed wait of 2 between them) whenever it raises.

    Args:
        model_identifier: Model name or path handed to ``from_pretrained``.
        access_token: Hugging Face token forwarded to ``from_pretrained``;
            may be ``None``.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a
        padding token: when none is defined, the end-of-sequence token is reused.

    Raises:
        Exception: Any error from loading is printed and re-raised so that the
            retry decorator can see it.
    """
    try:
        # `token=` is the current keyword; `use_auth_token=` is deprecated.
        tokenizer = AutoTokenizer.from_pretrained(model_identifier, token=access_token)
        model = AutoModelForCausalLM.from_pretrained(model_identifier, token=access_token)

        # Add padding token if not present (needed for `padding=True` batches)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        return tokenizer, model
    except Exception as e:
        print(f"Error loading model and tokenizer: {e}")
        raise

# Obtain tokenizer and model through the retrying loader
tokenizer, model = load_model_and_tokenizer(model_identifier, access_token)

# Two-row example frame: cleaned text, coordinates and the label to predict
crime_data = pd.DataFrame(
    {
        'clean_description': ["description1", "description2"],
        'latitude': [34.0522, 36.1699],
        'longitude': [-118.2437, -115.1398],
        'crime_type': ['theft', 'assault'],
    }
)

# Encode all descriptions as one padded/truncated batch of PyTorch tensors
inputs = tokenizer(
    crime_data['clean_description'].to_list(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass only; gradients are not needed
with torch.no_grad():
    outputs = model(**inputs)

# Average the model's logits over the token axis: one 2-D row per description
embeddings = torch.mean(outputs.logits, dim=1).cpu().numpy()

# Feature matrix = pooled text vector followed by latitude and longitude
features = np.concatenate(
    [embeddings, crime_data[['latitude', 'longitude']].to_numpy()],
    axis=1,
)

# Label column: here the crime type (other targets, such as future
# occurrences, would be possible as well)
target = crime_data['crime_type']

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\sairaj\.cache\huggingface\token
Login successful




In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets (seeded, 30% held out).
# NOTE(review): when `features` holds only the two example rows, this leaves a
# single training row and a single test row, so the accuracy below says
# nothing about model quality.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train a RandomForest classifier; seed it like the split so that re-running
# the cell reproduces the same forest and the same predictions.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = clf.predict(X_test)

# Calculate accuracy (fraction of held-out rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy}")


Prediction Accuracy: 0.0


In [10]:
# Two unseen incidents (free-text description plus coordinates) to classify
new_data = {
    'description': [
        "Suspicious activity near a shopping mall",
        "Attempted theft in residential area",
    ],
    'latitude': [41.881832, 41.878113],
    'longitude': [-87.623177, -87.629799],
}

# Same preparation as for the training rows: clean the text, then tokenize it
# as one padded/truncated batch
new_data_df = pd.DataFrame(new_data)
new_data_df['clean_description'] = new_data_df['description'].apply(clean_text)
new_inputs = tokenizer(
    new_data_df['clean_description'].to_list(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass only; gradients are not needed
with torch.no_grad():
    new_outputs = model(**new_inputs)

# Average the logits over the token axis: one 2-D row per description.
# NOTE(review): the average runs over every position of the padded batch.
new_embeddings = torch.mean(new_outputs.logits, dim=1).cpu().numpy()

# Pooled text vector followed by latitude and longitude, as the classifier expects
new_features = np.concatenate(
    [new_embeddings, new_data_df[['latitude', 'longitude']].to_numpy()],
    axis=1,
)

# Classify the new incidents with the fitted forest
new_predictions = clf.predict(new_features)
print("Predicted Crime Types:", new_predictions)

Predicted Crime Types: ['theft' 'theft']
