In [1]:
import pandas as pd
import requests
import json
import os
from time import sleep

# Make sure the output folder for the per-batch CSV files is present
# (a no-op when the folder already exists).
os.makedirs(name='splitcsvsaved', exist_ok=True)

# The single category the prompt permits.
_DEFAULT_CATEGORY = "Crime Against Women & Children"
# Label returned when no usable answer is obtained from the model.
_DEFAULT_SUBCATEGORY = "Sexual Harassment"
# Canonical spellings of the subcategories listed in the prompt.
_ALLOWED_SUBCATEGORIES = (
    "Computer Generated CSAM/CSEM",
    "Cyber Blackmailing & Threatening",
    "Sexual Harassment",
)


def _normalize_label(label):
    """Collapse whitespace and case so near-identical spellings compare equal."""
    return ' '.join(label.split()).casefold()


_SUBCATEGORY_LOOKUP = {
    _normalize_label(name): name for name in _ALLOWED_SUBCATEGORIES
}


def _parse_classification(content):
    """Extract a ``(category, subcategory)`` pair from the raw model reply.

    Each line of the reply is inspected for a ``category|subcategory`` pair;
    the first line whose subcategory matches one of the allowed labels
    (ignoring case and repeated whitespace) wins.

    Raises:
        ValueError: if the reply is not a string, contains no ``|``, or
            names no allowed subcategory.
    """
    if not isinstance(content, str) or '|' not in content:
        raise ValueError("Invalid response format")

    for line in content.splitlines():
        if '|' not in line:
            continue
        _, _, raw_subcategory = line.partition('|')
        canonical = _SUBCATEGORY_LOOKUP.get(_normalize_label(raw_subcategory))
        if canonical is not None:
            # Only one category is permitted, so the canonical one is
            # returned regardless of how the model spelled it.
            return _DEFAULT_CATEGORY, canonical

    raise ValueError(f"Unexpected subcategory in response: {content.strip()!r}")


def get_llm_classification(text, retry_count=3):
    """Classify a crime description via the local chat-completions endpoint.

    The description is embedded in a fixed prompt and POSTed to
    ``http://localhost:1234/v1/chat/completions``. The reply is accepted only
    if it names one of the subcategories listed in the prompt; otherwise the
    request is retried.

    Args:
        text: Description to classify; interpolated into the prompt as-is.
        retry_count: Maximum number of requests to attempt. With a value of
            0 or less no request is made and the fallback is returned.

    Returns:
        A ``(category, subcategory)`` tuple of strings. When every attempt
        fails, the fallback
        ``("Crime Against Women & Children", "Sexual Harassment")`` is
        returned instead of raising.
    """
    url = "http://localhost:1234/v1/chat/completions"

    prompt = f"""Classify this cyber crime description into exactly one category and one subcategory.
    Description: {text}
    
    Respond in this exact format without any additional text:
    Crime Against Women & Children|Sexual Harassment

    Only use these exact options:
    Category must be: Crime Against Women & Children
    Subcategory must be one of:
    - Computer Generated CSAM/CSEM
    - Cyber Blackmailing & Threatening  
    - Sexual Harassment
    Note: If the description mentions child exploitation content, always classify as Computer Generated CSAM/CSEM"""
    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "messages": [
            {"role": "system", "content": "You are a cyber crime classification assistant. Respond only with category|subcategory format."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 50
    }

    for attempt in range(retry_count):
        try:
            response = requests.post(url, headers=headers, json=data, timeout=(5, 10))  # 5s connect, 10s read timeout
            response.raise_for_status()
            content = response.json()['choices'][0]['message']['content']
            return _parse_classification(content)
        except requests.Timeout:
            print(f"Timeout on attempt {attempt + 1} of {retry_count}")
        except Exception as e:
            # Best-effort by design: HTTP errors, malformed JSON and
            # unusable answers are all logged and retried.
            print(f"Error on attempt {attempt + 1}: {e}")

        # Pause before the next attempt, but not after the final one.
        if attempt < retry_count - 1:
            sleep(2)

    return _DEFAULT_CATEGORY, _DEFAULT_SUBCATEGORY  # Default fallback

# Read the CSV file
df = pd.read_csv('modified_file.csv')

# Fail fast if the description column is absent; otherwise the per-row
# error handling below would hide the problem and label every row with the
# fallback value.
text_column = 'crimeaditionalinfo'
if text_column not in df.columns:
    raise KeyError(f"Column {text_column!r} not found in modified_file.csv")

# Create new columns for imputed values
df['category'] = "Crime Against Women & Children"
df['sub_category'] = None

# Label used when a row cannot be classified.
fallback_subcategory = "Sexual Harassment"

# Positional column indexes, so results can be written straight into df.
text_pos = df.columns.get_loc(text_column)
sub_category_pos = df.columns.get_loc('sub_category')

# Process rows in batches of 50
batch_size = 50
total_rows = len(df)

for start_idx in range(0, total_rows, batch_size):
    end_idx = min(start_idx + batch_size, total_rows)

    # Classify each row of the batch, writing the result directly into df
    for row_pos in range(start_idx, end_idx):
        raw_text = df.iat[row_pos, text_pos]

        if pd.isna(raw_text) or not str(raw_text).strip():
            # A missing or blank description would otherwise reach the model
            # as the literal text "nan"; use the fallback without a request.
            subcategory = fallback_subcategory
        else:
            try:
                _, subcategory = get_llm_classification(str(raw_text))
            except Exception as e:
                print(f"Error processing row {row_pos}: {e}")
                subcategory = fallback_subcategory

        df.iat[row_pos, sub_category_pos] = subcategory

    # Save batch to split CSV so partial progress survives an interruption
    batch_df = df.iloc[start_idx:end_idx]
    batch_filename = f'splitcsvsaved/batch_{start_idx+1}_to_{end_idx}.csv'
    batch_df.to_csv(batch_filename, index=False)
    print(f"Saved batch {start_idx+1} to {end_idx} in {batch_filename}")

# Save the complete processed file
df.to_csv('categorized_crimes_complete.csv', index=False)

Saved batch 1 to 50 in splitcsvsaved/batch_1_to_50.csv
Saved batch 51 to 100 in splitcsvsaved/batch_51_to_100.csv
Saved batch 101 to 150 in splitcsvsaved/batch_101_to_150.csv
Saved batch 151 to 200 in splitcsvsaved/batch_151_to_200.csv
Saved batch 201 to 250 in splitcsvsaved/batch_201_to_250.csv
Saved batch 251 to 300 in splitcsvsaved/batch_251_to_300.csv
Saved batch 301 to 350 in splitcsvsaved/batch_301_to_350.csv
Error on attempt 1: Invalid response format
Error on attempt 2: Invalid response format
Error on attempt 3: Invalid response format
Saved batch 351 to 400 in splitcsvsaved/batch_351_to_400.csv
Saved batch 401 to 450 in splitcsvsaved/batch_401_to_450.csv
Saved batch 451 to 500 in splitcsvsaved/batch_451_to_500.csv
Saved batch 501 to 550 in splitcsvsaved/batch_501_to_550.csv
Saved batch 551 to 600 in splitcsvsaved/batch_551_to_600.csv
Saved batch 601 to 650 in splitcsvsaved/batch_601_to_650.csv
Saved batch 651 to 700 in splitcsvsaved/batch_651_to_700.csv
Saved batch 701 to 750