In [None]:
# Import nltk, os, json, re, shutil, pandas, numpy, scipy, matplotlib, seaborn
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import os
import json
import re
import shutil
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Define the folder containing the JSON files
data_folder = "10K_item1a_PERMNO_2015_sic_tic"
sampled_folder = "sampled_files"
os.makedirs(sampled_folder, exist_ok=True)

# Inclusive SIC code ranges and the industry label assigned to each range.
# Codes outside every range (for example 0, 1800-1999 or 6800-6999) are
# labelled "Unknown".
_SIC_DIVISIONS = (
    (1, 999, "Agriculture, Forestry and Fishing"),
    (1000, 1499, "Mining"),
    (1500, 1799, "Construction"),
    (2000, 3999, "Manufacturing"),
    (4000, 4999, "Transportation and other Utilities"),
    (5000, 5199, "Wholesale Trade"),
    (5200, 5999, "Retail Trade"),
    (6000, 6799, "Finance, Insurance and Real Estate"),
    (7000, 8999, "Services"),
    (9000, 9999, "Public Administration"),
)


def _parse_sic(raw_sic):
    """Return the SIC code as an int, or None when it is missing or not numeric."""
    try:
        return int(raw_sic)
    except (TypeError, ValueError):
        return None


def _industry_for_sic(sic_code):
    """Return the industry label for an integer SIC code.

    Returns "Unknown" when sic_code is None or lies outside every range in
    _SIC_DIVISIONS.
    """
    if sic_code is None:
        return "Unknown"
    for low, high, label in _SIC_DIVISIONS:
        if low <= sic_code <= high:
            return label
    return "Unknown"


# Group files by year and then by industry:
# (year, industry) -> list of (file_name, sic_code)
industry_year_files = {}

for file_name in os.listdir(data_folder):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_folder, file_name)
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A missing or malformed SIC does not abort the run; the filing is grouped
    # under "Unknown" (sic_code is then None).
    sic_code = _parse_sic(data.get('SIC'))
    year = data['filing_date'][:4]  # First four characters of filing_date
    industry = _industry_for_sic(sic_code)

    industry_year_files.setdefault((year, industry), []).append((file_name, sic_code))

# Walk the groups in (year, industry) order and copy the first 10 files of
# each group, ordered by file name, into the sampled folder.
for group_key in sorted(industry_year_files):
    year, industry = group_key
    industry_slug = industry.replace(' ', '_')

    ordered_files = list(industry_year_files[group_key])
    ordered_files.sort(key=lambda entry: entry[0])

    for file_name, sic_code in ordered_files[:10]:
        # Prefix the copy with year and industry so names stay unique
        new_file_name = f"{year}_{industry_slug}_{file_name}"
        shutil.copy(
            os.path.join(data_folder, file_name),
            os.path.join(sampled_folder, new_file_name),
        )

print(f"Sampled files have been renamed and saved in the folder: {sampled_folder}")

In [None]:
# Input folder (sampled filings) and output folder (extraction results);
# the output folder is created if it does not exist yet.
sampled_folder = "sampled_files"
output_folder = "extracted_sentences"
os.makedirs(output_folder, exist_ok=True)

# Context terms shared by the "Attack" and "Threat" categories
_CYBER_CONTEXT_TERMS = (
    "cyber-", "cyber", "networks", "systems",
    "products", "services", "datacenter", "infrastructure",
)

# Keyword table read by classify_sentence: each category name maps to a list
# of "relevant" terms and a list of "irrelevant" terms.
keywords = {
    "Attack": dict(
        relevant=list(_CYBER_CONTEXT_TERMS),
        irrelevant=["terror", "war", "contraband", "bombs"],
    ),
    "Threat": dict(
        relevant=list(_CYBER_CONTEXT_TERMS),
        irrelevant=["terror", "simulator", "disease", "legal action", "competitors"],
    ),
    "Computer, information, system": dict(
        relevant=["malware", "virus", "viruses", "intrusions"],
        irrelevant=["fires", "product sales", "warranty claim"],
    ),
    "Malicious": dict(
        relevant=["software", "programs", "third parties", "attacks"],
        irrelevant=[],
    ),
    "Breaches": dict(
        relevant=[],
        irrelevant=["fiduciary duty", "fiduciary duties", "covenant", "credit", "agreement"],
    ),
    "Hacker, hacking, social engineering, denial of service, cyberattack, cybersecurity": dict(
        relevant=["hacker", "hacking", "social engineering", "denial of service", "cyberattack", "cybersecurity"],
        irrelevant=["fiduciary", "warranty", "regulations", "contract"],
    ),
}

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to generate regex patterns dynamically for keyword variations
def keyword_pattern(word):
    r"""
    Build a regex pattern that matches a keyword and its simple inflections.

    The keyword is first passed through the WordNet lemmatizer, then wrapped
    in word boundaries with an optional "s", "es", "ing" or "ed" suffix.
    The "es" alternative is needed for lemmas whose plural is not formed with
    a bare "s" (for example "virus" -> "viruses", "breach" -> "breaches").

    Args:
        word: Keyword (single word or phrase) in lower case.

    Returns:
        The pattern as a string, suitable for re.search.

    Example: "attack" -> r"\battack(s|es|ing|ed)?\b"
    """
    lemma = lemmatizer.lemmatize(word)  # Normalize the word
    return rf"\b{re.escape(lemma)}(s|es|ing|ed)?\b"

# Function to classify a sentence as relevant or irrelevant dynamically
def classify_sentence(sentence, keywords):
    """
    Decide whether a sentence counts as relevant under the keyword table.

    Each key of `keywords` is a category name holding one or more trigger
    terms separated by commas. A category is considered only when at least
    one of its trigger terms occurs in the lower-cased sentence (substring
    match). For a triggered category:

    1. if any "relevant" term matches, the sentence is relevant;
    2. otherwise, if any "irrelevant" term matches, the category is skipped;
    3. otherwise, if the category defines no "relevant" terms at all, the
       trigger term alone makes the sentence relevant.

    Args:
        sentence: Sentence text.
        keywords: Mapping of category name to a dict with "relevant" and
            "irrelevant" term lists.

    Returns:
        True if some category marks the sentence as relevant, else False.
    """
    sentence = sentence.lower()  # Convert to lowercase for case-insensitive matching

    for category, terms in keywords.items():
        # Split multi-term category names; matching the whole comma-separated
        # name as one literal substring would practically never succeed.
        triggers = [part.strip() for part in category.lower().split(",") if part.strip()]
        if not any(trigger in sentence for trigger in triggers):
            continue

        relevant_terms = terms.get("relevant", [])
        irrelevant_terms = terms.get("irrelevant", [])

        # A relevant term takes precedence over any irrelevant term
        if any(re.search(keyword_pattern(term), sentence) for term in relevant_terms):
            return True

        # An irrelevant term disqualifies this category only
        if any(re.search(keyword_pattern(term), sentence) for term in irrelevant_terms):
            continue

        # No relevant terms are required for this category, so the trigger
        # term by itself is sufficient
        if not relevant_terms:
            return True

    return False  # No category marked the sentence as relevant

# Process each JSON file in the sampled folder
for file_name in os.listdir(sampled_folder):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(sampled_folder, file_name)
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract the "item_1A" section and tokenize into sentences.
    # `or ""` also covers a key that is present but null, which the
    # tokenizer is not expected to accept.
    item_1A_text = data.get("item_1A") or ""
    sentences = sent_tokenize(item_1A_text)

    # Keep only the sentences classified as relevant
    relevant_sentences = [s for s in sentences if classify_sentence(s, keywords)]

    # Cybersecurity risk measure: number of relevant sentences.
    # (A ratio of relevant to total sentences is currently not stored.)
    risk_measure_count = len(relevant_sentences)

    # Save relevant sentences and risk measures to a new JSON file
    output_data = {
        "cik": data.get("cik"),
        "company": data.get("company"),
        "filing_date": data.get("filing_date"),
        "sic": data.get("SIC"),  # Include the SIC code in the output
        "relevant_sentences": relevant_sentences,
        "cybersecurity_risk_count": risk_measure_count
    }
    output_file_path = os.path.join(output_folder, f"relevant_{file_name}")
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, indent=4)

print(f"Relevant sentences and cybersecurity risk measures saved in the folder: {output_folder}")

In [None]:
# Folder holding the per-filing extraction results
output_folder = "extracted_sentences"

# Build one record per result file, then turn the records into a DataFrame
data = []
for file_name in os.listdir(output_folder):
    if not file_name.endswith(".json"):
        continue

    with open(os.path.join(output_folder, file_name), 'r') as f:
        file_data = json.load(f)

    raw_sic = file_data.get("sic")
    record = {
        "cik": file_data.get("cik"),
        "company": file_data.get("company"),
        "filing_date": file_data.get("filing_date"),
        # A missing or empty SIC is stored as None rather than converted
        "sic": int(raw_sic) if raw_sic else None,
        "cybersecurity_risk_count": file_data.get("cybersecurity_risk_count", 0),
    }
    data.append(record)

# Convert to DataFrame
df = pd.DataFrame(data)

# Derive the filing year from the filing date
df['year'] = pd.to_datetime(df['filing_date']).dt.year

# Inclusive (low, high) SIC code range for each industry
sic_ranges = {
    label: (low, high)
    for label, low, high in (
        ("Agriculture, Forestry and Fishing", 1, 999),
        ("Mining", 1000, 1499),
        ("Construction", 1500, 1799),
        ("Manufacturing", 2000, 3999),
        ("Transportation and other Utilities", 4000, 4999),
        ("Wholesale Trade", 5000, 5199),
        ("Retail Trade", 5200, 5999),
        ("Finance, Insurance and Real Estate", 6000, 6799),
        ("Services", 7000, 8999),
        ("Public Administration", 9000, 9999),
    )
}

# Function to map SIC codes to industries
def map_sic_to_industry(sic):
    """
    Return the industry whose range in sic_ranges contains the SIC code.

    Missing codes (None/NaN) and codes outside every range yield "Unknown".
    """
    if pd.isnull(sic):
        return "Unknown"
    matching = (name for name, (low, high) in sic_ranges.items() if low <= sic <= high)
    return next(matching, "Unknown")

# Attach an industry label to every filing based on its SIC code
df['industry'] = df['sic'].apply(map_sic_to_industry)

# Debugging: Check unique industries
print("Unique industries:", df['industry'].unique())

# Part 1: Compute Descriptive Statistics by Industry
# Column label and quantile level for each reported percentile
_QUANTILE_COLUMNS = (
    ("1%", 0.01),
    ("5%", 0.05),
    ("25%", 0.25),
    ("50%", 0.50),
    ("75%", 0.75),
    ("95%", 0.95),
    ("99%", 0.99),
)


def _summarize_counts(industry, counts):
    """Return one row of descriptive statistics for an industry's risk counts."""
    row = {
        "industry": industry,
        "N": len(counts),
        "mean": counts.mean(),
        "std_dev": counts.std(),
        "skewness": skew(counts),
        "kurtosis": kurtosis(counts),
        "min": counts.min(),
        "max": counts.max(),
    }
    for label, level in _QUANTILE_COLUMNS:
        row[label] = counts.quantile(level)
    return row


descriptive_stats = [
    _summarize_counts(industry, group['cybersecurity_risk_count'])
    for industry, group in df.groupby('industry')
]

# One row per industry
descriptive_stats_df = pd.DataFrame(descriptive_stats)

# Persist the table for use outside the notebook
descriptive_stats_df.to_csv("descriptive_stats_by_industry.csv", index=False)

# Part 2: Mean and standard deviation of the risk count per (industry, year)
risk_by_industry_year = df.groupby(['industry', 'year'])['cybersecurity_risk_count']
mean_std_by_year = (
    risk_by_industry_year
    .agg(['mean', 'std'])
    .reset_index()
)

# Persist the table for use outside the notebook
mean_std_by_year.to_csv("mean_std_by_industry_year.csv", index=False)

# Debugging: show the first rows of the grouped table
print("Grouped DataFrame:")
print(mean_std_by_year.head())

# Part 3: Plot the yearly mean risk count, one line per industry
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(
    data=mean_std_by_year,
    x='year',
    y='mean',
    hue='industry',
    marker='o',
    ax=ax,
)
ax.set_title("Mean Cybersecurity Risk Count by Industry (2015-2023)")
ax.set_xlabel("Year")
ax.set_ylabel("Mean Cybersecurity Risk Count")
# Place the legend outside the axes, anchored at the upper-left corner
ax.legend(title="Industry", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig("cybersecurity_risk_trends.png")
plt.show()

[Google Drive folder containing the trend plot and the CSV files (descriptive statistics by industry; mean and standard deviation by industry and year)](https://drive.google.com/drive/folders/1vqm09ZhhGAJCAwt-GFz4i_SKuJ9bwYGU?usp=sharing)