AI and Non-AI Sentences from AI Seed Words

In [3]:
# !pip install pandas tqdm nltk unidecode

import os
import re
import random
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from unidecode import unidecode

# Fetch the Punkt sentence-tokenizer data that sent_tokenize relies on.
# Newer NLTK releases (>= 3.8.2) load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so request both to keep the notebook working across
# NLTK versions. Already-present packages are reported as up-to-date.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

# === STEP 1: Load keywords from ai_seedwords.csv ===
# The CSV is read in wide format: each column header is treated as a category
# and the cells below it as that category's keywords.
csv_path = 'ai_seedwords.csv'
raw_df = pd.read_csv(csv_path)

# Convert wide format to long format: category | keyword.
# dropna() removes the padding cells of columns shorter than the longest one.
keywords_df = raw_df.melt(var_name='category', value_name='keyword').dropna()
keywords_df['keyword'] = keywords_df['keyword'].astype(str).str.strip()
keywords_df['category'] = keywords_df['category'].astype(str).str.strip()

# Whitespace-only cells survive dropna() and become '' after strip(). An empty
# keyword is a substring of every sentence, so it would label everything as AI.
keywords_df = keywords_df[keywords_df['keyword'] != ''].reset_index(drop=True)

# Lower-cased keyword set used for case-insensitive substring matching.
ai_keywords = {k.lower() for k in keywords_df['keyword']}

# === STEP 2: Extract all sentences from txt files ===
# Folder holding the plain-text reports, plus the two accumulators filled by
# the processing loop: every sentence seen, and those matching a seed keyword.
text_folder = "txt_reports"
all_sentences, ai_sentences = [], []

def is_clean_sentence(sent):
    """Return True when ``sent`` looks like ordinary prose.

    A sentence is rejected when it has fewer than five whitespace-separated
    tokens, or when fewer than half of its characters (spaces and punctuation
    included in the total) are alphabetic.
    """
    if len(sent.split()) < 5:
        return False
    # Share of alphabetic characters; low values indicate digit/symbol-heavy
    # fragments such as table rows.
    letter_count = len([ch for ch in sent if ch.isalpha()])
    return letter_count / max(1, len(sent)) >= 0.5

# Read every .txt report, split it into sentences, and collect (a) all
# sentences and (b) the sentences containing at least one seed keyword.
# sorted() fixes the processing order, because os.listdir() returns entries in
# arbitrary order.
for filename in tqdm(sorted(os.listdir(text_folder)), desc="Processing reports"):
    if not filename.endswith(".txt"):
        continue
    filepath = os.path.join(text_folder, filename)
    try:
        # Decode strictly: with errors='ignore' undecodable bytes are silently
        # dropped and UnicodeDecodeError is never raised, which would make the
        # fallback below unreachable.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            text = f.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this read cannot fail.
        with open(filepath, 'r', encoding='latin-1') as f:
            text = f.read()

    # Pass the text through unidecode, then collapse all whitespace runs
    # (including newlines) to single spaces before sentence splitting.
    text = unidecode(text)
    clean_text = re.sub(r'\s+', ' ', text.strip())
    sentences = sent_tokenize(clean_text)
    all_sentences.extend(sentences)

    # Extract AI sentences
    for sentence in sentences:
        sent_lower = sentence.lower()
        # NOTE(review): this is a plain substring match, so a short keyword
        # also matches inside longer words. Confirm that is acceptable for the
        # seed list in use.
        if any(keyword in sent_lower for keyword in ai_keywords):
            ai_sentences.append(sentence.strip())

# === STEP 3: Deduplicate AI sentences ===
# sorted() yields a stable order. Iterating a set of strings directly can give
# a different order on every interpreter run (string hash randomization),
# which would make any later sampling non-reproducible.
ai_sentences = sorted(set(ai_sentences))
num_ai = len(ai_sentences)
print(f"✅ Found {num_ai} unique AI sentences")

# === STEP 4: Clean and filter non-AI sentences ===
# Keep each sentence that (1) is not a known AI sentence, (2) contains no seed
# keyword, and (3) passes is_clean_sentence. Membership is checked against
# sets: testing against the ai_sentences list is a linear scan per sentence.
# Repeated sentences are kept once, mirroring the AI-side deduplication.
ai_sentence_lookup = set(ai_sentences)
seen_non_ai = set()
non_ai_sentences = []
for sent in all_sentences:
    sent_clean = sent.strip()
    if sent_clean in ai_sentence_lookup or sent_clean in seen_non_ai:
        continue
    sent_lower = sent_clean.lower()
    if any(k in sent_lower for k in ai_keywords):
        continue
    if is_clean_sentence(sent_clean):
        seen_non_ai.add(sent_clean)
        non_ai_sentences.append(sent_clean)

# === STEP 5: Sample same number of non-AI as AI ===
# A dedicated, seeded generator makes the draw repeatable without touching the
# global `random` state. If fewer non-AI than AI sentences are available, all
# of the non-AI sentences are taken.
sampler = random.Random(42)
non_ai_sample = sampler.sample(non_ai_sentences, min(num_ai, len(non_ai_sentences)))
ai_sample     = sampler.sample(ai_sentences, num_ai)

# === STEP 6: Create and save dataset ===
# Label convention: 1 = AI sentence, 0 = non-AI sentence.
df_ai = pd.DataFrame({"sentence": ai_sample, "label": 1})
df_non_ai = pd.DataFrame({"sentence": non_ai_sample, "label": 0})

# Stack both classes, then shuffle the rows with a fixed seed and renumber.
df_combined = (
    pd.concat([df_ai, df_non_ai], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

output_csv = "bert_training_data.csv"
df_combined.to_csv(output_csv, index=False)
print(f"✅ Saved dataset with {num_ai} AI and {len(non_ai_sample)} clean non-AI sentences to bert_training_data.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Processing reports: 100%|██████████| 855/855 [03:55<00:00,  3.62it/s]


✅ Found 1316 unique AI sentences
✅ Saved dataset with 1316 AI and 1316 clean non-AI sentences to bert_training_data.csv


In [1]:
import os
import re
from collections import defaultdict

# Folder containing the report text files
folder_path = "txt_reports"

# bank_years: bank label -> set of report years; country_set: distinct
# leading filename tokens (taken to be the country).
bank_years = defaultdict(set)
country_set = set()

# First occurrence of a four-digit 20xx year in the filename
year_pattern = re.compile(r'(20\d{2})')

for filename in os.listdir(folder_path):
    if not filename.endswith(".txt"):
        continue

    # Files without a recognizable year are skipped entirely
    year_match = year_pattern.search(filename)
    if not year_match:
        continue
    year = int(year_match.group(1))

    # Bank label = everything before the year, underscores turned into spaces.
    # This prefix still includes the leading country token.
    name_prefix = filename[:year_match.start()].rstrip("_")
    bank_name = name_prefix.replace("_", " ").strip()

    # Country = text before the first underscore
    country_name = filename.partition("_")[0].strip()
    country_set.add(country_name)

    # Record the year under this bank
    bank_years[bank_name].add(year)

# Summary statistics
# total_reports counts distinct (bank, year) pairs, since years are stored in
# a set per bank.
num_banks = len(bank_years)
total_reports = sum(len(years) for years in bank_years.values())
all_years = sorted({y for years in bank_years.values() for y in years})
num_countries = len(country_set)

print(f"🌍 Total Countries: {num_countries}")
print(f"🏦 Total Islamic Banks: {num_banks}")
print(f"📄 Total Annual Reports: {total_reports}")
# min()/max() raise ValueError on an empty sequence, so guard the case where no
# filename contained a year (e.g. empty or wrong folder).
if all_years:
    print(f"📅 Year Range: {min(all_years)} – {max(all_years)}")
else:
    print("📅 Year Range: n/a (no report filenames containing a 20xx year were found)")

# Sample preview: first five banks in insertion order
for bank, years in list(bank_years.items())[:5]:
    print(f" - {bank}: {sorted(years)}")


🌍 Total Countries: 25
🏦 Total Islamic Banks: 106
📄 Total Annual Reports: 855
📅 Year Range: 2015 – 2024
 - Afghanistan Afghanistan International Bank: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
 - Bahrain Al Salam Bank: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
 - Bahrain Albaraka: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
 - Bahrain Bahrain Islamic Bank: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
 - Bahrain Bank ABC Islamic: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
