In [2]:
import json
import re
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import zipfile

In [15]:
import json
import re
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import zipfile

def _count_words(text):
    """Return the number of whitespace-delimited tokens in ``text`` (0 if it is not a str)."""
    return len(re.findall(r'\S+', text)) if isinstance(text, str) else 0


def _collect_field_stats(data):
    """Traverse a decoded JSON value and tally records, words and per-field word counts.

    Only string leaves are counted; other scalars (numbers, booleans, None)
    are ignored. Every dict that is a direct element of a list counts as one
    record, at any nesting depth. Field paths are the dict keys joined with
    '.'; passing through a list does not add a path segment.

    Args:
        data: Whatever ``json.load`` returned for one dataset file.

    Returns:
        A ``(total_records, total_words, field_lengths)`` tuple, where
        ``field_lengths`` maps each field path to the list of word counts of
        every string found under that path (in traversal order).
    """
    total_words = 0
    total_records = 0
    field_lengths = defaultdict(list)

    def walk(value, field_path=''):
        nonlocal total_words, total_records
        if isinstance(value, str):
            words = _count_words(value)
            total_words += words
            field_lengths[field_path].append(words)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    total_records += 1
                # List items inherit the path of the list itself.
                walk(item, field_path)
        elif isinstance(value, dict):
            for key, val in value.items():
                new_field = f"{field_path}.{key}" if field_path else key
                walk(val, new_field)

    walk(data)
    return total_records, total_words, field_lengths


def _save_field_plot(field_names, average_lengths, province, output_path):
    """Draw the average-words-per-field bar chart for one province and save it as a PNG."""
    # Keep the original 12x6 canvas for small datasets; give the bars more
    # vertical room once there are more than 15 fields.
    fig_height = max(6, 0.4 * len(field_names))
    fig, ax = plt.subplots(figsize=(12, fig_height))
    try:
        ax.barh(field_names, average_lengths)
        ax.set_xlabel('Average Word Count')
        ax.set_title(f'Average Word Count per Field - {province}')
        ax.grid(axis='x')
        # Run the layout pass last, after every artist has been added.
        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if saving fails, so repeated runs
        # in the same kernel do not accumulate open figures.
        plt.close(fig)


def calculate_stats(input_dir, output_dir, zip_filename):
    """Print word statistics for every JSON dataset and archive one plot per dataset.

    For each ``*.json`` file in ``input_dir`` (processed in sorted filename
    order so the output is reproducible), this prints the record count, the
    total word count and the average word count per field, then saves a
    horizontal bar chart named ``<province>_field_word_counts.png`` in
    ``output_dir``. The province name is the filename without its extension.
    Finally, every PNG found under ``output_dir`` (including any left there
    by earlier runs) is written into a ZIP archive in that same directory.

    Args:
        input_dir: Directory containing the ``.json`` dataset files.
        output_dir: Directory for the plots and the archive; created if missing.
        zip_filename: File name of the ZIP archive created inside ``output_dir``.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        Errors from listing, reading or parsing the input files, or from
        saving a plot, propagate unchanged.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("\n" + "="*60)
    print("📊 ANALYZING PROVINCE DATASETS".center(60))
    print("="*60)

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        province = os.path.splitext(filename)[0]
        safe_province = province.replace(" ", "_").replace("/", "_")

        rec_count, word_count, field_lengths = _collect_field_stats(data)

        print("\n" + "-"*60)
        print(f"📍 Province: {province}")
        print(f"🧾 Total records: {rec_count}")
        print(f"📝 Total words:   {word_count}")
        print("📐 Average words per field:")
        print("-"*60)

        field_names = []
        average_lengths = []

        # Pad field names to a common width so the averages line up.
        max_field_len = max((len(field) for field in field_lengths), default=0)
        for field, lengths in field_lengths.items():
            # Every list has at least one entry: a key only exists once a
            # string was recorded under it.
            avg = sum(lengths) / len(lengths)
            field_names.append(field)
            average_lengths.append(avg)
            print(f"  • {field.ljust(max_field_len)} : {avg:.2f}")

        output_path = os.path.join(output_dir, f'{safe_province}_field_word_counts.png')
        _save_field_plot(field_names, average_lengths, province, output_path)

        print(f"\n✅ Plot saved: {output_path}")

    zip_filepath = os.path.join(output_dir, zip_filename)
    with zipfile.ZipFile(zip_filepath, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(output_dir):
            # Sort in place so both traversal and archive order are stable.
            dirs.sort()
            for file in sorted(files):
                # The name check keeps the archive out of itself even if it
                # was given a .png name.
                if file.endswith('.png') and file != zip_filename:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)

    print("\n" + "="*60)
    print("📦 ZIP ARCHIVE CREATED".center(60))
    print("="*60)
    print(f"🗂️  All plots zipped into: {zip_filepath}")
    print("="*60 + "\n")

In [16]:
# Method 1 datasets: print per-province statistics and write the plots and
# their ZIP archive into plots_method1/.
calculate_stats(
    input_dir='datasets_method1',
    output_dir='plots_method1',
    zip_filename='plots_method1.zip',
)


               📊 ANALYZING PROVINCE DATASETS                

------------------------------------------------------------
📍 Province: Hormozgan
🧾 Total records: 76
📝 Total words:   218
📐 Average words per field:
------------------------------------------------------------
  • province                         : 1.00
  • title                            : 3.00
  • location.province                : 1.00
  • location.city                    : 2.00
  • geographical_features.name       : 1.56
  • geographical_features.items.name : 2.00
  • vegetation                       : 1.14
  • topography.name                  : 3.00
  • topography.description           : 9.00
  • tourist_attractions.name         : 2.00
  • tourist_attractions.year_built   : 0.00
  • tourist_attractions.constructor  : 0.00
  • tourist_attractions.architect    : 0.00
  • tourist_attractions.description  : 13.50
  • additional_info.books_source     : 0.00

✅ Plot saved: plots_method1/Hormozgan_field_word_counts.png

--

In [17]:
# Method 2 datasets: print per-province statistics and write the plots and
# their ZIP archive into plots_method2/.
calculate_stats(
    input_dir='datasets_method2',
    output_dir='plots_method2',
    zip_filename='plots_method2.zip',
)


               📊 ANALYZING PROVINCE DATASETS                

------------------------------------------------------------
📍 Province: Hormozgan
🧾 Total records: 37
📝 Total words:   236
📐 Average words per field:
------------------------------------------------------------
  • title                                  : 3.00
  • location.province                      : 1.00
  • location.city                          : 1.00
  • geographical_features.name             : 1.25
  • geographical_features.description.name : 1.47
  • geographical_features.description      : 6.00
  • topography.name                        : 1.25
  • topography.description                 : 3.27
  • natural_resources.name                 : 2.00
  • natural_resources.description          : 2.70
  • tourist_attractions.name               : 3.57
  • tourist_attractions.description        : 9.14
  • tourist_attractions.year_built         : 1.00
  • tourist_attractions.constructor        : 2.00
  • tourist_attractions.a