<a href="https://colab.research.google.com/github/Yasaman-habibi/Pre_Processing_Report/blob/main/Tree_Map.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries

import os
import re
import nltk
import glob
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive, files
from google.colab import drive
import ipywidgets as widgets
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import plotly.express as px


In [None]:
# Select input files from Google Drive

# Mount Drive so the combined report texts are reachable from the Colab runtime.
drive.mount('/content/drive')
source_path = "/content/drive/MyDrive/Combined_Texts"

# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# picker always lists the files in the same, predictable order.
all_txt_files = sorted(glob.glob(os.path.join(source_path, "*.txt")))

# Multi-select list box; the chosen paths are available through `selector.value`.
selector = widgets.SelectMultiple(
    options=all_txt_files,
    description='Select files',
    rows=10
)
display(selector)

In [None]:
# Snapshot the paths currently highlighted in the picker and echo them back,
# one path per line, so the selection can be checked before processing.
uploaded_Texts = [*selector.value]
print(" انتخاب شد:\n" + "\n".join(uploaded_Texts))

In [None]:
# Keyword share (percent of all keyword hits) shown as treemaps

import os
import re
import pandas as pd
import plotly.express as px

def _compile_term_patterns(sustain_terms):
    """Normalize the keywords and compile one whole-term regex for each.

    Returns a list of ``(term, compiled_pattern)`` pairs ordered from the
    most specific term to the least specific one.
    """
    # Non-string entries (e.g. NaN left over from a spreadsheet) have no
    # ``strip``; skip them instead of crashing.
    unique_terms = {kw.strip().lower() for kw in sustain_terms if isinstance(kw, str)}
    # A blank keyword would compile to a pattern that matches at every word
    # boundary and show up as a bogus "" keyword in the results.
    unique_terms.discard("")

    # Longest phrases first so that a phrase is counted (and blanked out)
    # before any shorter term it contains. Character length and the term
    # itself break ties, which keeps the order deterministic across runs.
    ordered_terms = sorted(
        unique_terms,
        key=lambda term: (len(term.split()), len(term), term),
        reverse=True,
    )

    # Lookarounds behave like ``\b`` for terms that start and end with a word
    # character, and additionally work for terms bounded by punctuation.
    return [
        (term, re.compile(r"(?<!\w)" + re.escape(term) + r"(?!\w)"))
        for term in ordered_terms
    ]


def _count_terms(text, term_patterns):
    """Count whole-term occurrences of every pattern in *text*.

    Each match is overwritten with spaces of the same length before the next
    term is tried, so text consumed by an earlier term is not counted again.
    """
    counts = {}
    for term, pattern in term_patterns:
        text, hits = pattern.subn(" " * len(term), text)
        if hits:
            counts[term] = hits
    return counts


def _percent_by_keyword(frame):
    """Sum ``count`` per keyword and express it as a percent of the total.

    Keywords whose share rounds to 0.00 are dropped.
    """
    totals = frame.groupby("keyword")["count"].sum().reset_index()
    totals["percent"] = (totals["count"] / totals["count"].sum() * 100).round(2)
    return totals[totals["percent"] > 0]


def _save_treemap(table, title, color_scale, html_path):
    """Draw a keyword treemap from *table*, save it as HTML and display it."""
    if table.empty:
        return
    fig = px.treemap(
        table,
        path=["keyword"],
        values="percent",
        title=title,
        color="percent",
        color_continuous_scale=color_scale,
    )
    fig.update_traces(hovertemplate='<b>%{label}</b><br>Percent: %{value}%<extra></extra>')
    fig.write_html(html_path)
    fig.show()


def process_treemap_tagged(uploaded_Texts, sustain_terms, Treemap_path):
    """Count keywords in tagged report files and render percent treemaps.

    Every input file may hold several reports, each wrapped between
    ``===== Start of File: NAME =====`` and ``===== End of File: NAME =====``
    lines. Keywords are matched case-insensitively as whole terms. One
    treemap is written for all reports together
    (``treemap_all_reports.html``) and one per year (``treemap_<year>.html``),
    where the year is the first four-digit run in the report name or
    ``"Unknown"`` when there is none.

    Args:
        uploaded_Texts: Paths of the UTF-8 text files to scan.
        sustain_terms: Keywords to look for; blank and non-string entries
            are ignored.
        Treemap_path: Directory for the HTML output; created if missing.

    Returns:
        A DataFrame with one row per (report, keyword) hit and the columns
        ``report_name``, ``keyword``, ``count`` and ``Year``. When nothing
        matches, an empty frame (without ``Year``) is returned and no
        output is written.
    """
    term_patterns = _compile_term_patterns(sustain_terms)
    # The back-reference ties every End tag to the Start tag with the same name.
    report_pattern = re.compile(
        r"===== Start of File: (.+?) =====\n(.*?)\n===== End of File: \1 =====",
        flags=re.DOTALL,
    )

    records = []
    for file_path in uploaded_Texts:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        for report_name, report_text in report_pattern.findall(content):
            counts = _count_terms(report_text.lower(), term_patterns)
            records.extend(
                {"report_name": report_name, "keyword": kw, "count": cnt}
                for kw, cnt in counts.items()
            )

    # Explicit columns keep the schema stable even when nothing was found.
    df = pd.DataFrame(records, columns=["report_name", "keyword", "count"])
    if df.empty:
        print(" هیچ کلیدواژه‌ای پیدا نشد.")
        return df

    os.makedirs(Treemap_path, exist_ok=True)

    # Treemap over all reports together.
    _save_treemap(
        _percent_by_keyword(df),
        "Treemap of All Reports (Percent)",
        "tealrose",
        os.path.join(Treemap_path, "treemap_all_reports.html"),
    )

    # One treemap per year; expand=False makes str.extract return a Series.
    df["Year"] = (
        df["report_name"].str.extract(r"(\d{4})", expand=False).fillna("Unknown")
    )
    for year, sub_df in df.groupby("Year"):
        _save_treemap(
            _percent_by_keyword(sub_df),
            f"Treemap for Year {year} (Percent)",
            "Mint",
            os.path.join(Treemap_path, f"treemap_{year}.html"),
        )

    print(" Treemap کلی و سال به سال با درصد ساخته شد.")
    return df


In [None]:
# Load the keyword sheet from Drive and keep the non-missing entries of its
# "keyword" column, with surrounding whitespace removed.
sustain_Dic = pd.read_excel(
    "/content/drive/MyDrive/sustainability_table/PivotReport/keywords.xlsx"
)
keywords = list(sustain_Dic["keyword"].dropna().str.strip())

# Folder that receives the generated treemap HTML files; create it up front.
Treemap_path = "/content/drive/MyDrive/sustainability_table/treemaps"
os.makedirs(Treemap_path, exist_ok=True)

In [None]:
# Run the keyword count over the selected files and build the treemaps;
# the per-report keyword counts come back as a DataFrame.
df_result = process_treemap_tagged(
    uploaded_Texts=uploaded_Texts,
    sustain_terms=keywords,
    Treemap_path=Treemap_path,
)