#### Part1: Extracting data
###### It will extract details- city, station, date_column from inside file or filename
###### It will then extract data and save it as df.
###### Summary table is created and saved
###### Hourly average plots are created for the pollutants selected by the user.

In [None]:
import xlsxwriter
import pandas as pd
import os
import re
import tkinter as tk
from tkinter import simpledialog
from tkinter import filedialog, messagebox
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageEnhance
import textwrap

# Parameter-Unit Dictionary
# Maps a parameter (column) name to the unit string shown in the summary
# table and on plot axes. Lookups elsewhere fall back to "Unknown" for
# parameters that are not listed here.
units_dict = {
    # Particulate matter and gaseous pollutants
    "PM2.5": "µg/m³",
    "PM10": "µg/m³",
    "NO": "µg/m³",
    "NO2": "µg/m³",
    "NOx": "ppb",
    "NH3": "µg/m³",
    "SO2": "µg/m³",
    "CO": "mg/m³",
    "Ozone": "µg/m³",
    # Aromatic hydrocarbons
    "Benzene": "µg/m³",
    "Toluene": "µg/m³",
    "Xylene": "µg/m³",
    "O Xylene": "µg/m³",
    "Eth-Benzene": "µg/m³",
    "MP-Xylene": "µg/m³",
    # Meteorological parameters
    "AT": "°C",
    "Temp": "°C",
    "RH": "%",
    "WS": "m/s",
    "WD": "deg",
    "RF": "mm",
    "TOT-RF": "mm",
    "SR": "W/m²",
    "BP": "mmHg",
    "VWS": "m/s",
}

# Create the Tk root only so dialogs can be shown; keep the empty window hidden.
root = tk.Tk()
root.withdraw()

# Let the user pick the raw data file (CSV or Excel).
supported_file_types = [("CSV files", "*.csv"), ("Excel files", "*.xlsx")]
file_path = filedialog.askopenfilename(
    title="Select Raw Data File",
    filetypes=supported_file_types,
)

# A cancelled dialog yields an empty selection: tell the user and stop.
if not file_path:
    messagebox.showinfo("No File", "No file selected. Exiting.")
    exit()

# Split the chosen path into its directory, file name and lower-cased extension.
file_dir, filename = os.path.split(file_path)
file_ext = os.path.splitext(filename)[1].lower()


def _station_city_from_filename(name):
    """Extract (station, city) from an underscore-separated file name.

    The station is the token two positions after the literal token "site";
    the city is the token immediately before the literal token "CPCB".
    The file extension is removed first so it cannot leak into the last token.

    Returns:
        tuple: (station, city); either element is None when it cannot be found.
    """
    tokens = os.path.splitext(name)[0].split("_")
    station = city = None

    if "site" in tokens:
        station_pos = tokens.index("site") + 2
        if station_pos < len(tokens):
            station = tokens[station_pos]

    if "CPCB" in tokens:
        cpcb_pos = tokens.index("CPCB")
        # Position 0 has no preceding token; index -1 would wrap to the last one.
        if cpcb_pos > 0:
            city = tokens[cpcb_pos - 1]

    return station, city


if file_ext == ".xlsx":                        # Loading and processing file
    df = pd.read_excel(file_path, header=None)

    # Look for "Station" / "City" labels in the first column of the leading
    # rows (at most 15, fewer when the sheet is shorter).
    station_name, city_name = None, None
    if df.shape[1] > 1:
        for i in range(min(15, len(df))):
            label, value = df.iloc[i, 0], df.iloc[i, 1]
            # Skip blank or non-text cells instead of failing on them.
            if not (isinstance(label, str) and isinstance(value, str)):
                continue
            if "Station" in label:
                station_name = value.split(",")[0]
            if "City" in label:
                city_name = value

    # The data table starts at the row whose first cell is "From Date".
    header_hits = df[df.iloc[:, 0] == "From Date"].index if df.shape[1] else []
    if len(header_hits) == 0:
        messagebox.showerror("Error", "Could not find the 'From Date' header row in the Excel file.")
        exit()
    header_row = header_hits[0]
    df.columns = df.iloc[header_row]
    df = df[header_row + 1:].reset_index(drop=True)

    # Fall back to the file name only for the details the sheet did not provide.
    if not station_name or not city_name:
        file_station, file_city = _station_city_from_filename(filename)
        station_name = station_name or file_station
        city_name = city_name or file_city

    date_column = "From Date"
    # Put the date column first and leave out "To Date".
    df = df[["From Date"] + [col for col in df.columns if col not in ["From Date", "To Date"]]]
    # Remove any parenthesised suffix from the column names.
    df.columns = [re.sub(r"\s*\(.*?\)", "", str(col)) for col in df.columns]
    parameters = [col for col in df.columns if col != "From Date"]

elif file_ext == ".csv":
    df = pd.read_csv(file_path)

    # CSV files carry no metadata rows here; station and city come from the file name.
    station_name, city_name = _station_city_from_filename(filename)

    date_column = "Timestamp"
    # Remove any parenthesised suffix from the column names.
    df.columns = [re.sub(r"\s*\(.*?\)", "", str(col)) for col in df.columns]
    if date_column not in df.columns:
        messagebox.showerror("Error", "The CSV file has no 'Timestamp' column.")
        exit()
    parameters = [col for col in df.columns if col != "Timestamp"]

else:
    messagebox.showerror("Error", "Unsupported file format!")
    exit()

# ------- Clean and Analyze Data ----------
# Treat textual placeholders as missing, then drop fully-empty columns and
# repeated column names (the first occurrence is kept).
df.replace(["None", "NA", "Na"], pd.NA, inplace=True)
df = df.dropna(axis=1, how='all')
df = df.loc[:, ~df.columns.duplicated()]

# The date column itself is dropped above when it holds no values at all.
if date_column not in df.columns:
    messagebox.showerror("Error", f"Date column '{date_column}' is missing or empty.")
    exit()

# Unparseable timestamps become NaT and their rows are discarded.
df[date_column] = pd.to_datetime(df[date_column], errors="coerce", dayfirst=True)
df = df.dropna(subset=[date_column])

# Everything except the date column is treated as a measurement.
numeric_columns = df.columns.difference([date_column])
if len(numeric_columns) == 0:
    # Checked before the columns are used, not after.
    messagebox.showerror("Error", "No numeric data to summarize.")
    exit()

# Non-numeric cells become NaN; rows with no measurement at all are removed.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=numeric_columns, how="all")

if df.empty:
    messagebox.showerror("Error", "No valid data rows remain after cleaning.")
    exit()

daily_mean = df.groupby(df[date_column].dt.date).mean(numeric_only=True)

metrics = ["Mean", "Median", "Std. Dev", "Min", "Max"]
summary_data = {"Metrics": metrics}
# Keep only parameters that survived cleaning; dict.fromkeys drops repeated
# names (order preserved) so the units row and the stats table share columns.
valid_parameters = list(dict.fromkeys(param for param in parameters if param in df.columns))

# Mean and Std. Dev are taken over the daily means; Median, Min and Max over
# the individual readings.
for param in valid_parameters:
    summary_data[param] = [
        daily_mean[param].mean(),
        df[param].median(),
        daily_mean[param].std(),
        df[param].min(),
        df[param].max()
    ]

summary_df = pd.DataFrame(summary_data)

# ----------Adding units row
unit_row = ["Units"] + [units_dict.get(param, "Unknown") for param in valid_parameters]
unit_df = pd.DataFrame([unit_row], columns=["Metrics"] + valid_parameters)

# Units row first, then the statistics; both frames already share the same
# column order, so the "Metrics" column reads "Units" followed by the metric names.
summary_df = pd.concat([unit_df, summary_df], ignore_index=True)

# ----Save------------------
# Write the summary table next to the input file. The workbook is written
# once; the previous version repeated the identical write a second time.
output_filename = f"{station_name}_{city_name}_SummaryTable.xlsx"
output_path = os.path.join(file_dir, output_filename)

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    # Data goes in from the second row; the header row is written by hand below.
    summary_df.to_excel(writer, index=False, header=False, startrow=1)
    worksheet = writer.sheets['Sheet1']
    for col_num, value in enumerate(summary_df.columns):
        worksheet.write(0, col_num, value)

messagebox.showinfo("Success", f"Summary saved as:\n{output_path}")

# === Hourly Average Plotting Section ===
# Draws one hour-of-day line chart per pollutant chosen by the user and saves
# each chart as a JPG next to the input file. Any failure is reported in a
# dialog instead of stopping the script.
try:
    # Derive calendar date and hour-of-day from the timestamp column.
    timestamps = df[date_column]
    df["Date"] = timestamps.dt.date
    df["Hour"] = timestamps.dt.hour

    # Hourly data exists when some day has readings in more than one hour.
    distinct_hours_per_day = df.groupby("Date")["Hour"].nunique()
    hourly_data_available = distinct_hours_per_day.max() > 1

    if hourly_data_available:
        # Mean of every numeric column for each hour of the day.
        hourly_avg = df.groupby("Hour").mean(numeric_only=True)

        # Parameters that are still present in the cleaned data.
        pollutants_available = [param for param in parameters if param in df.columns]

        if pollutants_available:
            prompt_text = (
                f"Available pollutants: {', '.join(pollutants_available)}\n"
                "Enter pollutant names separated by commas:"
            )
            selected_pollutants = tk.simpledialog.askstring(
                "Select Pollutants for Hourly Average Plot",
                prompt_text
            )

            if selected_pollutants:
                # Keep only the entered names that match an available pollutant.
                entered_names = [name.strip() for name in selected_pollutants.split(",")]
                selected_pollutants = [name for name in entered_names if name in pollutants_available]

                for pollutant in selected_pollutants:
                    unit_label = units_dict.get(pollutant, 'Unknown')

                    plt.figure(figsize=(10, 5))
                    plt.plot(hourly_avg.index, hourly_avg[pollutant], marker="o", linestyle="-", color="b")
                    plt.xticks(range(24), fontsize=12)
                    plt.yticks(fontsize=12)
                    plt.title(f"Hourly Average Variation of {pollutant}", fontsize=16)
                    plt.xlabel("Hour of the day", fontsize=14)
                    plt.ylabel(f"{pollutant} ({unit_label})", fontsize=14)
                    plt.grid(True, linestyle="--", alpha=0.7)

                    # Save plot---------
                    plot_filename = f"{station_name}_{city_name}_{pollutant}_Hourly_Average_Variations.jpg"
                    plot_path = os.path.join(file_dir, plot_filename)
                    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
                    plt.close()

                    print(f"Plot saved: {plot_path}")
        else:
            messagebox.showwarning("No Pollutants", "No pollutants available for hourly plotting.")
    else:
        messagebox.showinfo("No Hourly Data", "Hourly data is not available. Dataset likely has daily values.")
except Exception as e:
    messagebox.showerror("Plotting Error", f"An error occurred while generating plots:\n{e}")



#### Part2: Creating Box/Violin Plots
###### It will ask user to draw on hourly average dataset or actual (whole) dataset

In [None]:
import seaborn as sns


# Make sure the timestamp column is datetime and derive the date / hour helpers.
df[date_column] = pd.to_datetime(df[date_column])
df["Date"] = df[date_column].dt.date
df["Hour"] = df[date_column].dt.hour

# Hourly data exists when some day has readings in more than one hour.
hourly_data_available = df.groupby("Date")["Hour"].nunique().max() > 1

# Tkinter GUI for user inputs
root = tk.Tk()
root.withdraw()

pollutants_available = [param for param in parameters if param in df.columns]


def _save_current_plot(plot_title, file_name):
    """Save the current matplotlib figure as file_name in the working directory, close it and report it."""
    plt.savefig(os.path.join(os.getcwd(), file_name), dpi=300, bbox_inches="tight")
    plt.close()
    print(f"{plot_title} saved: {file_name}")


if not pollutants_available:
    print("No pollutants available for plotting.")
else:

    # Box/Violin Distribution Plots
    plot_scope = simpledialog.askstring(
        "Box/Violin Plots",
        "Dataset selection- Enter 'hourly' for plots on hourly distribution data, or 'whole' for plots on entire dataset:"
    )
    # Surrounding whitespace and letter case are ignored; a cancelled dialog counts as invalid.
    plot_scope = (plot_scope or "").strip().lower()
    if plot_scope not in ("hourly", "whole"):
        print("Invalid input! Defaulting to 'whole'.")
        plot_scope = "whole"
    # Per-hour distributions need more than one reading hour per day.
    if plot_scope == "hourly" and not hourly_data_available:
        print("Hourly data is not available. Defaulting to 'whole'.")
        plot_scope = "whole"

    plot_type = simpledialog.askstring(
        "Plot Type Selection",
        "Enter 'box' for Box Plot or 'violin' for Violin Plot:"
    )
    plot_type = (plot_type or "").strip().lower()
    if plot_type not in ("box", "violin"):
        print("Invalid input! Defaulting to 'box'.")
        plot_type = "box"

    selected_pollutants = simpledialog.askstring(
        "Select Pollutants",
        f"Available pollutants: {', '.join(pollutants_available)}\nEnter 'all' or pollutant names separated by commas:"
    )

    if selected_pollutants:
        if selected_pollutants.strip().lower() == "all":
            selected_pollutants = pollutants_available
        else:
            # Keep valid names only; dict.fromkeys drops names typed twice (order preserved).
            selected_pollutants = list(dict.fromkeys(
                p.strip() for p in selected_pollutants.split(",") if p.strip() in pollutants_available
            ))

        # Pick the seaborn function, its extra keyword arguments and the title text once.
        if plot_type == "box":
            draw_plot, extra_kwargs, kind_label = sns.boxplot, {}, "Box Plot"
        else:
            draw_plot, extra_kwargs, kind_label = sns.violinplot, {"inner": "quartile"}, "Violin Plot"

        if not selected_pollutants:
            print("No valid pollutants selected. Exiting.")
        elif plot_scope == "whole" and len(selected_pollutants) > 1:
            # Several pollutants: one combined figure, one box/violin per pollutant.
            title = kind_label
            plt.figure(figsize=(12, 6))
            df_long = df[selected_pollutants].melt(var_name="Pollutant", value_name="Concentration")
            draw_plot(x="Pollutant", y="Concentration", data=df_long, palette="Blues", **extra_kwargs)
            plt.xticks(rotation=45, fontsize=12)
            plt.yticks(fontsize=12)
            plt.xlabel("Pollutant", fontsize=14)
            plt.title(f"{title}", fontsize=16)
            plt.grid(True, linestyle="--", alpha=0.7)
            output_filename = f"{station_name}_{city_name}_Pollutants_Whole_{plot_type.capitalize()}_Plot.jpg"
            _save_current_plot(title, output_filename)
        elif plot_scope == "whole":
            # Single pollutant: one figure with its unit on the y axis.
            title = kind_label
            for pollutant in selected_pollutants:
                plt.figure(figsize=(10, 5))
                draw_plot(y=df[pollutant], palette="Blues", **extra_kwargs)
                plt.title(f"{title} of {pollutant}", fontsize=16)
                plt.ylabel(f"{pollutant} ({units_dict.get(pollutant, 'Unknown')})", fontsize=14)
                plt.grid(True, linestyle="--", alpha=0.7)
                plt.xticks([])
                plt.yticks(fontsize=12)
                output_filename = f"{station_name}_{city_name}_{pollutant}_Whole_{plot_type.capitalize()}_Plot.jpg"
                _save_current_plot(title, output_filename)
        else:
            # Hourly scope: one figure per pollutant, one box/violin per hour of the day.
            title = f"Hourly {kind_label}"
            for pollutant in selected_pollutants:
                plt.figure(figsize=(12, 6))
                draw_plot(x=df["Hour"], y=df[pollutant], palette="Blues", **extra_kwargs)
                plt.xticks(fontsize=12)
                plt.yticks(fontsize=12)
                plt.xlabel("Hour of the Day", fontsize=14)
                plt.ylabel(f"{pollutant} ({units_dict.get(pollutant, 'Unknown')})", fontsize=14)
                plt.title(f"{title} of {pollutant}", fontsize=16)
                plt.grid(True, linestyle="--", alpha=0.7)
                output_filename = f"{station_name}_{city_name}_{pollutant}_Hourly_{plot_type.capitalize()}_Plot.jpg"
                _save_current_plot(title, output_filename)


#### Part3 : Creating Line plots

In [None]:

# Hide root tkinter window
root = tk.Tk()
root.withdraw()

# Ensure date_column is datetime and set as index. Guarded so that running
# this cell again, after the column has already become the index, does not
# raise a KeyError.
if date_column in df.columns:
    df[date_column] = pd.to_datetime(df[date_column])
    df.set_index(date_column, inplace=True)

# Recomputed here so the cell does not depend on another cell having defined
# it, and so only real parameters (not the "Date"/"Hour" helper columns) can
# be selected.
pollutants_available = [param for param in parameters if param in df.columns]


def _finish_time_series_plot(plot_title, y_label, file_name):
    """Style the current figure as a time series, save it and close it.

    The figure is written as file_name into the current working directory.

    Returns:
        str: full path of the saved image.
    """
    plt.xlabel("Time", fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    plt.title(plot_title, fontsize=16)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.xticks(rotation=45)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%d-%m-%Y"))
    plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())

    save_path = os.path.join(os.getcwd(), file_name)
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
    print(f"Saved: {file_name}")
    return save_path


# Ask user if they want a line plot
plot_line = simpledialog.askstring(
    "Line Plot Selection",
    "Do you want a line plot? (yes/no)"
)

if plot_line and plot_line.strip().lower() == "yes":
    if not pollutants_available:
        print("No pollutants available for plotting.")
        selected_pollutants = None
    else:
        # Ask user to select pollutants
        selected_pollutants = simpledialog.askstring(
            "Select Pollutants",
            f"Available pollutants: {', '.join(pollutants_available)}\n\nEnter 'all' to plot all pollutants, or specify names separated by commas:"
        )

    if selected_pollutants:
        selected_pollutants = selected_pollutants.strip().lower()

        if selected_pollutants == "all":
            # One figure with a line per pollutant; no shared y label because units differ.
            plt.figure(figsize=(12, 6))
            for pollutant in pollutants_available:
                plt.plot(df.index, df[pollutant], marker='o', linestyle='-', label=pollutant)
            plt.legend(title="Pollutants", fontsize=10)

            output_filename = f"{station_name}_{city_name}_All_Pollutants_Line_Plot.jpg"
            output_path = _finish_time_series_plot("Time Series of All Pollutants", "", output_filename)

        else:
            # Empty entries (e.g. from a trailing comma) are ignored.
            pollutants_list = [p.strip() for p in selected_pollutants.split(",") if p.strip()]

            # Match user input (case-insensitive) with the available pollutants
            pollutant_lookup = {col.lower(): col for col in pollutants_available}

            for user_input in pollutants_list:
                pollutant = pollutant_lookup.get(user_input.lower())
                if pollutant is None:
                    print(f"Pollutant '{user_input}' not found in data columns.")
                    continue

                plt.figure(figsize=(12, 6))
                plt.plot(df.index, df[pollutant], marker='o', linestyle='-', color='b')

                output_filename = f"{station_name}_{city_name}_{pollutant}_Line_Plot.jpg"
                output_path = _finish_time_series_plot(
                    f"Time Series of {pollutant}",
                    f"{pollutant} ({units_dict.get(pollutant, 'Unknown')})",
                    output_filename,
                )



#### Part4: Creating Infographics

In [None]:
import os
import sys
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
from tkinter import Tk, filedialog
import textwrap

# The text prompts below use simpledialog, which this cell's own imports do
# not provide; import it here so the cell also runs on its own.
from tkinter import simpledialog

# Hide main tkinter window
root = Tk()
root.withdraw()

# Step 1: Ask User if Infographic is Required
# The answer is compared after trimming and lower-casing, so "Yes" or " yes"
# are accepted. (A second, case-sensitive check used to reject them.)
choice = simpledialog.askstring("Infographic Generator", "Do you want to create an infographic? (yes/no):")
if not choice or choice.strip().lower() != "yes":
    print("❌ Process exited. No infographic created.")
    sys.exit()

# Step 2: Ask for User Inputs
# A cancelled prompt returns None; fall back to an empty string so the
# drawing and wrapping calls below always receive text.
title_text = simpledialog.askstring("Title", "Enter the title:") or ""
sub_title_text = simpledialog.askstring("Subtitle", "Enter the subtitle:") or ""
description_1 = simpledialog.askstring("Description 1", "Enter the first paragraph:") or ""
description_2 = simpledialog.askstring("Description 2", "Enter the second paragraph (Optional):") or ""

# Step 3: Canvas Setup
canvas_width = 1920
canvas_height = 1080
canvas = Image.new("RGBA", (canvas_width, canvas_height), (255, 255, 255, 255))  # White base

# Patterns are space-separated, which is the form Tk documents for file
# dialogs; the earlier ';'-separated form is presumably Windows-only.
image_filetypes = [("Image Files", "*.png *.jpg *.jpeg")]

# Step 4: Select Background Image (Optional)
print("\n👉 Select a background image (Optional, press Cancel to skip):")
background_path = filedialog.askopenfilename(
    title="Select Background Image",
    filetypes=image_filetypes
)

if background_path:
    # Convert inside the context manager so the file handle is released promptly.
    with Image.open(background_path) as background_file:
        background = background_file.convert("RGBA")
    background = background.resize((canvas_width, canvas_height))
    background = background.filter(ImageFilter.GaussianBlur(radius=5))
    canvas.paste(background, (0, 0))

# Step 5: Select Plot Images (up to 3)
plot_paths = []
for i in range(3):
    print(f"\n👉 Select plot {i + 1} (Optional, press Cancel to skip):")
    plot_file = filedialog.askopenfilename(
        title=f"Select Plot {i+1}",
        filetypes=image_filetypes
    )
    if plot_file:
        plot_paths.append(plot_file)

# Plots are stacked vertically in the right half of the canvas.
plot_width = int(canvas_width * 0.45)
plot_height = int(canvas_height * 0.25)
plot_positions = [
    (canvas_width // 2 + 20, int(canvas_height * 0.2)),
    (canvas_width // 2 + 20, int(canvas_height * 0.45)),
    (canvas_width // 2 + 20, int(canvas_height * 0.7)),
]

# Paste plots on canvas
for plot_path, plot_position in zip(plot_paths, plot_positions):
    with Image.open(plot_path) as plot_file_image:
        plot = plot_file_image.convert("RGBA").resize((plot_width, plot_height))

    # Enhance contrast & edges
    enhancer = ImageEnhance.Contrast(plot)
    plot = enhancer.enhance(2)
    plot = plot.filter(ImageFilter.EDGE_ENHANCE_MORE)

    # Adjust white background transparency: pure-white pixels get alpha 100
    # so the canvas shows through them.
    datas = plot.getdata()
    new_data = [(255, 255, 255, 100) if item[:3] == (255, 255, 255) else item for item in datas]
    plot.putdata(new_data)

    # The plot is its own mask, so its alpha channel controls the blend.
    canvas.paste(plot, plot_position, plot)

# Step 6: Add Text
draw = ImageDraw.Draw(canvas)

# Load fonts; fall back to Pillow's built-in font when Arial cannot be opened.
try:
    font_title = ImageFont.truetype("arial.ttf", 70)
    font_sub_title = ImageFont.truetype("arial.ttf", 50)
    font_desc = ImageFont.truetype("arial.ttf", 35)
except OSError:
    font_title = font_sub_title = font_desc = ImageFont.load_default()

# Draw Title & Subtitle
draw.text((canvas_width // 4, 50), title_text, fill="red", font=font_title)
draw.text((canvas_width // 4, 140), sub_title_text, fill="darkblue", font=font_sub_title)

# Draw Paragraphs
line_spacing = 10
desc_position_1 = (50, int(canvas_height * 0.3))
wrapped_text_1 = textwrap.fill(description_1, width=50)
# Measure with the same line spacing that is used for drawing, so the second
# paragraph starts below the real bottom of the first.
text_height = draw.multiline_textbbox((0, 0), wrapped_text_1, font=font_desc, spacing=line_spacing)[3]
desc_position_2 = (50, desc_position_1[1] + text_height + 70)

draw.multiline_text(desc_position_1, wrapped_text_1, fill="black", font=font_desc, spacing=line_spacing)

if description_2:
    wrapped_text_2 = textwrap.fill(description_2, width=50)
    draw.multiline_text(desc_position_2, wrapped_text_2, fill="black", font=font_desc, spacing=line_spacing)

# Step 7: Save Infographic in Script/Executable Directory
# __file__ does not exist in an interactive/notebook session; use the current
# working directory there instead of failing with NameError.
if getattr(sys, 'frozen', False):
    output_dir = os.path.dirname(sys.executable)
elif "__file__" in globals():
    output_dir = os.path.dirname(os.path.abspath(__file__))
else:
    output_dir = os.getcwd()
output_path = os.path.join(output_dir, "final_infographic.png")
canvas.save(output_path)

print(f"\n✅ Infographic saved as: {output_path}")
