In [1]:
!pip install -q fpdf transformers pandas matplotlib seaborn gradio openpyxl
!apt-get install -y fonts-dejavu

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra
The following NEW packages will be installed:
  fonts-dejavu fonts-dejavu-core fonts-dejavu-extra
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 3,085 kB of archives.
After this operation, 10.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-dejavu all 2.37-2build1 [3,192 B]
Fetched 3,085 kB in 1s (3,700 kB/s)
Selecting previously unselected package fonts-dejavu-core.
(Reading database ... 126284 files 

In [2]:
!pip install -q fpdf
!apt-get install -y fonts-dejavu

from fpdf import FPDF

pdf = FPDF()

# Register fonts with exact custom names
pdf.add_font('DejaVuSans', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', uni=True)
pdf.add_font('DejaVuSans', 'B', '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', uni=True)

pdf.add_page()

# Use Bold
pdf.set_font('DejaVuSans', 'B', 16)
pdf.cell(0, 10, '✅ This is bold DejaVuSans text!', ln=True)

# Use Regular
pdf.set_font('DejaVuSans', '', 14)
pdf.cell(0, 10, 'This is regular DejaVuSans text.', ln=True)

pdf.output('working_dejavu.pdf')

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-dejavu is already the newest version (2.37-2build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


''

In [7]:
import gradio as gr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from fpdf import FPDF
import os
import tempfile
import matplotlib
from transformers import pipeline
import sys

matplotlib.use('Agg')

# ✅ Sentiment pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# ✅ Clean text for PDF compatibility
def clean_text(text):
    return str(text).encode('latin-1', 'ignore').decode('latin-1')

# ✅ Custom PDF class
class PDF(FPDF):
    def header(self):
        self.set_font("Arial", 'B', 14)
        self.cell(200, 10, self.title, ln=True, align='C')
        self.ln(10)

    def chapter_title(self, title):
        self.set_font("Arial", 'B', 12)
        self.cell(0, 10, clean_text(title), ln=True)
        self.ln(4)

    def chapter_body(self, text):
        self.set_font("Arial", '', 11)
        self.multi_cell(0, 10, clean_text(text))
        self.ln()

    def add_plot(self, image_path):
        self.image(image_path, x=10, w=190)
        self.ln(10)

# ✅ Report with basic analysis
def generate_analysis_report(df):
    pdf = PDF()
    pdf.title = "Analysis Report"
    pdf.add_page()

    pdf.chapter_title("1. Dataset Overview")
    pdf.chapter_body(f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}")

    pdf.chapter_title("2. Columns")
    pdf.chapter_body("\n".join(df.columns))

    pdf.chapter_title("3. Missing Values")
    missing = df.isnull().sum()
    missing_str = missing[missing > 0].to_string()
    pdf.chapter_body(missing_str if not missing.empty else "No missing values.")

    pdf.chapter_title("4. Data Types")
    pdf.chapter_body(df.dtypes.to_string())

    pdf.chapter_title("5. Correlation Matrix (Numerical Only)")
    num_df = df.select_dtypes(include=['number'])
    if not num_df.empty:
        corr = num_df.corr().round(2)
        pdf.chapter_body(corr.to_string())
    else:
        pdf.chapter_body("No numerical columns available for correlation analysis.")

    tmp_path = os.path.join(tempfile.gettempdir(), "analysis_report.pdf")
    pdf.output(tmp_path)
    return tmp_path

# ✅ Dashboard with column strengths/weaknesses and charts
def generate_dashboard_report(df):
    pdf = PDF()
    pdf.title = "Dashboard Report"
    pdf.add_page()

    pdf.chapter_title("1. Data Cleaning Summary")
    cleaning_notes = []
    suggestions = []

    for col in df.columns:
        if df[col].isnull().sum() > 0:
            cleaning_notes.append(f"'{col}' has {df[col].isnull().sum()} missing values.")
        if df[col].nunique() == 1:
            suggestions.append(f"'{col}' is constant.")
        if df[col].dtype == 'object' and df[col].astype(str).str.strip().isin(['', 'NA', 'null']).any():
            cleaning_notes.append(f"'{col}' may contain 'NA', 'null', or empty strings.")

    pdf.chapter_body("\n".join(cleaning_notes + suggestions) if (cleaning_notes or suggestions) else "No major issues found.")

    pdf.chapter_title("2. Column Strengths & Weaknesses")
    strengths = []
    weaknesses = []

    for col in df.columns:
        unique_vals = df[col].nunique()
        if unique_vals > df.shape[0] * 0.9:
            weaknesses.append(f"'{col}' has too many unique values — likely an ID or noise.")
        elif unique_vals == 1:
            weaknesses.append(f"'{col}' has only one unique value — no variability.")
        elif unique_vals < 5:
            strengths.append(f"'{col}' has few categories — good for grouping.")
        elif df[col].dtype in ['int64', 'float64']:
            strengths.append(f"'{col}' is numeric — good for trends/aggregation.")

    explanation = []
    if strengths:
        explanation.append("✔️ Strong Columns:")
        explanation.extend([f"{s} Why: {s.split('—')[-1].strip()}" for s in strengths])
    if weaknesses:
        explanation.append("❌ Weak Columns:")
        explanation.extend([f"{w} Why: {w.split('—')[-1].strip()}" for w in weaknesses])

    pdf.chapter_body("\n".join(explanation) if explanation else "No standout strengths or weaknesses.")

    pdf.chapter_title("3. Visual Charts (Histogram)")
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        for col in numeric_cols:
            try:
                plt.figure(figsize=(6, 4))
                sns.histplot(df[col].dropna(), kde=True, bins=20)
                plt.title(f"Distribution of {col}")
                img_path = os.path.join(tempfile.gettempdir(), f"hist_{col}.png")
                plt.savefig(img_path)
                plt.close()
                pdf.add_plot(img_path)
            except Exception:
                continue
    else:
        pdf.chapter_body("No numeric columns available for histogram plotting.")

    tmp_path = os.path.join(tempfile.gettempdir(), "dashboard_report.pdf")
    pdf.output(tmp_path)
    return tmp_path

# ✅ Process uploaded file
def process(file):
    try:
        df = pd.read_csv(file.name) if file.name.endswith(".csv") else pd.read_excel(file.name)

        df.columns = df.columns.str.strip()

        # Try to convert object to numeric where possible
        for col in df.select_dtypes(include='object').columns:
            df[col] = pd.to_numeric(df[col], errors='ignore')

        # Sentiment analysis on sample texts
        text_cols = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() < 1000]
        for col in text_cols:
            try:
                sample_texts = df[col].dropna().astype(str).sample(min(5, len(df)))
                sentiments = sentiment_analyzer(sample_texts.tolist())
                sentiment_series = pd.Series(data=[s['label'] for s in sentiments], index=sample_texts.index)
                df[f"{col}_sentiment"] = np.nan
                df.loc[sentiment_series.index, f"{col}_sentiment"] = sentiment_series
            except Exception:
                continue

        analysis_pdf = generate_analysis_report(df)
        dashboard_pdf = generate_dashboard_report(df)

        return "✅ Success", analysis_pdf, dashboard_pdf
    except Exception as e:
        return f"❌ Error: {str(e)}", None, None

# ✅ Gradio UI
with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload CSV or Excel (max 50k rows)")
        submit_btn = gr.Button("Submit")

    with gr.Row():
        status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        analysis_report = gr.File(label="Analysis Report (PDF)", interactive=False)
        dashboard_report = gr.File(label="Dashboard Report (PDF)", interactive=False)

    submit_btn.click(fn=process, inputs=[file_input], outputs=[status, analysis_report, dashboard_report])

# ✅ Colab-compatible launch
if "google.colab" in sys.modules:
    demo.launch(share=True, inline=False)
else:
    demo.launch()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1781f565700063ad2d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
