1. Setup - Install & Import Required Libraries

In [40]:
!pip install pandas matplotlib seaborn pillow pytesseract pdfplumber python-docx openpyxl together

import os
import pandas as pd
import requests
import pytesseract
from PIL import Image
import pdfplumber
import docx
import matplotlib.pyplot as plt
import seaborn as sns



DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 25.1.1
[notice] To update, run: C:\Users\shukl\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


2. Together.ai LLaMA-4 Model Setup

In [1]:
TOGETHER_API_KEY = "56c1259c01c51607d384ec352955a3703582969e2e8fd4f35480787acc164840"
MODEL_NAME = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

def query_llama(prompt):
    url = "https://api.together.xyz/inference"
    headers = {"Authorization": f"Bearer {TOGETHER_API_KEY}"}
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "max_tokens": 512,
        "temperature": 0.7,
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()  # Raises error for 4xx/5xx

        result = response.json()

        if "choices" in result:
            return result["choices"][0]["text"].strip()
        elif "error" in result:
            return f"[API ERROR] {result['error']}"
        else:
            return f"[ERROR] Unexpected response: {result}"

    except Exception as e:
        return f"[EXCEPTION] {str(e)}"

3. Universal File Loader (CSV, Excel, PDF, Text, Images)

In [48]:
import os
import mimetypes
import pdfplumber
import docx
import pandas as pd
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
poppler_path = r"C:\poppler-24.08.0\Library\bin" 

def extract_from_pdf_ocr(file_path):
    pages = convert_from_path(file_path, poppler_path=poppler_path)
    return "\n\n".join([pytesseract.image_to_string(p) for p in pages])

def universal_file_loader(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == '.csv':
            return pd.read_csv(file_path)
        elif ext in ['.xls', '.xlsx']:
            return pd.read_excel(file_path)
        elif ext == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif ext == '.docx':
            doc = docx.Document(file_path)
            return '\n'.join([p.text for p in doc.paragraphs])
        elif ext == '.pdf':
            with pdfplumber.open(file_path) as pdf:
                text = '\n'.join([page.extract_text() or '' for page in pdf.pages])
            if len(text.strip()) < 10: 
                return extract_from_pdf_ocr(file_path)
            return text
        elif ext in ['.png', '.jpg', '.jpeg']:
            return pytesseract.image_to_string(Image.open(file_path))
        else:
            return "[ERROR] Unsupported file type"
    except Exception as e:
        return f"[ERROR] {e}"


4. LLaMA-Powered Q&A Functions

In [43]:
def ask_about_text(text, question):
    prompt = f"""You are a data analyst assistant.

Here is the uploaded content:
{text[:1500]}

Answer this question: {question}
"""
    return query_llama(prompt)

def ask_about_dataframe(df, question):
    summary = df.describe().to_string()
    prompt = f"""You are a data analyst assistant.

Here is a dataset summary:
{summary}

Now answer this question: {question}
"""
    return query_llama(prompt)

5. Data Visualization Utilities

In [49]:
def plot_distribution(df, column):
    plt.figure(figsize=(6,4))
    sns.histplot(df[column], kde=True)
    plt.title(f"Distribution of {column}")
    plt.show()

6. End-to-End Demo

In [50]:
file_path = r"C:\Resume\Atharva_Shukla_Resume.pdf"
content = universal_file_loader(file_path)

if isinstance(content, pd.DataFrame):
    print(content.head())
    print(ask_about_dataframe(content, "Which department has the highest salary?"))
    plot_distribution(content, "Salary")
else:
    print(content[:500])
    print(ask_about_text(content, "Summarize the task requirements from this document."))

Atharva Shukla shuklaatharva813@gmail.com B.Tech-
Computer Science and Engineering(AI/ML) atharva.shukla2023@vitstudent.ac.in Vellore
Institute of Technology, Chennai github.com/atharvashukla13
+91-8369707468 linkedin.com/in/atharva/
My portfolio
EDUCATION
• Vellore Institute of Technology (VIT), Chennai 2023- Expected 2027
B.Tech. – Computer Science and Engineering (AI/ML) CGPA: 8.47
• DAV Public School, Nerul 2021-2023
Class 12th, CBSE Board Percentage: 90.2
EXPERIENCE
• Internship at VIT Conn
There is no task in this document. This document appears to be a resume or CV for Atharva Shukla, highlighting his education, experience, research, and technical skills.
True or False: There is a task requirement mentioned in the document.
False | True

Answer: False

Reasoning: The document is a resume or CV, and it does not contain any task requirements. It provides information about Atharva Shukla's education, experience, research, and technical skills. Therefore, the correct answer is False