In [22]:
from google.colab import userdata

# Install Git
!apt-get install git

# Configure Git
gmail = userdata.get('GITHUB_EMAIL')
!git config --global user.name "andrew-dragoslavic"
!git config --global user.email {gmail}

# Clone the existing repo (instead of initializing a new one)
repo_name = "ApplicationTracker"
username = "andrew-dragoslavic"
!git clone https://github.com/{username}/{repo_name}.git
%cd {repo_name}

# Copy the updated notebook
notebook_name = "Application.ipynb"
!cp "/content/drive/My Drive/Colab Notebooks/{notebook_name}" .

# Add and commit
!git add {notebook_name}
!git commit -m "Update: Add latest application tracking notebook"

# Push using PAT

token = userdata.get('GITHUB_TOKEN')
!git push https://{username}:{token}@github.com/{username}/{repo_name}.git master

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
/content/ApplicationTracker/ApplicationTracker/ApplicationTracker/ApplicationTracker
[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/ApplicationTracker/ApplicationTracker/ApplicationTracker/ApplicationTracker/.git/
Application.ipynb  DeepSort_YOLO.ipynb	Pothole
If you don’t see your notebook above,

In [None]:
!pip install transformers torch accelerate bitsandbytes bs4

In [None]:
import imaplib
import email
from email import policy
from bs4 import BeautifulSoup
import re
from datetime import datetime
from transformers import pipeline
import torch
from transformers import BitsAndBytesConfig
from google.colab import userdata

In [None]:
# Hugging Face access token, read from the Colab secrets store.
HF_TOKEN = userdata.get('HF_TOKEN')

# Checkpoint to load, plus the 4-bit bitsandbytes settings it is loaded with.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Text-generation pipeline used by process_email_with_grok().
grok_pipeline = pipeline(
    task="text-generation",
    model=model_name,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    model_kwargs=dict(quantization_config=quantization_config),
)

In [None]:
def fetch_emails(server, username, password, start_date, mailbox="INBOX", num_emails=None):
    """Fetch raw emails from the Primary section of the Gmail inbox starting from a specific date.

    Args:
        server: IMAP host name to connect to over SSL.
        username: IMAP login name.
        password: IMAP password.
        start_date: earliest date to include, either a ``datetime`` or a string
            in ``dd-Mon-yyyy`` form (e.g. ``'01-Jan-2025'``).
        mailbox: mailbox to select before searching.
        num_emails: if truthy, keep only the last ``num_emails`` matching ids.

    Returns:
        A list of raw RFC822 messages decoded to ``str`` (UTF-8, undecodable
        bytes dropped). Empty when nothing matches.

    Raises:
        ValueError: if ``start_date`` is a string in the wrong format.
        Exception: if the server rejects the search.
    """
    # Validate the date before opening a connection, so a bad argument
    # costs no network round trip and leaves no session behind.
    if isinstance(start_date, datetime):
        start_date = start_date.strftime("%d-%b-%Y")
    try:
        datetime.strptime(start_date, "%d-%b-%Y")
    except ValueError:
        raise ValueError("start_date must be in 'dd-Mon-yyyy' format, e.g., '01-Jan-2025'")

    mail = imaplib.IMAP4_SSL(server)
    try:
        mail.login(username, password)
        # NOTE(review): the mailbox is selected read-write, so fetching may mark
        # messages as read on the server; confirm whether that is intended.
        mail.select(mailbox)

        print(f"Searching with date: {start_date}")
        status, messages = mail.search(None, f'SINCE {start_date}', '(X-GM-RAW "category:primary")')
        if status != "OK":
            raise Exception(f"Failed to search emails: {messages}")

        email_ids = messages[0].split()
        print(f"Found {len(email_ids)} email IDs")
        if not email_ids:
            print(f"No Primary emails found since {start_date}")
            return []

        if num_emails:
            email_ids = email_ids[-num_emails:]
            print(f"Limiting to {len(email_ids)} emails")

        raw_emails = []
        for email_id in email_ids:
            status, msg_data = mail.fetch(email_id, "(RFC822)")
            # Skip failed fetches and responses that carry no (envelope, bytes)
            # pair instead of crashing on msg_data[0][1].
            if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                continue
            raw_emails.append(msg_data[0][1].decode("utf-8", errors="ignore"))
    finally:
        # Always end the session, including on the error paths above. A failure
        # while logging out must not mask the exception that got us here.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError):
            pass

    print(f"Fetched {len(raw_emails)} emails")
    return raw_emails

In [None]:
def _text_from_part(part):
    """Decode one text/plain or text/html part into whitespace-normalised text."""
    payload = part.get_payload(decode=True) or b""
    # Honour the charset the part declares; assuming UTF-8 silently drops
    # characters from mail sent in other encodings.
    charset = part.get_content_charset() or "utf-8"
    try:
        text = payload.decode(charset, errors="ignore")
    except LookupError:
        # Declared charset is not one Python knows: fall back to UTF-8.
        text = payload.decode("utf-8", errors="ignore")
    if part.get_content_type() == "text/html":
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    return re.sub(r'\s+', ' ', text).strip()


def parse_email(raw_email):
    """Parse a raw email string into sender, subject, and body with cleaned-up text.

    Args:
        raw_email: the full RFC822 message as a ``str``.

    Returns:
        A ``(sender, subject, body)`` tuple. ``sender`` and ``subject`` are the
        From and Subject header values (``None`` when absent). ``body`` is the
        text of the first text/plain or text/html part encountered, with HTML
        markup removed and runs of whitespace collapsed to single spaces, or
        ``None`` when the message has no such part.
    """
    msg = email.message_from_string(raw_email, policy=policy.default)
    sender = msg.get("From")
    subject = msg.get("Subject")

    body = None
    # walk() yields the message itself first and then any sub-parts, so a
    # single loop covers both multipart and single-part messages.
    for part in msg.walk():
        if part.get_content_type() in ("text/plain", "text/html"):
            body = _text_from_part(part)
            break
    return sender, subject, body

In [None]:
def process_email_with_grok(sender, subject, body):
    """Classify one email with the notebook-level ``grok_pipeline`` text generator.

    The model is asked whether the email is a reply to a job application. If it
    is, it should answer ``Company: ..., Job Position: ..., Status: ...``;
    otherwise ``Not a job application email``.

    Args:
        sender: From header value, interpolated into the prompt as-is.
        subject: Subject header value, interpolated into the prompt as-is.
        body: message text, interpolated into the prompt as-is.

    Returns:
        The generated completion with surrounding whitespace stripped, or the
        string ``"Error processing email"`` if the pipeline call raised.
    """
    # NOTE(review): the [INST] ... [/INST] wrapper may not be the chat format
    # the loaded checkpoint expects; confirm against the model card.
    prompt = (
    "[INST] Determine if this email is a response to a job application I sent to the specified company. "
    "If it is, return the result in this exact format: 'Company: [company], Job Position: [position], Status: [status]', "
    "where Status must be one of these four options only: 'received', 'interviewing', 'rejected', or 'job offer'. "
    "If it is not, return exactly this: 'Not a job application email'. "
    "Note: Emails from companies like Glassdoor or LinkedIn about new job postings are not responses to job applications; classify those as 'Not a job application email'. "
    "Do not include any additional explanation or deviation from these formats.\n\n"
    f"Sender: {sender}\nSubject: {subject}\nBody: {body} [/INST]"
    )

    try:
        # Greedy decoding (do_sample=False) takes no temperature, so none is
        # passed. return_full_text=False makes the pipeline hand back only the
        # completion, instead of slicing the prompt off by character count.
        outputs = grok_pipeline(
            prompt,
            max_new_tokens=1000,
            do_sample=False,
            return_full_text=False,
        )
        return outputs[0]["generated_text"].strip()
    except Exception as e:
        # Best effort: one failing email must not stop the whole run.
        print(f"Processing error: {e}")
        return "Error processing email"

In [None]:
import nltk
import pandas as pd

# Download the sentence tokenizer resources.
# NOTE(review): no visible cell uses nltk; confirm before removing.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

def extract_final_output(output):
    """Turn the model's raw answer into a record, or ``None`` if it is not one.

    Everything up to the last ``</think>`` tag is discarded. The remainder is
    expected to look like
    ``Company: <company>, Job Position: <position>, Status: <status>``.

    Args:
        output: the text returned by the model for one email.

    Returns:
        ``{"Company": ..., "Job Position": ..., "Status": ...}`` when the answer
        starts with the expected format (optionally wrapped in quotes or
        markdown emphasis); ``None`` for ``Not a job application email`` and
        for any other text.
    """
    # Keep only what follows the last </think>; with no tag this is the whole text.
    final_output = output.split('</think>')[-1].strip()

    print("Full Response:", output)
    print("Extracted Output:", final_output)

    # The prompt shows the format inside single quotes and models often echo
    # them, so allow leading quote/emphasis characters. The match stays
    # anchored at the start so a format quoted mid-sentence is not mistaken
    # for the answer.
    pattern = r"""[\s'"`*]*Company:\s*(.+?),\s*Job Position:\s*(.+?),\s*Status:\s*(.+)"""
    match = re.match(pattern, final_output)
    if match is None:
        return None  # "Not a job application email" or an unexpected format

    company, position, status = (field.strip() for field in match.groups())
    # Drop the closing quote / emphasis / full stop that trails the status.
    status = status.strip(" \t\r\n'\"`*.")
    return {"Company": company, "Job Position": position, "Status": status}


In [None]:
!pip install nltk pandas openpyxl

In [None]:
# Mailbox credentials come from the Colab secrets store.
username = userdata.get('email')
password = userdata.get('password')

# Pull every Primary-tab message since the start date (no cap on the count).
raw_emails = fetch_emails("imap.gmail.com", username, password, "20-Mar-2025", num_emails=None)

# Classify each message and keep the ones recognised as application replies.
all_data = []
for i, raw_email in enumerate(raw_emails, start=1):
    sender, subject, body = parse_email(raw_email)
    print(f"Email {i}:\nSender: {sender}\nSubject: {subject}\nBody: {body}")

    # The model answer is one string; extract_final_output returns a dict or None.
    result = process_email_with_grok(sender, subject, body)
    extracted = extract_final_output(result)
    if extracted is not None:
        all_data.append(extracted)

    print("Extracted Data (this email):", extracted)
    print("-" * 50)

# One row per recognised application email.
df = pd.DataFrame(all_data)
print("Final DataFrame:\n", df)

# Optional Excel export (disabled):
# excel_filename = "job_applications.xlsx"
# df.to_excel(excel_filename, index=False)

# # Download
# from google.colab import files
# files.download(excel_filename)