In [1]:
from __future__ import print_function
import os.path
import re
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Define the scope for Gmail API access (readonly).
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [3]:
def authenticate_gmail():
    """Authenticate and return a Gmail service instance."""
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    else:
        flow = InstalledAppFlow.from_client_secrets_file('client_secret.json', SCOPES)
        creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return build('gmail', 'v1', credentials=creds)

In [4]:
def get_unread_emails(service):
    """Get all unread emails."""
    try:
        results = service.users().messages().list(userId='me', q='is:unread').execute()
        messages = results.get('messages', [])

        if not messages:
            print("No unread emails found.")
            return []

        unread_emails = []
        for msg in messages:
            message = service.users().messages().get(userId='me', id=msg['id'], format='full').execute()
            subject = next(header['value'] for header in message['payload']['headers'] if header['name'] == 'Subject')
            snippet = message.get('snippet', '')
            link = f"https://mail.google.com/mail/u/3/#inbox/{msg['id']}"  # Create link to email
            unread_emails.append({'subject': subject, 'snippet': snippet, 'link': link, 'id': msg['id']})

        return unread_emails

    except Exception as error:
        print(f'An error occurred: {error}')
        return []

In [5]:
def clean_email_content(content):
    # Remove typical introductory phrases or sections
    content = re.sub(r'(?:From|Sent|To|Subject|Date|Regards|Sincerely|Best regards|On \w+day, \d{1,2} \w+ \d{4}).*$', '', content, flags=re.IGNORECASE)
    content = re.sub(r'\n\s*\n', '\n', content)  # Remove extra newlines
    return content.strip()

In [6]:
def extract_keywords(text, num_keywords=5):
    """Extract keywords from the given text."""
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    # Get the highest scoring keywords
    scores = X.sum(axis=0).A1
    top_keywords = sorted(zip(scores, keywords), reverse=True)[:num_keywords]
    return [keyword for score, keyword in top_keywords]

In [7]:
def summarize_email(email_content):
    """Summarize the email content using a Hugging Face model."""
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(email_content, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']

In [8]:
if __name__ == '__main__':
    # Authenticate and create the Gmail service
    service = authenticate_gmail()
    
    # Get and process unread emails
    unread_emails = get_unread_emails(service)
    
    for email in unread_emails:
        email_content = email['snippet']  # Assuming snippet is enough; otherwise, you may need to fetch the full content
        cleaned_content = clean_email_content(email_content)
        
        if cleaned_content:
            summary = summarize_email(cleaned_content)
            keywords = extract_keywords(cleaned_content)

            print(f"Subject: {email['subject']}")
            print(f"Summary: {summary}")
            print(f"Keywords: {keywords}")
            print(f"Link: {email['link']}\n")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Subject: Re: Regarding 4 Credit Project Elective for the Upcoming Semester
Summary: I have a project. Dear Students, I have aproject. I'm working on a project for my students. Please help me with the project. Please email me at jennifer.smith@mailonline.co.uk.
Keywords: ['students', 'project', 'dear']
Link: https://mail.google.com/mail/u/3/#inbox/192f6a090d899594



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 58. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)


Subject: T1-24-25-DAS 703: Updating Project Group and Title in Googledoc
Summary: Updating Project Group and Title in Googledoc by Prof. Uttam Kumar - Monday, 4. T1-24-25-DAS 703 » Forums » Announcements » Updating Project group and title in Go Googleledoc
Keywords: ['uttam', 'updating', 'title', 'project', 'prof']
Link: https://mail.google.com/mail/u/3/#inbox/192f618c87eff600



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Subject: T1-24-CSE 303: Project Deliverable 1
Summary: Project Deliverable 1 by IMT2021082 Vidhu Arora - Monday, 4 November 2024, 12:05 PM. T1-24-CSE 303 » Forums » Announcements » Project deliverable 1.
Keywords: ['vidhu', 'project', 'imt2021082', 'deliverable', 'arora']
Link: https://mail.google.com/mail/u/3/#inbox/192f5e8496928362



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


Subject: Microsoft is hiring Technical Program Manager Intern at INR 40,000/Month!
Summary: CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Visit CNN.com/Travel each week for a new gallery of snapshots. Click here for the gallery.
Keywords: ['tap']
Link: https://mail.google.com/mail/u/3/#inbox/192f5a1b4889ee32



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 150, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


Subject: Samvaad talk on "An empirical analysis of prices of radio spectrum for mobile communication services" by Dr. V. Sridhar
Summary: This is a gentle reminder for the weekly Samvaad Talk happening on the blog. Greetings everyone, This is a Gentle reminder for  the weekly SAMVAAD Talk happening online. This week, we will be talking about the recent election in the U.S.
Keywords: ['weekly', 'talk', 'samvaad', 'reminder', 'happening']
Link: https://mail.google.com/mail/u/3/#inbox/192f59c27e45ae03

