In [8]:
from IPython.display import display, HTML
# Widen the notebook's main container to 90% of the browser window by
# injecting a CSS override into the page.
notebook_width_css = "<style>.container { width:90% !important; }</style>"
display(HTML(notebook_width_css))

In [9]:
from __future__ import print_function

import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [10]:
# OAuth scopes requested when authorising against the Gmail API.
# If modifying these scopes, delete the file token.json so that a token
# carrying the new scopes is generated on the next run.
SCOPES = [
    "https://www.googleapis.com/auth/gmail.modify",
]
# SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [11]:
# Load previously stored user credentials from token.json when that file is
# present; otherwise start with no credentials at all.
creds = (
    Credentials.from_authorized_user_file('token.json', SCOPES)
    if os.path.exists('token.json')
    else None
)

# Missing or invalid credentials: refresh them when an expired token with a
# refresh token is available, otherwise run the local-server OAuth flow
# using credentials.json.
if not (creds and creds.valid):
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)

    # Save the credentials for the next run. This happens after either
    # branch above, i.e. for refreshed and for newly obtained credentials.
    with open('token.json', 'w') as token_file:
        token_file.write(creds.to_json())

### Other scopes

If you change the scope, delete token.json and generate a new token using the relevant scope.

<table>

<tbody><tr>
  <th width="450">Scope code</th>
  <th>Description</th>
  <th>Usage</th>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>labels</code></td>
  <td>Create, read, update, and delete labels only.</td>
  <td>Recommended</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>send</code></td>
  <td>Send messages only. No read or modify privileges on mailbox.</td>
  <td>Sensitive</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>readonly</code></td>
  <td>Read all resources and their metadata—no write operations.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>compose</code></td>
  <td>Create, read, update, and delete drafts. Send messages and drafts.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>insert</code></td>
  <td>Insert and import messages only.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>modify</code></td>
  <td>All read/write operations except immediate, permanent deletion of
  threads and messages, bypassing Trash.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>metadata</code></td>
  <td>Read resources metadata including labels, history records, and email 
  message headers, but not the message body or attachments.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>settings.<wbr>basic</code></td>
  <td>Manage basic mail settings.</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>www.<wbr>googleapis.<wbr>com/<wbr>auth/<wbr>gmail.<wbr>settings.<wbr>sharing</code></td>
  <td>
Manage sensitive mail settings, including forwarding rules and aliases.
<br>
<br>
<strong>Note:</strong> Operations guarded by this scope are restricted to
administrative use only. They are only available to Google Workspace customers
using a service account with domain-wide delegation.
</td>
  <td>Restricted</td>
</tr>
<tr>
  <td><code translate="no" dir="ltr">https:/<wbr>/<wbr>mail.<wbr>google.<wbr>com/<wbr></code></td>
  <td>Full access to the account’s mailboxes, including permanent deletion of
    threads and messages. This scope should only be requested if your application
    needs to immediately and permanently delete threads and messages, bypassing
    Trash; all other actions can be performed with less permissive scopes.</td>
  <td>Restricted</td>
</tr>
</tbody></table>

In [12]:
# Build the Gmail API (v1) client using the credentials obtained above.
service = build(serviceName='gmail', version='v1', credentials=creds)

In [13]:
# Reading mails with labels present in label_ids.
#
# Pages through users().messages().list() (up to 500 entries per request)
# and collects every returned entry in message_id_list. Each entry is a
# dict; the 'id' key is what the download step uses later.
label_ids = ['INBOX']

message_id_list = []
iter_num = 0          # number of list requests issued
page_token = None     # None -> first page

while True:
    list_kwargs = {'userId': 'me', 'maxResults': 500, 'labelIds': label_ids}
    if page_token:
        list_kwargs['pageToken'] = page_token

    results = service.users().messages().list(**list_kwargs).execute()
    iter_num += 1

    # "messages" can be absent from the response when nothing matches the
    # filter, so fall back to an empty list instead of raising KeyError.
    message_id_list += results.get("messages", [])

    # No nextPageToken means this was the last page.
    page_token = results.get('nextPageToken')
    if not page_token:
        break

print(f"Total Number of iterations = {iter_num}")
print(f"Total Number of mails fetched = {len(message_id_list)}")

Total Number of iterations = 3
Total Number of mails fetched = 1381


In [10]:
# Download every message listed in message_id_list and dump the raw API
# responses to a CSV inside base_dir. Responses are buffered in memory and
# appended to the file every dump_freq messages, so an interrupted run
# keeps what was already written. The file is named with the start time
# and renamed to include the end time once the download completes.
base_dir = "emails_dir"
dump_freq = 10  # number of messages buffered between CSV flushes

# Creating base dir if not already present
if not os.path.isdir(base_dir):
    print(f"'{base_dir}' not in current working directory. Creating {os.path.join(os.getcwd(), base_dir)}")
    os.makedirs(os.path.join(os.getcwd(), base_dir))
else:
    print("Path already exists. Files maybe overwritten if present")


# Starting time stamp for file name
start = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
csv_path = os.path.join(base_dir, f"email_{start}.csv")

message_response = []
header_pending = True  # the header row goes out with the first flush only
for elem in tqdm(message_id_list):
    message_response_json = service.users().messages().get(userId="me", id=elem['id']).execute()
    message_response.append(message_response_json)

    # Flush a full buffer to disk and start a new one.
    if len(message_response) >= dump_freq:
        pd.DataFrame(message_response).to_csv(csv_path, header=header_pending, index=False, mode="a")
        header_pending = False
        message_response = []

# Flush whatever is left. When fewer than dump_freq messages were fetched
# in total this is the first write, so it still carries the header row.
if message_response:
    pd.DataFrame(message_response).to_csv(csv_path, header=header_pending, index=False, mode="a")

# End time stamp for file renaming
end = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")

# Renaming the file (only if something was actually written)
if os.path.exists(csv_path):
    os.rename(csv_path, os.path.join(base_dir, f"email_{start}_{end}.csv"))
else:
    print("No messages were fetched; no CSV file was written.")

  0%|                                                                                                                                                                                                                | 0/2088 [00:00<?, ?it/s]

Path already exists. Files maybe overwritten if present


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2088/2088 [15:42<00:00,  2.22it/s]


In [8]:
# List every label (system and user-defined) of the authenticated account
# and show the raw API response as the cell output.
labels_request = service.users().labels().list(userId='me')
results = labels_request.execute()
results

{'labels': [{'id': 'CHAT',
   'name': 'CHAT',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelHide',
   'type': 'system'},
  {'id': 'SENT',
   'name': 'SENT',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelShow',
   'type': 'system'},
  {'id': 'INBOX',
   'name': 'INBOX',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelShow',
   'type': 'system'},
  {'id': 'IMPORTANT',
   'name': 'IMPORTANT',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelShow',
   'type': 'system'},
  {'id': 'TRASH',
   'name': 'TRASH',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelHide',
   'type': 'system'},
  {'id': 'DRAFT',
   'name': 'DRAFT',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelShow',
   'type': 'system'},
  {'id': 'SPAM',
   'name': 'SPAM',
   'messageListVisibility': 'hide',
   'labelListVisibility': 'labelHide',
   'type': 'system'},
  {'id': 'CATEGORY_FORUMS', 'name': 'CATEGO

In [None]:
# ## Other useful commands

# # To change label (can be used to set label to trash, thus deleting it)
# service.users().messages().modify(userId='me', id="156e185c51463696", body={'addLabelIds': ['TRASH']}).execute()