# Gmail Smart Compose: Real-Time Assisted Writing

**Dataset:**

**Name** : Enron Email Dataset

**Location** : https://www.cs.cmu.edu/~./enron/

**About** : This dataset was collected and prepared by the CALO Project (A Cognitive Assistant that Learns and Organizes). It contains data from about 150 users, mostly senior management of Enron, organized into folders.




In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the dataset
# stored under MyDrive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import nltk
import pandas as pd
import os
from tqdm import tqdm
from bs4 import BeautifulSoup

In [None]:
#import tarfile

#opening file
#file = tarfile.open('/content/drive/MyDrive/enron_mail_20150507.tar.gz')

#extracting file to /content/drive/MyDrive/CaseStudy2/EmailDataset

#file.extractall('/content/drive/MyDrive/CaseStudy2/EmailDataset')

#file.close()


In [None]:
import glob

# Glob pattern for the `_sent_mail` folder of every first-level directory under
# maildir; `*.` keeps only entries whose name ends with a dot.
path = '/content/drive/MyDrive/CaseStudy2/EmailDataset/maildir/*/_sent_mail/*.'

# Despite the name this is a plain list of matching paths, not an iterator.
file_name_iterator = glob.glob(path)

In [None]:
print(len((file_name_iterator)))

22379


In [None]:
import glob

# Glob pattern for every folder exactly two levels below maildir
# (maildir/<dir>/<dir>/<name>); `*.` keeps only entries whose name ends with a dot.
# NOTE: this rebinds `path`, which previously held the `_sent_mail` pattern.
path = '/content/drive/MyDrive/CaseStudy2/EmailDataset/maildir/*/*/*.'

# List of all matching paths.
all_files = glob.glob(path)

In [None]:
print(len(all_files))

397441


In [None]:
# Paths matched by the broader pattern that are not already in the sent-mail list.
# The result is a set, so it has no defined order.
difference = set(all_files) - set(file_name_iterator)

In [None]:
print(len(difference))

375062


In [None]:
import random

# Number of extra (non sent-mail) files to add, and the seed that fixes which ones.
SAMPLE_SIZE = 70000
SAMPLE_SEED = 42

# A set has no stable iteration order (string hashing is randomised per process),
# so slicing list(set) picked a different subset on every fresh kernel.
# Sorting first and drawing a seeded sample makes the selection reproducible.
difference = sorted(difference)
# min() guards against asking for more files than are available.
difference = random.Random(SAMPLE_SEED).sample(
    difference, min(SAMPLE_SIZE, len(difference))
)

In [None]:
# Add the sampled extra paths to the sent-mail list (mutates the list in place).
# NOTE(review): not idempotent - re-running this cell appends the same paths again.
file_name_iterator.extend(difference)

In [None]:
print(len(file_name_iterator))

92379


In [None]:
file_names = file_name_iterator

In [None]:
len(file_names)

92379

In [None]:
import pandas as pd

In [None]:
data = pd.DataFrame(columns = ['ID','ForeignID','subject','body'])

In [None]:
data.head()

Unnamed: 0,ID,ForeignID,subject,body


In [None]:
def preprocessing_subject(str):
  """Clean the raw text of a ``Subject:`` header.

  Steps, in order:
    1. replace reply / forward markers (``RE:``, ``FW:``, ``Fwd:``, any case)
       with a single space;
    2. replace newline and tab characters with a space;
    3. strip HTML markup with BeautifulSoup.

  Args:
    str: raw subject text. The parameter keeps its original name for backward
      compatibility even though it shadows the built-in inside this function.

  Returns:
    The cleaned subject as a plain string.
  """
  subject = str
  # `\b` keeps the marker from matching inside longer words: the previous
  # unanchored pattern turned "SOFTWARE: patch" into "SOFTWA  patch".
  subject = re.sub(r'\b(?:re|fwd?):', ' ', subject, flags=re.IGNORECASE)
  # Inside a character class `|` is a literal pipe, not alternation, so the old
  # `[\n|\t]` also erased every "|" in the subject. Only whitespace is targeted.
  subject = re.sub(r"[\n\t]", " ", subject)
  # Removing HTML tags: keep only the text content.
  return BeautifulSoup(subject, 'html.parser').text

In [None]:
def preprocessing(str):
  """Clean the raw body text of one message.

  Steps, in order:
    1. blank out e-mail addresses (``process_email``);
    2. delete ``From:``, ``To:``, ``Sent:`` and ``Bcc:`` fields, each from the
       field name through the end of its line;
    3. replace newline and tab characters with a space;
    4. strip HTML markup with BeautifulSoup;
    5. replace anything starting with ``http`` by ``[website]``;
    6. replace person / organisation names by placeholders (``chunking``).

  Args:
    str: raw body text. The parameter keeps its original name for backward
      compatibility even though it shadows the built-in inside this function.

  Returns:
    The cleaned body as a plain string.
  """
  text = process_email(str)
  # The fields are removed one at a time, in this order, exactly as before;
  # merging them into one alternation could change which spans get removed.
  for field in ('From', 'To', 'Sent', 'Bcc'):
    text = re.sub(field + r':.+\n', '', text)
  # Inside a character class `|` is a literal pipe, not alternation, so the old
  # `[\n|\t]` also erased every "|" in the body. Only whitespace is targeted.
  text = re.sub(r"[\n\t]", " ", text)
  # Removing HTML tags: keep only the text content.
  text = BeautifulSoup(text, 'html.parser').text
  # Removing website names.
  text = re.sub(r'http\S+', "[website]", text)
  # Using chunking to replace the names of people and organizations.
  return chunking(text)

In [None]:
# Fetch the NLTK resources used later in the notebook.
# Presumably the tagger backs nltk.tag.pos_tag and the chunker + word list back
# nltk.ne_chunk, both called in `chunking` - confirm against the NLTK version in use.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
def process_email(str):
  """Return the text with every e-mail address replaced by a single space.

  An address is a run of word characters, dots, pluses or hyphens, an ``@``,
  and a domain that contains at least one dot. Text without such a match is
  returned unchanged.
  """
  email_pattern = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
  return email_pattern.sub(' ', str)

In [None]:
def chunking(str):
  """Replace named people and organisations with placeholders.

  The text is split on whitespace, POS-tagged and passed through NLTK's named
  entity chunker. A chunk labelled PERSON becomes ``[name]``, a chunk labelled
  ORGANIZATION becomes ``[org]``; the words of any other labelled chunk and all
  plain tokens are kept as they are.

  Returns:
    A string that starts with one space and has one space after every emitted
    token (so empty input yields ``" "``).
  """
  tagged_tokens = nltk.tag.pos_tag(str.split())  # POS tagging before chunking!
  pieces = []
  for node in nltk.ne_chunk(tagged_tokens):
    # Plain tokens come back as (word, tag) pairs rather than subtrees.
    if not isinstance(node, nltk.tree.Tree):
      pieces.append(node[0])
      continue
    entity = node.label()
    if entity == 'PERSON':
      pieces.append('[name]')
    elif entity == 'ORGANIZATION':
      pieces.append('[org]')
    else:
      # Any other entity type: keep its words untouched.
      pieces.extend(leaf[0] for leaf in node.leaves())
  return " " + "".join(piece + " " for piece in pieces)

In [None]:
chunking("Eric Thode@ENRON 11/30/2000 08:25 AM")

' [name] Thode@ENRON 11/30/2000 08:25 AM '

In [None]:
#file_names = ['Testing.txt']

In [None]:
#file_name = 'Testing.txt'

# Build one record per message found in every mail file.
#
# A file is cut into segments at each "--Original Message--" / "-- Forwarded by"
# line, so a thread stored in one file yields several records. `ID` is a running
# counter over all records; `ForeignID` is the ID of the previous segment of the
# same file, or -1 for the first segment of a file.

# Substrings that mark the first line of a quoted / forwarded message.
THREAD_MARKERS = ('--Original Message--', '-- Forwarded by')
# Header fields that are dropped entirely (lower-case, matched at line start).
SKIPPED_HEADER_PREFIXES = (
    'cc:', 'bcc:', 'message-id:', 'mime-version:', 'content-type:',
    'content-transfer-encoding:', 'x-',
)
# Date-like fields: excluded from the body, their value is not stored.
DATE_HEADER_PREFIXES = ('date:', 'sent:')
SUBJECT_PREFIX = 'subject:'
RECORD_COLUMNS = ['ID', 'ForeignID', 'subject', 'body']


def _is_thread_marker(line):
  """Return True when the line starts a quoted or forwarded message."""
  return any(marker in line for marker in THREAD_MARKERS)


def _split_thread(lines):
  """Split the lines of one mail file into raw (subject, body) pairs.

  One pair is produced per segment, in file order. Header fields are recognised
  only at the start of a line (after indentation and ``>`` quote marks). The old
  substring test dropped every body line that merely contained "cc" (account,
  success, ...) and sliced `Subject:` values at the wrong offset when the field
  did not start at column 0.
  """
  boundaries = [i for i, line in enumerate(lines) if _is_thread_marker(line)]
  boundaries.append(len(lines))

  messages = []
  start = 0
  for end in boundaries:
    subject = ""
    body_lines = []
    for line in lines[start:end]:
      # The marker line itself belongs to the segment it opens; drop it.
      if _is_thread_marker(line):
        continue
      header = line.lstrip(' \t>')
      lowered = header.lower()
      if lowered.startswith(SKIPPED_HEADER_PREFIXES):
        continue
      if lowered.startswith(DATE_HEADER_PREFIXES):
        continue
      if lowered.startswith(SUBJECT_PREFIX):
        # If a segment has several subject lines the last one wins, as before.
        subject = header[len(SUBJECT_PREFIX):]
      else:
        body_lines.append(line)
    messages.append((subject, "".join(body_lines)))
    start = end
  return messages


# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame per row.
records = []
global_id = 0
for file_name in tqdm(file_names):
  with open(file_name, encoding="utf8", errors='ignore') as file:
    lines = file.readlines()
  foreign_id = -1
  for subject, body in _split_thread(lines):
    records.append({
        'ID': global_id,
        'ForeignID': foreign_id,
        'subject': preprocessing_subject(subject),
        'body': preprocessing(body),
    })
    foreign_id = global_id
    global_id += 1

# Rebuilding `data` from scratch also makes the cell safe to re-run.
data = pd.DataFrame(records, columns=RECORD_COLUMNS)
           




100%|██████████| 92379/92379 [5:32:19<00:00,  4.63it/s]


In [None]:
data.head()

Unnamed: 0,ID,ForeignID,subject,body
0,0,-1,,[website]
1,1,-1,,
2,2,-1,interesting article on lisp and software deve...,oops. [website] definitely read at least the ...
3,3,-1,interesting article on lisp and software deve...,"if you have a bit of time, you might find it ..."
4,4,-1,watch other eol products,how do you watch other eol products -- to che...


In [None]:
# Persist the prepared records as CSV, using a path relative to the working directory.
# With default arguments to_csv also writes the row index as a leading column.
data.to_csv('email_prepared_data_v2.csv')

In [None]:
# Preview a handful of prepared bodies: the first five rows and the third from
# last, each followed by a row of asterisks, then the very last row.
separator = '*'*20
for position in (0, 1, 2, 3, 4, -3):
  print(data.iloc[position]['body'])
  print(separator)
print(data.iloc[-1]['body'])

 [website] 
********************
 
********************
 oops. [website] definitely read at least the 2nd to last paragraph in the pdf. [org] HEIZENRADER 05/01/2001 03:38 PM Umm... was there to be an attachment or a URL? Tim if you have a bit of time, you might find it entertaining. 
********************
 if you have a bit of time, you might find it entertaining. 
********************
 how do you watch other eol products -- to check for the fake transaction? 
********************
 I made a new fatboy sheet that should fit all our customers profile. The template is in the new folder P\RealTime\Fatboy\Fatboysheet Template. Hopefully this can limit the number of different sheets we currently are using. I put it in the P drive as it is local and will limit our dependency on terminal server. Geir 
********************
 09:06 AM --------------------------- [name] < > on 05/01/2000 04:26:30 AM Vince, How are you? Hope all is well. Is there any chance we can schedule my visit to [name] on Frid

In [None]:
abc = process_email(data.iloc[0]['body'])

In [None]:
abc

'From: \nTo: , , , \n\t, , , \n\t, , , \n\t, , , \n\t, , , \n\t, , \n\t, , \n\t, , \n\t, , \n\t, , \n\t, \n\nAttached is a redline redraft of the Joint Initial Brief Opposing Limitations\non Lifting the Rate Cap to Exclude Pipeline Short-Term Service.  Please let\nme know if your company intends to be a petitioner or intervener in support\non the brief.  Consistent with an e-mail you will be receiving from Joan\nDreskin, please furnish any comments to me by Wednesday, March 21, and we\nwill e-mail a final draft to you by Tuesday, March 27.\n\nJames D. McKinney, Jr.\n\n - Redline.98-1333.Brief.doc '