# Imports

In [14]:
import pypff
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import HTML
import re, os
import email, datetime
from tqdm import tqdm
tqdm.pandas()

In [15]:
print('Session PID :', os.getpid())

start_time = datetime.datetime.now()
path = "/home/belalm/email_analysis/data/raw/gmail_pst.pst"  # PST file path
opst = pypff.open(path)
root = opst.get_root_folder()

Session PID : 7416


# Data Ingest

In [16]:
def get_data_from_object(obj):
    number_of_sub_folders = obj.get_number_of_sub_folders()
    number_of_entries = obj.get_number_of_entries()
    number_of_sub_items = obj.get_number_of_sub_items() 
    number_of_sub_messages = obj.get_number_of_sub_messages()
    number_of_record_sets = obj.get_number_of_record_sets()
    entries = [obj.get_entry(i) for i in range(number_of_entries)]
    folders = [obj.get_sub_folder(i) for i in range(number_of_sub_folders)]
    data = {i.get_name(): i for i in folders} 
    final_dict = {
        'number_of_sub_folders': number_of_sub_folders,
        'number_of_entries': number_of_entries,
        'number_of_sub_items': number_of_sub_items,
        'number_of_sub_messages': number_of_sub_messages,
        'number_of_record_sets': number_of_record_sets,
        'data': data,
        'entries': entries
    }
    return final_dict

In [17]:
root_data = get_data_from_object(root)
root_data

AttributeError: 'pypff.folder' object has no attribute 'get_entry'

In [12]:
Outlook = root_data['data']['Top of Outlook data file']
Outlook_Data = get_data_from_object(Outlook)
Outlook_Data

{'number_of_sub_folders': 8,
 'number_of_entries': 4,
 'number_of_sub_items': 8,
 'number_of_sub_messages': 0,
 'number_of_record_sets': 1,
 'data': {'Deleted Items': <pypff.folder at 0x7f75375d8060>,
  'Inbox': <pypff.folder at 0x7f7509f5bf90>,
  'Outbox': <pypff.folder at 0x7f7509f5b3f0>,
  'Sync Issues (This computer only)': <pypff.folder at 0x7f754c133720>,
  '[Gmail]': <pypff.folder at 0x7f7509e10300>,
  'Banking': <pypff.folder at 0x7f7509e11e60>,
  'Notes': <pypff.folder at 0x7f7509e106c0>,
  'Receipt': <pypff.folder at 0x7f7509e13540>}}

In [13]:
Inbox = Outlook_Data['data']['Inbox']
Inbox_Data = get_data_from_object(Inbox)
Inbox_Data

{'number_of_sub_folders': 0,
 'number_of_entries': 20,
 'number_of_sub_items': 5039,
 'number_of_sub_messages': 5039,
 'number_of_record_sets': 1,
 'data': {}}

In [None]:
Sent = Outlook_Data['data']['Sent Items']
Sent_Data = get_data_from_object(Sent)
Sent_Data

# Sample Mail

In [None]:
def displayInhtml(index):

    sample=Inbox.get_sub_message(index)

    if sample.html_body:

        html_string = sample.html_body.decode("utf-8")

        display(HTML(html_string))


sample=Inbox.get_sub_message(576)

print(sample.plain_text_body.decode('UTF-8'))

## HTML Format

In [None]:
if sample.html_body:
    html_string = sample.html_body.decode("utf-8")
    display(HTML(html_string))

if sample.rtf_body:
    rtf = sample.rtf_body.decode("utf-8")
    display(HTML(rtf))

### Available Data

In [None]:
for i in dir(sample):
    if i.startswith('get'):
        print(i)

### Convert To Data

In [None]:
def processMessage(message):
    """
    The processMessage function processes multi-field messages to simplify collection of information
    :param message: pypff.Message object
    :return: A dictionary with message fields (values) and their data (keys)
    """
    return {
        "id": message.identifier,
        "subject": message.subject,
        "sender_name": message.sender_name,
        "header": message.transport_headers,
        "body": message.plain_text_body.decode('utf-8') if message.plain_text_body is not None else None,
        "creation_time": message.creation_time,
        "submit_time": message.client_submit_time,
        "delivery_time": message.delivery_time,
        "attachment_count": message.number_of_attachments,
    }
def checkForMessages(folder):
    """
    The checkForMessages function reads folder messages if present and passes them to the report function
    :param folder: pypff.Folder object
    :return: None
    """
    message_list = []
    for message in tqdm(folder.sub_messages):
        message_dict = processMessage(message)
        message_list.append(message_dict)
    df = pd.DataFrame(message_list)
    df.insert(0, 'Folder', folder.name)
    return df

def is_valid_email(email):
    """
    The is_valid_email function checks if an email address is valid
    :param email: str
    :return: Tuple (bool, str)
    """
    if email is None:
        return False, None
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,5}$'
    if re.match(pattern, email):
        return True, email
    else:
        return False, None

def get_data_from_header(header):
    """
    The get_data_from_header function extracts email addresses from the email header
    :param header: str
    :return: Tuple (str, str, str, str)
    """
    if header is None:
        return None, None, None, None
    header = email.message_from_string(header)
    From = header.get('From')
    To = header.get('To')
    CC = header.get('CC')
    BCC = header.get('BCC')
    return From, To, CC, BCC

In [None]:
df = pd.DataFrame()
for fol, folder in Outlook_Data['data'].items():
    print("Folder Name : ", fol) # Inbox, Sent, etc.
    temp = checkForMessages(folder)
    df = df.append(temp)

# df.id=df.id.astype('int')

df['From'], df['To'], df['CC'], df['BCC'] = zip(*df.header.progress_apply(get_data_from_header))
df.Folder.value_counts()

In [None]:
import email
from email.header import decode_header

# Process Emails

In [None]:
def get_external_flag(email):
    if email is None:
        return None
    if 'qib.com.qa' in email.lower():
        return 0
    else:
        return 1

def get_mail_flag(subject):
    if subject is None:
        return 'SUBJECT NOT FOUND'
    if subject.upper().startswith('[MARKETING]'):
        return "MARKETING"
    if subject.upper().startswith('WARNING:'):
        return "WARNING"
    if subject.upper().startswith('RE:'):
        return "REPLY"
    if subject.upper().startswith('FW:'):
        return "FORWARDED"
    else:
        return "NA"
    
# Function to check if an email is the first in a conversation
def is_first_in_conversation(email_message):
    flag = 0
    subject = email_message.subject
    email_message = email_message.header
    if email_message is None:
        return None
    email_message = email.message_from_string(email_message)
    message_id = email_message.get("Message-ID")
    in_reply_to = email_message.get("In-Reply-To")  # "Thread-Topic"
    # If Message-ID is present and In-Reply-To is missing or empty, it's likely the first email
    if message_id and (not in_reply_to or in_reply_to.strip() == "<>"):
        flag = 1
    if subject is not None:
        if flag and not subject.lower().startswith('re:'):
            flag = 1
        else:
            flag = 0
    return flag

def is_english_or_arabic(text, check_arabic=True):
    if text is None:
        return False
    try:
        text = text.strip()
        # Define regex patterns for English and Arabic characters
        english_pattern = re.compile(r"[A-Za-z]")
        arabic_pattern = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]")
        # Check if the text contains English characters
        if english_pattern.search(text) and not check_arabic:
            return True
        # Check if the text contains Arabic characters
        if arabic_pattern.search(text) and check_arabic:
            return True
    except:
        pass
    return False

def extract_emails_and_domains(text):
    if text is None:
        return None, None
    # Define a regex pattern for matching email addresses
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # Use re.findall to find all email addresses in the text
    emails = re.findall(email_pattern, text)
    # Extract domains from email addresses
    domains = [re.search(r'@([A-Za-z0-9.-]+\.[A-Za-z]{2,})', email).group(1) for email in emails]
    return ''.join(emails), ''.join(domains).lower()

In [None]:
df['FROM_EMAIL'], df['FROM_DOMAIN'] = zip(*df.From.progress_apply(extract_emails_and_domains))
df['is_from_external'] = df.From.progress_apply(get_external_flag)
df['is_first_in_conversation'] = df.progress_apply(is_first_in_conversation, axis=1)
df['SUBJECT_FLAG'] = df.subject.progress_apply(get_mail_flag)

In [None]:
def get_first_mail(email_text):
    if email_text is None:
        return None
    email_text = re.sub(r'\s+', ' ', email_text.split('From: ')[0])
    email_text = email_text.strip()
    return email_text

In [None]:
text_for_remove = [
    "CAUTION: This email originated from outside QIB. Do not click any links or open attachments unless you are sure of the safety of the contents."
]

In [None]:
def clean_mail(email_text, text_for_remove=None):
    if email_text is None:
        return None
    try:
        email_text = email_text.decode('utf-8')
    except:
        pass
    pattern = re.compile(r'<.*?>')
    # Use sub() method to replace matched tags with an empty string
    email_text = re.sub(pattern, '', email_text)
    if text_for_remove:
        for i in text_for_remove:
            email_text = email_text.replace(i, "")
   
    signature_pattern = r'--\s*\n.*$|\n\s*sent from[^\n]*|\n\s*disclaimer[^\n]*|\n\s*confidential[^\n]*|\n\s*unsubscribe[^\n]*'
    # Remove the signature pattern from the email text
    email_text = re.sub(signature_pattern, '', email_text, flags=re.IGNORECASE)
    email_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', email_text)
    email_text = re.sub(r'\S+@\S+', '', email_text)
    email_text = email_text.split('From: ')
    email_text = [re.sub(r'\s+', ' ', i) for i in email_text]
    email_text = '\n\nFrom: '.join(email_text)
    email_text = email_text.strip()
    if text_for_remove:
        for i in text_for_remove:
            email_text = email_text.replace(i, "")
    pattern = r'Sent from Yahoo Mail on Android <.*?>'
    email_text = re.sub(pattern, '', email_text)
    email_text = re.sub(r'-+|<.*?>', '', email_text)
    email_text = re.sub(r'\S+@\S+', '', email_text)
    email_text = re.sub(r'\d', '', email_text)
    footer_pattern = re.compile(r'Unsbcrib.*?Qatar Islamic Bank.*?Â©')
    email_text = re.sub(footer_pattern, '', email_text)
   
    pattern = re.compile(re.escape('____') + r'.*')
    # Remove text after the marker
    email_text = re.sub(pattern, '', email_text)
   
    return email_text

In [None]:
email_text = sample.plain_text_body.decode('utf-8')
print(email_text)
print(clean_mail(email_text, text_for_remove))
print(email_text)
print(clean_mail(sample.plain_text_body.decode('utf-8'), text_for_remove))

clean_mail(df.iloc[10].body, text_for_remove)

df['clean_body'] = df.body.progress_apply(lambda x: clean_mail(x, text_for_remove))
df['first_mail'] = df.clean_body.progress_apply(get_first_mail)

df['is_english'] = df.first_mail.progress_apply(lambda x: is_english_or_arabic(x, False)).astype(int)
df['is_arabic'] = df.first_mail.progress_apply(lambda x: is_english_or_arabic(x)).astype(int)

# N - Gram

In [None]:
import os
# os.environ['XDG_CACHE_HOME']='Z:\AI_Models'
from keybert import KeyBERT

kw_model = KeyBERT()
seed_keywords = None
stop_words = ['hello', 'peace', 'best regards']

In [None]:
def remove_stop_words(input_string, stop_words):
    # Tokenize the input string into words
    input_string = input_string.lower()
    for word in stop_words:
        input_string = input_string.replace(word.lower(), '')
    words = input_string.split()
    # Remove stop words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Reconstruct the string without stop words
    output_string = ' '.join(filtered_words)
    return output_string

def get_ngram(text, n=1, top_n=100, stop_words=None):
    try:
        text = remove_stop_words(text, stop_words)
        keywords = kw_model.extract_keywords(text,
                                             keyphrase_ngram_range=(n, n),
                                             stop_words=stop_words,
                                             top_n=top_n, seed_keywords=seed_keywords)
        return ",".join([i[0] for i in keywords])
    except Exception as e:
        print(e)
        return None

In [None]:
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def get_ngram_nltk(text, n=1, flag=True):
    out = None
    try:
        text = remove_stop_words(text, stop_words)
        if text is None or len(text) < 10:
            return ''
        text = re.sub('[^A-Za-z0-9\s+]+', '', text.strip())
        if flag:
            text = ' '.join(text.split())  
            out = ngrams(text.strip().split(), n)
            out = [' '.join(i) for i in out]
            out = ','.join(out)
        else:
            vect = CountVectorizer(stop_words='english', ngram_range=(n, n))
            vect.fit_transform([text.strip()])
            out = ','.join(vect.get_feature_names_out())
    except:
        print(print(">>", text))
   
    return out

In [None]:
df['UNIGRAM'] = df.first_mail.progress_apply(lambda x: get_ngram_nltk(x, 1, False))
df['BIGRAM'] = df.first_mail.progress_apply(lambda x: get_ngram_nltk(x, 2))
df['TRIGRAM'] = df.first_mail.progress_apply(lambda x: get_ngram_nltk(x, 3))

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
remove_wordss = ['email', 'dear', 'regards', 'thank',
                 'kindly', 'thanks', 'hi', 'im', 'sir',
                 'in the', 'of the', 'to the', 'for the', 'want to', 'you are', 'all rights reserved', 'if you',
                 'and the', 'want to', 'on the',
                ]

In [None]:
def get_worldcloud(column):
    column = column[~column.isna()].tolist()
    one_string = ','.join(column)
    one_string = re.sub(' +|\s+', ' ', one_string)
    one_string = re.sub(' +|\s+', ' ', one_string)
    one_string = re.sub('\s+,', ',', one_string)
    one_string = re.sub(',\s+', ',', one_string)
    listword = one_string.split(',')
    listword = [i for i in listword if i not in remove_wordss]
    ngram_freq = Counter(listword)
    word_cloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(ngram_freq)
    return ngram_freq, word_cloud

In [None]:
df1 = df[(df.Folder == 'Inbox') &
         (df.is_from_external == 1) &
         (df.is_english == 1) &
         (df.is_arabic == 0) &
         (df.is_first_in_conversation == 1) &
         (df.SUBJECT_FLAG != 'MARKETING') &
         (df.FROM_DOMAIN != 'hdfcbank.com')
        ]

ngramdf = []
for col in ['UNIGRAM', 'BIGRAM', 'TRIGRAM']:
    unigram_freq, word_cloud_unigram = get_worldcloud(df1[col])
    unigram = pd.DataFrame(unigram_freq, index=['Count']).rename_axis(col).T.sort_values('Count', ascending=False)
    unigram = unigram.reset_index().rename(columns={'index': col})
    unigram.to_csv(f'{col}.csv', index=False)
    ngramdf.append(unigram)
    # Display the generated Word Cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(word_cloud_unigram)
    plt.axis('off')
    plt.suptitle(f'{col} Keywords', fontsize=16)
    plt.savefig(f"{col}.png")
    plt.show()

ngram_df = pd.concat(ngramdf, axis=1)
ngram_df.to_csv('ngram_data.csv', index=False)

In [None]:
import os
# os.environ['XDG_CACHE_HOME']='Z:\AI_Models'
from stormtrooper import ZeroShotClassifier

labels = ['COMPLAINTS',
          'SALES_REQUESTS',
          'SERVICE_REQUESTS',
          'APPRECIATIONS',
          'SPAM',
          'OTHER',
          'FOLLOW_UP']

model = ZeroShotClassifier("facebook/bart-large-mnli").fit(None, labels)

labels = ['COMPLAINTS',
          'SALES_REQUESTS',
          'SERVICE_REQUESTS',
          'APPRECIATIONS',
          'SPAM',
          'OTHER',
          'FOLLOW_UP']
labels = sorted(labels)
for i, j in enumerate(labels):
    k = j.replace('_', " ")
    print(f"## {i+1}.{k.title()}<a id='S{i}'></a>")  # 1. Customer Request

df1.to_excel('email_20231210.xlsx')

model.set_output(transform='pandas')

df1 = df[(df.Folder == 'Inbox') &
         (df.is_from_external == 1) &
         (df.is_english == 1) &
         (df.is_arabic == 0) &
         (df.is_first_in_conversation == 1) &
         (df.SUBJECT_FLAG != 'MARKETING') &
         (df.FROM_DOMAIN != 'hdfcbank.com')
        ]