In [10]:
import pandas as pd
from pandas import Series
from Keywords import *
from nltk import sent_tokenize
import numpy as np
from rake_nltk import Rake, Metric
import yake

# Load the dataset used by every cell below; later code reads its
# 'description', 'Subject' and 'Comment' columns.
df = pd.read_csv('CleanedPolite.csv')
# series_nltk = df['description'].apply(get_comment_from_pos)
# print(series_nltk)

# test_row = df.iloc[2]

def remove_suffix(s: str, end: str, add: str = '', ignore_errors: bool = True):
    """
    Cut `s` off just before the last occurrence of `end`, then append `add`.

    Args:
        s: the string to truncate.
        end: marker to look for; its last occurrence and everything after
            it are dropped.
        add: text appended to the truncated string.
        ignore_errors: when True (default), a missing marker is not an
            error and `s` is returned unchanged (nothing is cut and `add`
            is not appended). When False, a missing marker raises.

    Returns:
        The truncated string plus `add`, or the original `s` if `end` is
        not found and `ignore_errors` is True.

    Raises:
        ValueError: if `end` is not found and `ignore_errors` is False.
    """
    if ignore_errors:
        i = s.rfind(end)
        if i == -1:
            # rfind signals "not found" with -1; slicing with it would
            # silently chop the last character off the string.
            return s
    else:
        # rindex raises ValueError itself when the marker is missing.
        i = s.rindex(end)
    return s[:i] + add



In [11]:
def get_complete_body(data_row: Series) -> str:
    """
    Build the lowercased "<subject>, <email text>" string for one row.

    Reads the 'description' and 'Subject' entries of data_row. The
    description is passed through shorten_email(); the subject has its
    first three characters dropped and is then trimmed with
    remove_suffix(..., ' - ').

    Returns: the combined string, fully lowercased.
    """
    email_text = shorten_email(data_row['description']).lower()
    # NOTE(review): the [3:] presumably skips a fixed-width subject prefix;
    # confirm against the data.
    subject_text = remove_suffix(data_row['Subject'][3:], ' - ')
    return (subject_text + ', ' + email_text).lower()

# Trying out YAKE
def yake_extract(complete_body: str) -> list | list[tuple[str, float]]:
    """
    Run the YAKE keyword extractor over complete_body (the string built by
    get_complete_body()).

    Returns: whatever the extractor's extract_keywords() yields, used by
    callers as (phrase, score) pairs. Lower score is better.
    """
    extractor = yake.KeywordExtractor(
        lan="en",          # language
        n=5,               # max n-gram size
        dedupLim=0.9,      # deduplication threshold
        windowsSize=3,     # window size
        top=6,             # number of keywords to return
    )
    return extractor.extract_keywords(complete_body)

In [12]:
# Trying out RAKE
def rake_extract(complete_body: str) -> list[tuple[float, str]]:
    """
    Run the RAKE keyword extractor over complete_body (the string built by
    get_complete_body()), ranking by word frequency.

    Returns: the extractor's ranked phrases with their scores, as
    (score, phrase) pairs. For Rake, a higher score is better.
    """
    # The previous version built a stopword set here but never passed it to
    # Rake, so it had no effect on the result; it has been dropped.
    rake_extractor = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    rake_extractor.extract_keywords_from_text(complete_body)
    return rake_extractor.get_ranked_phrases_with_scores()

In [13]:
# Combining Yake and Rake
def get_comment_yake(y_phrases, r_phrases) -> str:
    """
    Pick the email comment by cross-matching Rake and Yake phrases.

    Inputs:
        y_phrases: list of keyword phrases from Yake using complete_body
        r_phrases: list of keyword phrases from Rake using complete_body

    Rake phrases are tried in order; for each one, the Yake phrases are
    scanned in order and the first Yake phrase that contains the Rake
    phrase as a substring wins.

    Returns: the matching Yake phrase. If no Rake phrase occurs inside any
    Yake phrase, the result is None.
    """
    matches = (
        yake_phrase
        for rake_phrase in r_phrases
        for yake_phrase in y_phrases
        if rake_phrase in yake_phrase
    )
    return next(matches, None)

# One generated comment per row of df, in row order.
comments = []

# For each row: build the combined subject/body text, extract keyword
# phrases with both extractors, and let get_comment_yake pick the comment.
for i, row in df.iterrows():
    a = get_complete_body(row)
    b = yake_extract(a)
    c = rake_extract(a)
    
    # Keep only the phrase strings: the first element of each Yake pair,
    # the second element of each Rake pair.
    y = [i for i,j in b]
    r = [j for i,j in c]

    comments.append(get_comment_yake(y,r))

# Attach the generated comments as a new column and write the frame out.
df['Test Comments'] = comments
df.to_csv('TestOutput.csv')

In [14]:
# Debug cell: run the whole pipeline on a single row and print every
# intermediate value.
mini_test_rows = [2] # Rows that worked previously (not used below)
test_row = df.iloc[6] # Row 6 shows we can't assume the email ends after "thank you"
a = get_complete_body(test_row)
b = yake_extract(a)
c = rake_extract(a)

# Keep only the phrase strings from each extractor's result pairs.
y = [i for i,j in b]
r = [j for i,j in c]

# Chosen comment first, then the combined text, the Yake phrases, the Rake
# phrases, and finally the row's 'description' value.
print(get_comment_yake(y,r))
print('AAAAAA', a)
print('YYYYYYY', y)
print('RRRRRRR', r)
print(test_row['description'])
# print(test_row['Comment'])

# sentences = nltk.sent_tokenize(a)
# words = [word for sent in sentences for word in nltk.word_tokenize(sent)]
# nltk.pos_tag(words)

shapes in a politemail template
AAAAAA building buttons/shapes in a politemail template,  greetings,  
YYYYYYY ['shapes in a politemail template', 'building buttons', 'shapes in a politemail', 'politemail template', 'building', 'buttons']
RRRRRRR ['politemail template', 'building buttons', 'shapes', 'greetings']
Email To: politemailsupport@uchicago.edu  Email From: bransom@uchicago.edu  Email Text: Greetings,  Thanks again for your PoliteMail training a few weeks ago. I'm starting to deploy messages and things are going well.  I'm wondering if you could help me navigate an issue with building shapes and buttons in a template that are well-centered in the message. I'd like to build something resembling a button (with a "Watch Video" CTA) that would be perfectly centered. I'm having trouble with a particular case where the button is slightly off center and things don't quite look right. I can forward to you to give you a sense of this.  Would you be willing to talk through this and help 

In [15]:
# Counting number of pos tags and words
# NOTE(review): the two expressions below are evaluated and discarded; a
# notebook cell only displays its last expression. Move them to their own
# cells (or wrap them in display()) if their output is wanted.
df['Comment'].apply(lambda x: len(word_tokenize(x))).value_counts()
df['Comment'][df['Comment'].apply(lambda x: len(word_tokenize(x))) >= 5]

# tag_count: POS tag -> number of tokens carrying that tag.
# tag_word:  POS tag -> every token carrying that tag (duplicates kept).
tag_count: dict[str, int] = {}
tag_word: dict[str, list[str]] = {}
comment_tags = df['Comment'].apply(nltk.word_tokenize).apply(nltk.pos_tag)
for tag_list in comment_tags:
    for tag in tag_list:
        # tag[0] is the token, tag[1] is its POS tag.
        tag_count[tag[1]] = tag_count.get(tag[1], 0) + 1
        tag_word.setdefault(tag[1], []).append(tag[0])

tag_count
# print(len(set(tag_word['NNP'])))
# Occurrences of each token tagged 'NNP', counted in a single pass; stays
# empty instead of raising KeyError when no token received that tag.
word_count: dict[str, int] = {}
for word in tag_word.get('NNP', []):
    word_count[word] = word_count.get(word, 0) + 1
# word_count = np.array(list(word_count.values()))
# word_count
# word_count