In [1]:
import pandas as pd
from pandas import Series
from Keywords import *
from nltk import sent_tokenize
import numpy as np
from rake_nltk import Rake, Metric
import yake

# Load the email dataset from a path relative to the working directory.
# Later cells read its 'description', 'Subject' and 'Comment' columns.
df = pd.read_csv('CleanedPolite.csv')

In [2]:
def remove_suffix(s: str, end: str, add: str = '', ignore_errors: bool = True) -> str:
    """
    Cuts a string (s) just before the last occurrence of the cut off str (end)
    and appends (add) to what is left.

    Inputs:
        s: string to shorten
        end: marker to cut at; the last occurrence in s is used
        add: text appended to the shortened string
        ignore_errors: when True, a missing end returns s unchanged; when
            False, a missing end raises ValueError

    Returns: str of s up to (not including) the last end, followed by add. If
    end is not found and ignore_errors is True, s is returned untouched and
    add is not appended.

    Raises: ValueError if end is not found and ignore_errors is False
    """
    if ignore_errors:
        i = s.rfind(end)
        # rfind signals "not found" with -1; slicing with it would silently
        # drop the last character, so return the original string instead.
        if i == -1:
            return s
    else:
        i = s.rindex(end)  # raises ValueError when end is absent
    return s[:i] + add

# Preps email row before keyword extraction
def get_complete_body(data_row: Series) -> str:
    """
    Builds the single lowercase text blob used for keyword extraction from one
    row of email data.

    The 'description' value is passed through shorten_email. The 'Subject'
    value loses its first three characters (the "Q:" prefix, per the original
    note) and is then cut with remove_suffix at ' - ' (the name part, per the
    original note). The two pieces are joined as "<subject>, <email>".

    Input:
        data_row: row corresponding to a single direct inject; must provide
            'description' and 'Subject'

    Returns: str of a cleaned email description, ready for keyword extraction
    """
    # 'description' is read before 'Subject', matching the original order.
    email_text = shorten_email(data_row['description']).lower()
    subject_text = remove_suffix(data_row['Subject'][3:], ' - ')

    # The final lower() is what lowercases the subject part.
    return ', '.join((subject_text, email_text)).lower()

# Trying out YAKE
def yake_extract(complete_body: str) -> list | list[tuple[str, float]]:
    """
    Runs the YAKE keyword extractor over complete_body.

    The extractor is configured for English, a maximum n-gram size of 5, a
    deduplication threshold of 0.9, a window size of 3, and at most 6
    keywords.

    Input:
        complete_body: text to pull keyword phrases from

    Returns: list of keyword phrases paired with their scores. Lower score is
    better.
    """
    extractor_settings = {
        "lan": "en",
        "n": 5,
        "dedupLim": 0.9,
        "windowsSize": 3,
        "top": 6,
    }
    return yake.KeywordExtractor(**extractor_settings).extract_keywords(complete_body)

# Trying out RAKE
def rake_extract(complete_body: str) -> list[tuple[float, str]]:
    """
    Given a complete_body string, returns a list of keyword phrases and their
    scores from the RAKE extractor, ranked by word frequency.

    Input:
        complete_body: text to pull keyword phrases from

    Returns: list of (score, phrase) tuples. Higher score is better.
    """
    # No explicit stopword set is passed: the one built here previously was
    # never handed to Rake, so the extractor already ran with its own default
    # stopword handling.
    rake_extractor = Rake(ranking_metric=Metric.WORD_FREQUENCY)
    rake_extractor.extract_keywords_from_text(complete_body)
    return rake_extractor.get_ranked_phrases_with_scores()

# Combining Yake and Rake
def get_comment_yake(y_phrases, r_phrases) -> str:
    """
    Picks an email comment by cross-checking Rake phrases against Yake phrases.

    Rake phrases are scanned in the order given; the first one that appears as
    a substring of any Yake phrase is chosen. (The selection algorithm is
    still TBD.)

    Inputs:
        y_phrases: list of keyword phrases from Yake using complete_body
        r_phrases: list of keyword phrases from Rake using complete_body

    Returns: str of the email comment, or None when no Rake phrase is
    contained in any Yake phrase
    """
    matches = (
        rake_phrase
        for rake_phrase in r_phrases
        if any(rake_phrase in yake_phrase for yake_phrase in y_phrases)
    )
    return next(matches, None)

In [3]:
# Mass test: create a comment for every email row.
# For each row: build the text body, run both extractors on it, keep only the
# phrases (dropping the scores), and let get_comment_yake pick the comment.
comments = []
for i, row in df.iterrows():
    a = get_complete_body(row)
    b = yake_extract(a)
    c = rake_extract(a)
    
    # yake_extract is annotated as (phrase, score) pairs and rake_extract as
    # (score, phrase) pairs, hence the different positions kept below. The i
    # inside each comprehension is local to it and does not touch the loop's i.
    y = [i for i,j in b]
    r = [j for i,j in c]

    # The entry is None when no Rake phrase is contained in a Yake phrase.
    comments.append(get_comment_yake(y,r))

# Results are written to their own CSV; they are not merged back into df yet.
mycomments = pd.Series(comments)
mycomments.to_csv('RakeOutputs.csv') # Currently doesn't merge into df
# Disabled alternative: attach the comments to df and save the whole frame.
# df['Test Comments'] = comments
# df.to_csv('TestOutput.csv')

In [4]:
# Notes and tests on individual rows: run the pipeline on one row and print
# every intermediate result for manual inspection.
# mini_test_rows is not read anywhere in this cell.
mini_test_rows = [2] # Ones that worked previously
test_row = df.iloc[2] # Row 6 shows the email can't be assumed to end after "thank you"

a = get_complete_body(test_row)
b = yake_extract(a)
c = rake_extract(a)

# Keep only the phrases: first item of each Yake pair, second item of each
# Rake pair.
y = [i for i,j in b]
r = [j for i,j in c]

# Chosen comment first, then the body, both phrase lists, and the raw
# description the body was built from.
print(get_comment_yake(y,r))
print('AAAAAA', a)
print('YYYYYYY', y)
print('RRRRRRR', r)
print(test_row['description'])

# Disabled: further analysis of POS tags on these Yake/Rake keywords.
# sentences = nltk.sent_tokenize(a)
# words = [word for sent in sentences for word in nltk.word_tokenize(sent)]
# nltk.pos_tag(words)

editing templates
AAAAAA politemail help,  hi,  is there someone available to walk me through building and editing templates? i am having difficulty editing/changing the formatting for existing templates from my predecessor, specifically with certain images not displaying and other background issues.  ashley wynstra communications manager  campus and student life the university of chicago behar family house 5711 s. woodlawn ave. chicago, il 60637 awynstra@uchicago.edu<mailto:awynstra@uchicago.edu
YYYYYYY ['walk me through building', 'building and editing templates', 'uchicago.edu', 'politemail', 'changing the formatting for existing', 'formatting for existing templates']
RRRRRRR ['chicago behar family house 5711', 'ashley wynstra communications manager campus', 'il 60637 awynstra', 'editing templates', 'existing templates', 'difficulty editing', 'woodlawn ave', 'uchicago', 'uchicago', 'student life', 'someone available', 'politemail help', 'edu', 'edu', 'chicago', 'certain images', 'ba

In [5]:
# Separate analysis of POS tags in the 'Comment' column.

# Number of tokens in each comment, tokenized once and reused below.
comment_lengths = df['Comment'].apply(lambda x: len(word_tokenize(x)))
# How many comments there are of each token length.
comment_length_counts = comment_lengths.value_counts()
# Comments that are five or more tokens long.
long_comments = df['Comment'][comment_lengths >= 5]

# tag_count: POS tag -> number of tokens carrying that tag.
# tag_word: POS tag -> every token carrying that tag, duplicates kept.
tag_count: dict[str, int] = {}
tag_word: dict[str, list[str]] = {}
comment_tags = df['Comment'].apply(nltk.word_tokenize).apply(nltk.pos_tag)
for tag_list in comment_tags:
    for word, tag in tag_list:
        tag_count[tag] = tag_count.get(tag, 0) + 1
        tag_word.setdefault(tag, []).append(word)

# Only displayed by the notebook when it is the last expression in a cell.
tag_count
# print(len(set(tag_word['NNP'])))

# Occurrences of each token tagged 'NNP', tallied in a single pass; keys keep
# first-occurrence order. Raises KeyError if no token was tagged 'NNP'.
word_count: dict[str, int] = {}
for word in tag_word['NNP']:
    word_count[word] = word_count.get(word, 0) + 1
# word_count = np.array(list(word_count.values()))