In [46]:
import csv
import string

# helper functions
def get_words(fname):
    '''Return the normalized, sorted lines of the text file at `fname`.

    Each line has its leading/trailing whitespace stripped and is converted
    to lowercase. Blank lines are kept (as empty strings). The resulting list
    is sorted in ascending order before being returned.

    fname -- path of the text file to read (opened with the platform's
             default text encoding).
    Returns a new list of str, one entry per line of the file.
    Raises OSError if the file cannot be opened.
    '''
    # The context manager guarantees the handle is closed even if reading
    # fails part-way through, which the manual open()/close() pair did not.
    with open(fname) as f:
        lines_processed = [line.strip().lower() for line in f]
    lines_processed.sort()
    return lines_processed

def process(message, remove_proper=True):
    '''Break `message` down into a list of cleaned-up words.

    Steps, in order:
      1. Split the message on any run of whitespace (spaces, tabs, newlines).
      2. Delete every ASCII punctuation character and digit from each word;
         words that become empty are dropped.
      3. If `remove_proper` is true, drop words that look like proper nouns:
         longer than one character, first letter uppercase, remainder
         lowercase, and (lowercased) found in neither the module-level
         `english_words` nor `informal_words` collections.
      4. Lowercase every remaining word except the exact token 'I'.

    message       -- the raw message text (str).
    remove_proper -- when False, step 3 is skipped and the module-level word
                     collections are not consulted.
    Returns a list of str.
    '''
    # One translation table removes all punctuation and digits in a single
    # pass instead of one str.replace() call per character.
    # (note to self: consider keeping in emojis)
    strip_table = str.maketrans("", "", string.punctuation + string.digits)
    # split() with no argument splits on every kind of whitespace, so tabs and
    # newlines inside a message no longer glue neighbouring words together.
    cleaned = [word.translate(strip_table) for word in message.split()]
    cleaned = [word for word in cleaned if word]
    # remove proper nouns
    if remove_proper:
        kept = []
        for word in cleaned:
            lowered = word.lower()
            unknown = lowered not in english_words and lowered not in informal_words
            # Capitalized + unknown to both word lists => treat as a name.
            if unknown and len(word) > 1 and word[0].isupper() and word[1:].islower():
                continue
            kept.append(word)
        cleaned = kept
    # make all words lowercase; 'I' is the exception since 'I' spelled as 'i'
    # is a dead giveaway of many personal messages, so keeping track of its
    # capitalization can be useful
    return [word if word == 'I' else word.lower() for word in cleaned]
    
# main function for classifying messages
def calculate_index(message):
    '''Compute the 'formality index' of `message`.

    The message is tokenized with process(). The index is a weighted sum:
    75% the share of words that are NOT informal, plus 25% the share of words
    found in the module-level `business_words` collection. A word counts as
    informal when it is not 'I' and is either missing from `english_words`
    or present in `informal_words` (misspellings, emojis, abbreviations, etc.).

    Returns a float; 0.5 when the message yields no words at all.
    '''
    tokens = process(message)
    total = len(tokens)
    # Nothing to score: fall back to the neutral midpoint.
    if total == 0:
        return 0.5
    informal_total = sum(
        1
        for token in tokens
        if token != 'I' and (token not in english_words or token in informal_words)
    )
    business_total = sum(1 for token in tokens if token in business_words)
    formal_share = (total - informal_total) / total
    business_share = business_total / total
    return formal_share * 0.75 + business_share * 0.25

# uses index to return True (for business) or False (for personal)
def is_business(message, cutoff=0.75):
    '''Classify `message` as business (True) or personal (False).

    The message is considered business when its formality index, as computed
    by calculate_index(), is greater than or equal to `cutoff`.
    '''
    return calculate_index(message) >= cutoff

# main code to check messages
# Word lists consulted by process() and calculate_index().
# NOTE(review): these are lists, so every `in` test is a linear scan; wrapping
# them in set(...) would make lookups constant-time if nothing else in the
# notebook relies on them being ordered lists -- confirm before changing.
english_words = get_words("/Users/anisha/Downloads/sms-messages/dictionary_2.txt")
business_words = get_words("/Users/anisha/Downloads/sms-messages/business.txt")
informal_words = get_words("/Users/anisha/Downloads/sms-messages/informal.txt")
fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
# Confusion-matrix counters; "positive" means classified as business.
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
lines_read = 0
cutoff_num = 1000  # maximum number of messages to evaluate


def _safe_ratio(numerator, denominator):
    '''Return numerator/denominator, or 0.0 when the denominator is zero.'''
    return numerator / denominator if denominator else 0.0


# go through set number of messages
with open(fpath, newline='', encoding="latin1") as messages_file:
    lines = csv.DictReader(messages_file)
    for lines_read, line in enumerate(lines, start=1):
        answer = line['v1']   # labelled class of the message
        message = line['v2']  # message text
        sender = is_business(message)
        # update counters for true/false positive/negatives
        if sender:
            if answer == "business":
                true_positive += 1
            else:
                false_positive += 1
        else:
            if answer == "personal":
                true_negative += 1
            else:
                false_negative += 1
        if lines_read == cutoff_num:
            break
# output results, calculate relevant measures
# _safe_ratio avoids a ZeroDivisionError when the file is empty or when there
# are no predicted/actual positives; the measure is reported as 0.0 instead.
accuracy = _safe_ratio(true_positive + true_negative,
                       true_positive + false_positive + true_negative + false_negative)
precision = _safe_ratio(true_positive, true_positive + false_positive)
recall = _safe_ratio(true_positive, true_positive + false_negative)
print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))
        








24 true positives, 242 false positives, 650 true negatives, 84 false negatives
accuracy 0.674, precision 0.09022556390977443, recall 0.2222222222222222
