In [2]:
import csv
import string

# helper functions
def contains(inputList, element):
  '''uses binary search to find if a given list contains a certain element'''
  low = 0
  high = len(inputList)
  while low < high:
    mid = (low+high)//2
    #print(low, high, mid)
    if inputList[mid] == element:
      return True
    elif inputList[mid] < element:
      low = mid + 1
    else:
      high = mid - 1
  return False
  
  
def get_words(fname):
    '''returns list of processed [no whitespace/capitalization] lines of text from given file'''
    f = open(fname)
    lines_original = f.readlines()
    lines_processed = []
    for line in lines_original:
        lines_processed.append(line.strip().lower())
    f.close()
    lines_processed.sort()
    return lines_processed

def process(message, remove_proper=True):
    '''breaks message down into list of words without punctuation, whitespace, etc.'''
    global informal_words
    words_original = message.strip().split(" ")
    # remove punctuation and numbers (note to self: consider keeping in emojis)
    words_processed_1 = []
    for word in words_original:
        new_word = word
        for punc in string.punctuation:
            new_word = new_word.replace(punc, "")
        for num in string.digits:
            new_word = new_word.replace(num, "")
        if len(new_word) > 0:
            words_processed_1.append(new_word)
    # remove proper nouns
    if remove_proper:
        words_without_proper = []
        for word in words_processed_1:
            if not contains(english_words, word.lower()) and not contains(informal_words, word.lower()) and len(word)>1:
                first_letter = word[0]
                rest = word[1:]
                if first_letter.isupper() and rest.islower():
                    continue
            words_without_proper.append(word)
        words_processed_1 = words_without_proper
    # make all words lowercase
    words_processed_final = []
    for word in words_processed_1:
        if word != 'I':
            words_processed_final.append(word.lower())
        else: # exception since 'I' spelled as 'i' is a dead giveaway of many personal messages so keeping track of I capitalization can be useful
            words_processed_final.append(word)
    #print(words_processed_final)
    return words_processed_final
    
# main function for classifying messages
def calculate_index(message):
    '''calculates 'formality index' of a given message'''
    global english_words
    global business_words 
    message_words = process(message)
    # count misspelled words, emojis, abbreviations, etc.
    informal_words_count = 0  
    for word in message_words:
        if word!='I' and (not contains(english_words, word) and not contains(informal_words, word)):
            #print(word)
            informal_words_count += 1
    # count business words
    business_words_count = 0
    for word in message_words:
        if word in business_words:
            business_words_count += 1
    # return index
    num_words = len(message_words)
    if num_words == 0:
        return 0.5
    #print(informal_words_count)
    #print(business_words_count)
    return ((num_words - informal_words_count)/num_words) * 0.75 + (business_words_count/num_words) * 0.25

# uses index to return True (for business) or False (for personal)
def is_business(message, cutoff=0.75):
    '''classifies given message as business or personal'''
    index = calculate_index(message)
    if index>=cutoff:
        return True
    else:
        return False

# main code to check messages
english_words = get_words("/Users/anisha/Downloads/sms-messages/dictionary_2.txt")
business_words = get_words("/Users/anisha/Downloads/sms-messages/business.txt")
informal_words = get_words("/Users/anisha/Downloads/sms-messages/informal.txt")
fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
lines_read = 0
cutoff_num = 1000
# go through set number of messages
with open(fpath, newline='', encoding="latin1") as messages_file:
    lines = csv.DictReader(messages_file)
    for line in lines:
        lines_read += 1
        answer = line['v1']
        message = line['v2']
        sender = is_business(message)
        # update counters for true/false positive/negatives
        if sender==True:
            if answer=="business":
                true_positive += 1
            else:
                false_positive += 1
        else:
            if answer=="personal":
                true_negative += 1
            else:
                false_negative += 1
        if lines_read==cutoff_num:
            break
# output results, calculate relevant measures
accuracy = (true_positive + true_negative)/(true_positive + false_positive + true_negative + false_negative)
precision = true_positive/(true_positive + false_positive)
recall = true_positive/(true_positive + false_negative)
print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))
        








In [19]:
import csv
import string

def tokenize(message):
    '''breaks message down into list of words without punctuation, whitespace, etc.'''
    message = message.strip()
    for num in string.digits:
        message = message.replace(num, "")
    for punc in string.punctuation:
        if punc!="'": # preserve contractions
            message = message.replace(punc, "")
    processed = message.split(" ")
    tokens = []
    for token in processed:
        if token!='':
            tokens.append(token.lower())
    return tokens


# main function for classifying messages
def calculate_index(message, matrix, availableTokens):
    '''calculates 'formality index' of a given message'''
    tokens = tokenize(message)
    #print(tokens)
    sum = 0
    tokenCount = 0
    for token in tokens:
        if token in availableTokens:
            sum += matrix[token][0]/(matrix[token][0] + matrix[token][1])
            tokenCount += 1 
    if tokenCount == 0:
        return 0
    return sum/tokenCount # average frequency of words

# uses index to return True (for business) or False (for personal)
def is_business(index, cutoff):
    '''classifies given message as business or personal'''
    if index>=cutoff:
        return True
    else:
        return False

# main code to check messages
english_words = get_words("/Users/anisha/Downloads/sms-messages/dictionary_2.txt")
business_words = get_words("/Users/anisha/Downloads/sms-messages/business.txt")
informal_words = get_words("/Users/anisha/Downloads/sms-messages/informal.txt")
fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
lines_read = 0
matrix = {}
wordsSeen = set()
cutoff_num = 5000
# go through set number of messages
with open(fpath, newline='', encoding="latin1") as messages_file:
    lines = csv.DictReader(messages_file)
    for line in lines:
        lines_read += 1
        answer = line['v1']
        message = line['v2']
        if lines_read < 4000:
            # frequency train on first 4000
            tokens = tokenize(message)
            if answer=="business":
                use = 0
            else:
                use = 1
            for token in tokens:
                if token not in wordsSeen:
                    wordsSeen.add(token)
                    matrix[token] = [0, 0]
                matrix[token][use] += 1
        else:
            # test on last 1000
            score = calculate_index(message, matrix, wordsSeen)
            sender = is_business(score, 0.5)
            #print(message, score, answer)
            # update counters for true/false positive/negatives
            if sender==True:
                if answer=="business":
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                if answer=="personal":
                    true_negative += 1
                else:
                    false_negative += 1
        if lines_read==cutoff_num:
            break
    # output results, calculate relevant measures
    accuracy = (true_positive + true_negative)/(true_positive + false_positive + true_negative + false_negative)
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
    print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))
        








63 true positives, 7 false positives, 892 true negatives, 39 false negatives
accuracy 0.954045954045954, precision 0.9, recall 0.6176470588235294


In [21]:
import csv
import string

def tokenize(message):
    '''breaks message down into list of words without punctuation, whitespace, etc.'''
    message = message.strip()
    for num in string.digits:
        message = message.replace(num, "")
    for punc in string.punctuation:
        if punc!="'": # preserve contractions
            message = message.replace(punc, "")
    processed = message.split(" ")
    tokens = []
    for token in processed:
        if token!='':
            tokens.append(token.lower())
    return tokens


# main function for classifying messages
def calculate_index(message, matrix, availableTokens):
    '''calculates 'formality index' of a given message'''
    tokens = tokenize(message)
    #print(tokens)
    sum = 0
    tokenCount = 0
    for token in tokens:
        if token in availableTokens:
            sum += matrix[token][0]/(matrix[token][0] + matrix[token][1])
            tokenCount += 1 
    if tokenCount == 0:
        return 0
    return sum/tokenCount # average frequency of words

# uses index to return True (for business) or False (for personal)
def is_business(index, cutoff):
    '''classifies given message as business or personal'''
    if index>=cutoff:
        return True
    else:
        return False

def process_messages(cutoff):
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0
    lines_read = 0
    matrix = {}
    wordsSeen = set()
    fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
    with open(fpath, newline='', encoding="latin1") as messages_file:
        lines = csv.DictReader(messages_file)
        for line in lines:
            lines_read += 1
            answer = line['v1']
            message = line['v2']
            if lines_read < 4000:
                tokens = tokenize(message)
                if answer=="business":
                    use = 0
                else:
                    use = 1
                for token in tokens:
                    if token not in wordsSeen:
                        wordsSeen.add(token)
                        matrix[token] = [0, 0]
                    matrix[token][use] += 1
            else:
                sender = is_business(calculate_index(message, matrix, wordsSeen), cutoff)
                # update counters for true/false positive/negatives
                if sender==True:
                    if answer=="business":
                        true_positive += 1
                    else:
                        false_positive += 1
                else:
                    if answer=="personal":
                        true_negative += 1
                    else:
                        false_negative += 1
    # output results, calculate relevant measures
    accuracy = (true_positive + true_negative)/(true_positive + false_positive + true_negative + false_negative)
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
    print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))


# try different cutoff scores
for i in range(30, 70, 5):
    print("cutoff: " + str(i/100))
    process_messages(i/100)
    
    
        








cutoff: 0.3
133 true positives, 379 false positives, 814 true negatives, 0 false negatives
accuracy 0.7141779788838613, precision 0.259765625, recall 1.0
cutoff: 0.35
133 true positives, 182 false positives, 1011 true negatives, 0 false negatives
accuracy 0.8627450980392157, precision 0.4222222222222222, recall 1.0
cutoff: 0.4
131 true positives, 84 false positives, 1109 true negatives, 2 false negatives
accuracy 0.9351432880844646, precision 0.6093023255813953, recall 0.9849624060150376
cutoff: 0.45
124 true positives, 27 false positives, 1166 true negatives, 9 false negatives
accuracy 0.9728506787330317, precision 0.8211920529801324, recall 0.9323308270676691
cutoff: 0.5
85 true positives, 10 false positives, 1183 true negatives, 48 false negatives
accuracy 0.9562594268476622, precision 0.8947368421052632, recall 0.6390977443609023
cutoff: 0.55
33 true positives, 4 false positives, 1189 true negatives, 100 false negatives
accuracy 0.9215686274509803, precision 0.8918918918918919, rec

ZeroDivisionError: division by zero

In [6]:
# Naive Bayes algorithm

import csv
import string

def tokenize(message):
    '''breaks message down into list of words without punctuation, whitespace, etc.'''
    message = message.strip()
    for num in string.digits:
        message = message.replace(num, "")
    for punc in string.punctuation:
        if punc!="'": # preserve contractions
            message = message.replace(punc, "")
    processed = message.split(" ")
    tokens = []
    for token in processed:
        if token!='':
            tokens.append(token.lower())
    return tokens


# main function for classifying messages
def calculate_probability(message, isBusiness):
    '''calculates probability all words of a given message are business/personal'''
    global matrix
    global allTokens 
    global businessTokens 
    global personalTokens
    tokens = tokenize(message)
    #print(tokens)
    prob = 1
    tokenCount = 0
    if isBusiness:
        categoryID = 0
        category = businessTokens
    else:
        categoryID = 1
        category = personalTokens
    for token in tokens:
        if token in category:
            category_appearances = matrix[token][categoryID]
        else:
            category_appearances = 0
        prob *= (category_appearances + 1)/(len(category) + len(allTokens))
    return prob # average frequency of words

# uses index to return True (for business) or False (for personal)
def is_business(message):
    '''classifies given message as business or personal'''
    bus_prob = calculate_probability(message, True)
    pers_prob = calculate_probability(message, False)
    return bus_prob > pers_prob

# main code to check messages
fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
lines_read = 0
matrix = {}
allTokens = set()
businessTokens = set()
personalTokens = set()
cutoff_num = 5000
# go through set number of messages
with open(fpath, newline='', encoding="latin1") as messages_file:
    lines = csv.DictReader(messages_file)
    for line in lines:
        lines_read += 1
        answer = line['v1']
        message = line['v2']
        if lines_read < 4000:
            # frequency train on first 4000
            tokens = tokenize(message)
            if answer=="business":
                use = 0
            else:
                use = 1
            for token in tokens:
                if token not in allTokens:
                    allTokens.add(token)
                    matrix[token] = [0, 0]
                matrix[token][use] += 1
                if use==0 and token not in businessTokens:
                    businessTokens.add(token)
                elif use==1 and token not in personalTokens:
                    personalTokens.add(token)
        else:
            # test on last 1000
            sender = is_business(message)
            #print(message, sender, answer)
            # update counters for true/false positive/negatives
            if sender==True:
                if answer=="business":
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                if answer=="personal":
                    true_negative += 1
                else:
                    false_negative += 1
        if lines_read==cutoff_num:
            break
    # output results, calculate relevant measures
    accuracy = (true_positive + true_negative)/(true_positive + false_positive + true_negative + false_negative)
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
    print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))


    
    
        








58 true positives, 12 false positives, 887 true negatives, 44 false negatives
accuracy 0.9440559440559441, precision 0.8285714285714286, recall 0.5686274509803921


In [24]:
# Naive Bayes algorithm with stopwords removed

import csv
import string

# helper functions
def contains(inputList, element):
  '''uses binary search to find if a given list contains a certain element'''
  low = 0
  high = len(inputList)
  while low < high:
    mid = (low+high)//2
    #print(low, high, mid)
    #print(inputList[mid])
    if inputList[mid] == element:
      return True
    elif inputList[mid] < element:
      low = mid + 1
    else:
      high = mid
  return False
  
def get_words(fname):
    '''returns list of processed [no whitespace/capitalization] lines of text from given file'''
    f = open(fname)
    lines_original = f.readlines()
    lines_processed = []
    for line in lines_original:
        lines_processed.append(line.strip().lower())
    f.close()
    lines_processed.sort()
    return lines_processed

def tokenize(message):
    '''breaks message down into list of words without punctuation, whitespace, etc.'''
    message = message.strip()
    #print(message)
    for num in string.digits:
        message = message.replace(num, "")
    for punc in string.punctuation:
        if punc!="'": # preserve contractions
            message = message.replace(punc, "")
    processed = message.split()
    #print(processed)
    stopwords = get_words("/Users/anisha/Downloads/sms-messages/stopwords.txt")
    #print(stopwords)
    tokens = []
    for token in processed:
        if token!='' and not contains(stopwords, token.lower()):
            tokens.append(token.lower())
    #print(tokens)
    return tokens


# main function for classifying messages
def calculate_probability(message, isBusiness):
    '''calculates probability all words of a given message are business/personal'''
    global matrix
    global allTokens 
    global businessTokens 
    global personalTokens
    tokens = tokenize(message)
    #print(tokens)
    prob = 1
    tokenCount = 0
    if isBusiness:
        categoryID = 0
        category = businessTokens
    else:
        categoryID = 1
        category = personalTokens
    for token in tokens:
        if token in category:
            category_appearances = matrix[token][categoryID]
        else:
            category_appearances = 0
        prob *= (category_appearances + 1)/(len(category) + len(allTokens))
    return prob # average frequency of words

# uses index to return True (for business) or False (for personal)
def is_business(message):
    '''classifies given message as business or personal'''
    bus_prob = calculate_probability(message, True)
    pers_prob = calculate_probability(message, False)
    return bus_prob > pers_prob

# main code to check messages

fpath = "/Users/anisha/Downloads/sms-messages/personal-biz.csv"
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
lines_read = 0
matrix = {}
allTokens = set()
businessTokens = set()
personalTokens = set()
cutoff_num = 5000
# go through set number of messages
with open(fpath, newline='', encoding="latin1") as messages_file:
    lines = csv.DictReader(messages_file)
    for line in lines:
        lines_read += 1
        answer = line['v1']
        message = line['v2']
        if lines_read < 4000:
            # frequency train on first 4000
            tokens = tokenize(message)
            if answer=="business":
                use = 0
            else:
                use = 1
            for token in tokens:
                if token not in allTokens:
                    allTokens.add(token)
                    matrix[token] = [0, 0]
                matrix[token][use] += 1
                if use==0 and token not in businessTokens:
                    businessTokens.add(token)
                elif use==1 and token not in personalTokens:
                    personalTokens.add(token)
        else:
            # test on last 1000
            sender = is_business(message)
            #print(message, sender, answer)
            # update counters for true/false positive/negatives
            if sender==True:
                if answer=="business":
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                if answer=="personal":
                    true_negative += 1
                else:
                    false_negative += 1
        if lines_read==cutoff_num:
            break
    # output results, calculate relevant measures
    accuracy = (true_positive + true_negative)/(true_positive + false_positive + true_negative + false_negative)
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    print("{} true positives, {} false positives, {} true negatives, {} false negatives".format(true_positive, false_positive, true_negative, false_negative))
    print("accuracy {}, precision {}, recall {}".format(accuracy, precision, recall))


    
    
        








82 true positives, 27 false positives, 872 true negatives, 20 false negatives
accuracy 0.9530469530469531, precision 0.7522935779816514, recall 0.803921568627451
