# EECS 498 - Assignment 4 - Dialog Act Classification
### By: Alexander "AJ" Goldstein - uniquename: ajva

In [104]:
import csv
import sys
import pandas as pd
import scipy
import numpy as np
import math
from __future__ import division
import random
from random import shuffle
import matplotlib.pyplot as plt
import operator

## Pre-processing Functions:

### 1) establish possible senses for current word disambiguation

In [120]:
def establish_dialogs(dataFile):
    
    dialogs_dict = {}
    dialogs_features_dict = {}
    total_instances = 0
    total_dialogs = 0
    
    with open(dataFile) as data:
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        for line in data:
            
            # if it's the start of a new instance...
            if line.find("Advisor") != -1:
                total_instances += 1
                
                dialog_act = line.split(" ")[1]
                if dialog_act.find('[') == 0:
                    
                    # store or index dialog act (depending on if it's new)
                    if dialog_act not in dialogs_dict:
                        total_dialogs += 1
                        dialogs_dict[dialog_act] = 1
                        dialogs_features_dict[dialog_act] = {}
                    else:
                        dialogs_dict[dialog_act] += 1
        
    return dialogs_dict, dialogs_features_dict, total_instances, total_dialogs

### 2) parse the training folds for data counts

In [121]:
def parse_datafile(dataFile, dialogs_dict, dialogs_features_dict):
    
    stripList = ['.', '(', ')', ',', '-', '!', '?']
    
    with open(dataFile) as data:
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        prev_line = ''
        for line in data:
            
            # if it's the start of a new instance...
            if line.find("Advisor") != -1:
                
                # capture the advisor's dialog act
                dialog_act = line.split(" ")[1]
                if dialog_act.find('[') == 0:
                        
                    # get the features from the previous line
                    features = prev_line.strip('\n')
                    if features.find("Student:") == 0:
                        
                        # loop through each word in the features
                        features_split = features.split(' ')
                        for idx in range(1, len(features_split)):
                            
                            # strip excess puntuation
                            for stripItem in stripList:
                                features_split[idx] = features_split[idx].strip(stripItem)
                            features_split[idx] = features_split[idx].lower().strip()
                        
                            # store all new words as "present"
                            if features_split[idx] not in dialogs_features_dict[dialog_act]:
                                dialogs_features_dict[dialog_act][features_split[idx]] = 1.0
            
            # set previous line for next go-around
            prev_line = line
                                    
    return dialogs_dict, dialogs_features_dict

### 3) make predictions on test fold instances

In [149]:
def predict_instances(testFile, outputFile, dialogs_dict, dialogs_features_dict):
    
    stripList = ['.', '(', ')', ',', '-', '!', '?']
    
    with open(testFile) as data:
        test_count = 0
        correct_count = 0
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        prev_line = ''
        for line in data:
            
            # if it's the start of a new instance...
            if line.find("Advisor") != -1:
                test_count += 1
                
                # capture the advisor's dialog act
                dialog_act = line.split(" ")[1]
                true_dialog = dialog_act
                if dialog_act.find('[') == 0:
                        
                    # get the features from the previous line
                    features = prev_line.strip('\n')
                    features_split = []
                    if features.find("Student:") == 0:
                        
                        # loop through each word in the features
                        features_split = features.split(' ')
                        for idx in range(1, len(features_split)):
                            
                            # strip excess puntuation
                            for stripItem in stripList:
                                features_split[idx] = features_split[idx].strip(stripItem)
                            features_split[idx] = features_split[idx].lower().strip()
                        
                            # store all new words as "absent"
                            if features_split[idx] not in dialogs_features_dict[dialog_act]:
                                for dialog in dialogs_dict:
                                    dialogs_features_dict[dialog_act][features_split[idx]] = 0.01
                                
                    
                    # calculate the argmax (probabilities for each dialog)
                    pred_probs = {}
                    for dialog in dialogs_dict:
                        pred_probs[dialog] = 1.0
                        
                        # factor in conditional probability for each word
                        for word in features_split:
                            
                            # use all words except 'Student:' as features
                            if (word != 'Student:'):
                                
                                # store as absent if not already
                                if word not in dialogs_features_dict[dialog]:
                                    dialogs_features_dict[dialog][word] = 0.01
                                    
                                # calculate & factor-in feature probabilities
                                feat_prob = dialogs_features_dict[dialog][word]/dialogs_dict[dialog]
                                pred_probs[dialog] = pred_probs[dialog] * feat_prob
                        
                        # calculate & factor-in dialog probability
                        dialog_prob = dialogs_dict[dialog]/sum(dialogs_dict.values())
                        pred_probs[dialog] = pred_probs[dialog]*dialog_prob
                        #print(dialog, pred_probs[dialog])
                    
                    # identify the dialog with the highest probability
                    pred_dialog = max(pred_probs.items(), key=operator.itemgetter(1))[0]
                    
                    # check if prediction is correct
                    if pred_dialog == true_dialog:
                        correct_count += 1

                    # output prediction
                    outputFile.write(prev_line)
                    outputFile.write(pred_dialog + ' ' + line)
                
            # set previous line for next go-around
            prev_line = line
            
    accuracy = float(correct_count/test_count)
    print('Accuracy:' + str(accuracy))
    
    return accuracy

## MAIN FUNCTION :

### 1) read in filename

In [150]:
# SCRIPT NOTE: switch out arguments

# read in file
dataFile = "DialogAct.train"
testFile = "DialogAct.test"

#dataFile = sys.argv[1]
#testFile = sys.argv[2]

### 2) create output file

In [151]:
# create output file: <word>.wsd.out
outputName = "DialogAct.test.out"
outputFile = open(outputName, "w")

### 3) establish dialogs

In [152]:
dialogs, features, total_instances, total_dialogs = establish_dialogs(dataFile)

In [153]:
print('total instances:', total_instances)
print('total dialogs:', total_dialogs)

('total instances:', 2796)
('total dialogs:', 14)


### 4) parse training file for counts

In [154]:
dialogs_dict, dialogs_features_dict = parse_datafile(dataFile, dialogs, features)

### 5) predict dialog acts for test data

In [155]:
accuracy = predict_instances(testFile, outputFile, dialogs_dict, dialogs_features_dict)

Accuracy:0.0990502035278
