## Politeness Classifiers

### Factors outlined as contributing to politeness ratings for the data examples:

Direct Questions

Factuality

Please

Hedging

Counterfactual

Deference

#### TODO: Implement features into classifier

In [1]:
import csv

# Column names for the labeled CSV files. The visible code does not reference
# this list; it indexes rows by position instead (row[0] .. row[3]).
labels = ['ID', 'Message', 'NS', 'NNS']

# One CSV per labeling scheme, all located under LabeledData/.
filenames = ["BinaryLabeling.csv", "StrongNeutralLabeling.csv",
             "WeakNeutralLabeling.csv", "IntermediateLabeling.csv"]

# The handles are intentionally left open: later cells rewind them with
# seek(0) and iterate the matching reader again.
# newline="" is what the csv module requires of files handed to csv.reader,
# so that newlines embedded in quoted fields are parsed correctly.
fileobjs = [open("LabeledData/" + name, "r", newline="") for name in filenames]
readers = [csv.reader(handle) for handle in fileobjs]

## Baseline Classifier: Unigrams

This is the baseline classifier for our labeling schemes. It uses a simple bag-of-words approach, assigning labels based purely on the words present in a sample.

In [2]:
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from collections import Counter

# Collect the vocabulary (every distinct token) from the training portion of
# the first labeling file.
next(readers[0], None)  # discard the header row
num_train = 850  # training comes from the first 850 of 1000 samples
all_words = set()
while num_train > 0:
    row = next(readers[0], None)
    if row is None:  # file ended before the training quota was reached
        break
    all_words.update(word_tokenize(row[1]))
    num_train -= 1
# Rewind so the same file can be read from the top again further down.
fileobjs[0].seek(0)

def bag_of_words(sentence):
    """Return a token-count feature dict for *sentence*.

    Every word in the module-level ``all_words`` vocabulary appears as a key
    and defaults to 0. Tokens found in *sentence* carry their occurrence
    count; tokens that are not in the vocabulary are added as extra keys.
    """
    vector = {token: 0 for token in all_words}
    counts = Counter(word_tokenize(sentence))
    vector.update(counts)
    return vector

# Train and evaluate one Naive Bayes classifier per labeling scheme and per
# annotator group (column 2 = NS label, column 3 = NNS label). Entries in the
# four result lists are in the same order as `filenames`.
NB_classifiers_NS = []
NB_classifiers_NNS = []
NB_tests_NS = []
NB_tests_NNS = []

# The first `train_size` rows of each file are used for training, the
# remainder for testing.
train_size = 850

for reader in readers:
    next(reader, None)  # discard the header row
    all_data = list(reader)

    # Vectorise each message once and share the feature dict between the NS
    # and NNS data sets; building it separately for each set doubled the
    # tokenisation work and the memory held by the vocabulary-sized dicts.
    samples = [(bag_of_words(row[1]), row[2], row[3]) for row in all_data]
    train_part = samples[:train_size]
    test_part = samples[train_size:]

    train_NS = [(feats, ns) for feats, ns, _ in train_part]
    train_NNS = [(feats, nns) for feats, _, nns in train_part]
    NB_tests_NS.append([(feats, ns) for feats, ns, _ in test_part])
    NB_tests_NNS.append([(feats, nns) for feats, _, nns in test_part])

    NB_classifiers_NS.append(NaiveBayesClassifier.train(train_NS))
    NB_classifiers_NNS.append(NaiveBayesClassifier.train(train_NNS))

# Report held-out accuracy for every labeling scheme.
for name, clf_ns, test_ns, clf_nns, test_nns in zip(
        filenames, NB_classifiers_NS, NB_tests_NS,
        NB_classifiers_NNS, NB_tests_NNS):
    print(name)
    print("native speaker:")
    print(accuracy(clf_ns, test_ns))
    print("non-native speaker:")
    print(accuracy(clf_nns, test_nns))

BinaryLabeling.csv
native speaker:
0.6666666666666666
non-native speaker:
0.7133333333333334
StrongNeutralLabeling.csv
native speaker:
0.48
non-native speaker:
0.48
WeakNeutralLabeling.csv
native speaker:
0.7266666666666667
non-native speaker:
0.64
IntermediateLabeling.csv
native speaker:
0.44
non-native speaker:
0.44


From these results we see that our less expressive labeling schemes lead to higher accuracies, even with a classifier that does not take any other features into account.

## Baseline Classifier: Base Prediction Model

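Per the slides, we want to build a logistic regression model using three main measures:
Perspective API scores (~ toxicity), readability measures, and the length of the sample. Note that the cell below currently trains NLTK's `DecisionTreeClassifier` on these features rather than a logistic regression model.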

In [12]:
import requests
import textstat
import json
import time
from nltk import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier

# Variables for the Perspective API call.
import os

# SECURITY: an API credential is embedded in this file. Prefer supplying the
# key through the PERSPECTIVE_API_KEY environment variable; the literal below
# is kept only as a fallback so existing runs keep working, and should be
# rotated and removed from source control.
api_key = os.environ.get('PERSPECTIVE_API_KEY',
                         'AIzaSyBaMPpybrBfyWF54hvkFK1QuEBPPKmQh8M')

# Endpoint used by features(); the key is passed as a query parameter.
url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze'
       '?key=' + api_key)

# Characters kept when reducing a message to the text sent to the API.
_PERSPECTIVE_SAFE_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789 ?.'
)


def _perspective_text(sentence):
    """Return *sentence* reduced to ASCII letters, digits, spaces, '?' and '.'."""
    return ''.join(ch for ch in sentence if ch in _PERSPECTIVE_SAFE_CHARS)


def _toxicity_score(text, max_retries, retry_delay):
    """Request the TOXICITY summary score for *text*, retrying on API errors.

    Returns the score from the reply, or 0 when the reply carries neither a
    score nor an error, or when the API still reports an error after
    *max_retries* retries.
    """
    # json.dumps quotes and escapes the payload properly instead of splicing
    # the text into a hand-written string.
    payload = json.dumps({
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}},
    })
    retries_left = max_retries
    while True:
        response = requests.post(url=url, data=payload)
        reply = json.loads(response.content)
        try:
            return reply['attributeScores']['TOXICITY']['summaryScore']['value']
        except (KeyError, TypeError):
            pass  # no score in this reply; decide below whether to retry
        if not isinstance(reply, dict) or 'error' not in reply:
            return 0
        if retries_left <= 0:
            # Give up instead of looping forever on a permanent error.
            return 0
        retries_left -= 1
        # The API has call limits; wait before asking again.
        time.sleep(retry_delay)


def features(sentence, max_retries=12, retry_delay=5):
    """Build the feature dict used by the base prediction model.

    Args:
        sentence: The raw message text.
        max_retries: How many times to re-send the Perspective API request
            after an error reply before falling back to a score of 0.
        retry_delay: Seconds to sleep between retries.

    Returns:
        A dict with three keys:
        ``'readability'`` - the value of ``textstat.text_standard(sentence)``;
        ``'length'`` - the number of tokens from ``word_tokenize(sentence)``;
        ``'toxicity'`` - the Perspective API TOXICITY summary score, or 0 when
        no score could be obtained.

    Previously, a reply obtained after an error-triggered retry was never
    stored, leaving ``'toxicity'`` out of the dict, and a persistent error
    reply caused an endless retry loop. Both cases now yield a value.
    """
    d = {}
    d['readability'] = textstat.text_standard(sentence)
    d['length'] = len(word_tokenize(sentence))

    # The same character filter as before is applied, so the text scored by
    # the API is unchanged from earlier runs.
    text = _perspective_text(sentence)
    d['toxicity'] = _toxicity_score(text, max_retries, retry_delay)
    return d

# Rewind every CSV so each reader starts from the top of its file again.
for handle in fileobjs:
    handle.seek(0)

# Compute the feature dict of every message once, keyed by the value in the
# first column, using the first labeling file as the source of messages.
next(readers[0], None)  # discard the header row
all_data = list(readers[0])
feature_data = {row[0]: features(row[1]) for row in all_data}
# Rewind the first file so the loop over all readers can read it again.
fileobjs[0].seek(0)

# Train and evaluate one decision tree per labeling scheme and per annotator
# group, using the precomputed entries of `feature_data`.
# NOTE(review): the notebook text describes this baseline as a logistic
# regression model, but the classifier trained here is NLTK's
# DecisionTreeClassifier - confirm which one is intended.
L_classifiers_NS, L_classifiers_NNS = [], []
L_tests_NS, L_tests_NNS = [], []

for reader in readers:
    next(reader, None)  # discard the header row
    list_data = list(reader)
    # First 850 rows are used for training, the remainder for testing.
    head, tail = list_data[:850], list_data[850:]

    train_NS = [(feature_data[r[0]], r[2]) for r in head]
    train_NNS = [(feature_data[r[0]], r[3]) for r in head]
    L_tests_NS.append([(feature_data[r[0]], r[2]) for r in tail])
    L_tests_NNS.append([(feature_data[r[0]], r[3]) for r in tail])

    L_classifiers_NS.append(DecisionTreeClassifier.train(train_NS))
    L_classifiers_NNS.append(DecisionTreeClassifier.train(train_NNS))

# Report held-out accuracy for every labeling scheme.
for idx, name in enumerate(filenames):
    print(name)
    print("native speaker:")
    print(accuracy(L_classifiers_NS[idx], L_tests_NS[idx]))
    print("non-native speaker:")
    print(accuracy(L_classifiers_NNS[idx], L_tests_NNS[idx]))

BinaryLabeling.csv
native speaker:
0.48
non-native speaker:
0.44
StrongNeutralLabeling.csv
native speaker:
0.32666666666666666
non-native speaker:
0.3333333333333333
WeakNeutralLabeling.csv
native speaker:
0.16666666666666666
non-native speaker:
0.6733333333333333
IntermediateLabeling.csv
native speaker:
0.28
non-native speaker:
0.3333333333333333
