## Politeness Classifiers

### Factors outlined as contributing to politeness ratings for the data examples:

Direct Questions

Factuality

Please

Hedging

Counterfactual

Deference

#### TODO: Ablate these features using those extracted from Politeness_Feedback

In [32]:
import csv

# Column layout shared by every labeling CSV.
labels = ['ID', 'Message', 'NS', 'NNS']

# One CSV per labeling scheme, all stored under LabeledData/.
filenames = [
    "BinaryLabeling.csv",
    "StrongNeutralLabeling.csv",
    "WeakNeutralLabeling.csv",
    "IntermediateLabeling.csv",
    "PartitionsLabeling.csv",
]

# The handles are deliberately left open: later cells rewind them with
# seek(0) and re-read the data through the matching csv reader.
fileobjs = []
for fname in filenames:
    fileobjs.append(open("LabeledData/" + fname, "r"))
readers = []
for fobj in fileobjs:
    readers.append(csv.reader(fobj))

## Baseline Classifier: Unigrams

This will be a baseline classifier for our labeling schemes, using a simple Bag of Words approach to determine labels based purely off of words present in a sample.

In [33]:
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from collections import Counter

# Collect the vocabulary (set of distinct tokens) from the first rows of the
# binary-labeling file; it defines the bag-of-words feature space.
next(readers[0], None)  # skip the header row
# NOTE(review): the vocabulary is drawn from the first 900 samples, but the
# classifiers below train on the first 850 only -- confirm this is intended.
num_train = 900
all_words = set()
for row in readers[0]:
    if num_train > 0:
        all_words.update(word_tokenize(row[1]))
        num_train -= 1
    else:
        break

# Rewind the underlying file so the reader can be consumed again later.
fileobjs[0].seek(0)

def bag_of_words(sentence):
    """Return a token -> count mapping for *sentence*.

    Every word in the module-level ``all_words`` vocabulary is present with
    a default count of 0; tokens found in *sentence* carry their occurrence
    count (tokens outside the vocabulary are added as extra keys).
    """
    counts = {word: 0 for word in all_words}
    counts.update(Counter(word_tokenize(sentence)))
    return counts

def _labelled_bags(rows, label_col):
    """Pair the bag-of-words of each row's message with the label in *label_col*."""
    return [(bag_of_words(row[1]), row[label_col]) for row in rows]


# One Naive Bayes classifier (and held-out test set) per labeling scheme,
# separately for native-speaker (column 2) and non-native-speaker (column 3) labels.
NB_classifiers_NS, NB_classifiers_NNS = [], []
NB_tests_NS, NB_tests_NNS = [], []
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)
    # First 850 rows train the model; everything after is held out.
    head, tail = rows[:850], rows[850:]

    ns_train = _labelled_bags(head, 2)
    nns_train = _labelled_bags(head, 3)
    NB_tests_NS.append(_labelled_bags(tail, 2))
    NB_tests_NNS.append(_labelled_bags(tail, 3))

    NB_classifiers_NS.append(NaiveBayesClassifier.train(ns_train))
    NB_classifiers_NNS.append(NaiveBayesClassifier.train(nns_train))

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, NB_classifiers_NS, NB_tests_NS,
        NB_classifiers_NNS, NB_tests_NNS):
    print(fname)
    print("native speaker:")
    print(accuracy(ns_clf, ns_test))
    print("non-native speaker:")
    print(accuracy(nns_clf, nns_test))

BinaryLabeling.csv
native speaker:
0.66
non-native speaker:
0.7066666666666667
StrongNeutralLabeling.csv
native speaker:
0.4866666666666667
non-native speaker:
0.4866666666666667
WeakNeutralLabeling.csv
native speaker:
0.7
non-native speaker:
0.5933333333333334
IntermediateLabeling.csv
native speaker:
0.5
non-native speaker:
0.4866666666666667
PartitionsLabeling.csv
native speaker:
0.2866666666666667
non-native speaker:
0.3466666666666667


## Baseline Classifier: Base Prediction Model

Per the slides, we want to build a logistic regression model using three main measures:
perspective API scores (~ toxicity), readability measures, and length of sample

### Issue with the perspective API scores:

The API limits the number of queries per minute, which throttles our feature collection. To work around this, the request loop waits and retries whenever such an error occurs. However, this means building the feature set takes a very long time, since most of it is spent waiting.

In [3]:
import requests
import re
import textstat
import json
import time

import os

# Variables for the Perspective API call: the API key and the request URL.
# The key can be supplied through the PERSPECTIVE_API_KEY environment
# variable; the literal below is kept only as a backward-compatible fallback.
# NOTE(review): this key is committed in source -- rotate it and drop the
# fallback once every environment provides PERSPECTIVE_API_KEY.
api_key = os.environ.get('PERSPECTIVE_API_KEY',
                         'AIzaSyBaMPpybrBfyWF54hvkFK1QuEBPPKmQh8M')
url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze'
       '?key=' + api_key)

# Since readability returns a string of the form "xth and (x+1)th grade",
# we should only grab the first number.
def find_first_num(s):
    """Return the first integer that appears in *s*.

    A minus sign directly in front of the digits is kept, so a grade string
    such as "-1th and 0th grade" yields -1 rather than 1.

    Raises:
        ValueError: if *s* contains no digits at all.
    """
    match = re.search(r'-?[0-9]+', s)
    if match is None:
        raise ValueError('no number found in %r' % (s,))
    return int(match.group())

def features(sentence, max_retries=None, retry_delay=5):
    """Build the base feature dict for one message.

    Keys are inserted in a fixed order -- 'readability', 'length',
    'toxicity' -- because downstream code reads the values positionally.

    Args:
        sentence: raw message text.
        max_retries: maximum number of retries after a retryable API error;
            None (the default) keeps retrying until the request succeeds.
        retry_delay: seconds to sleep between retries.

    Returns:
        dict with the readability grade (first number of textstat's
        text_standard string), the token count, and the Perspective
        TOXICITY summary score (0.0 when no score could be read from the
        response).
    """
    d = {
        'readability': find_first_num(textstat.text_standard(sentence)),
        'length': len(word_tokenize(sentence)),
    }

    # Preprocess the text for the Perspective API: keep only ASCII letters,
    # digits, spaces, '?' and '.'.
    text = ''.join(
        ch for ch in sentence
        if ch in ' ?.' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '0' <= ch <= '9'
    )

    # Serialize the request body with json.dumps instead of concatenating
    # strings, so the payload is always well-formed JSON.
    data = json.dumps({
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}},
    })

    attempts = 0
    while True:
        response = requests.post(url=url, data=data)
        j = json.loads(response.content)
        if 'error' not in j:
            break
        # Only rate limiting (429) and server-side failures (5xx) can succeed
        # on a retry; any other error would make this loop spin forever.
        retryable = response.status_code == 429 or response.status_code >= 500
        if not retryable:
            break
        if max_retries is not None and attempts >= max_retries:
            break
        attempts += 1
        time.sleep(retry_delay)

    try:
        d['toxicity'] = float(j['attributeScores']['TOXICITY']['summaryScore']['value'])
    except (KeyError, TypeError, ValueError):
        # Best effort: fall back to a neutral score when the response carries
        # no usable toxicity value.
        d['toxicity'] = 0.0
    return d

# Start from the top of the binary-labeling file and drop its header row.
fileobjs[0].seek(0)
next(readers[0], None)
all_data = list(readers[0])

# Map each sample ID (column 0) to the feature dict of its message (column 1).
feature_data = {row[0]: features(row[1]) for row in all_data}

# Leave the file rewound for the next cell that reads it.
fileobjs[0].seek(0)


0

In [4]:
import numpy
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def data_process(num_features, num_samples=1000):
    """Build the (samples, features) matrix for the sklearn models.

    Rows are taken from the module-level ``feature_data`` for the string IDs
    "1" .. str(num_samples); each row holds that sample's feature values in
    dict insertion order.

    Args:
        num_features: number of values every row is expected to contain.
        num_samples: how many consecutive sample IDs to read (default 1000).

    Returns:
        2-D numpy array with one row per sample.

    Raises:
        ValueError: from numpy.stack when rows differ in length.
    """
    rows = [
        numpy.array(list(feature_data[str(sample_id)].values()))
        for sample_id in range(1, num_samples + 1)
    ]
    # Check row lengths before assembling the matrix, so offending rows are
    # printed even when stacking then fails (e.g. perspective api failures).
    for row in rows:
        if len(row) != num_features:
            print(row)
    return numpy.stack(rows, axis=0)

In [5]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

feature_matrix = data_process(3)

L_classifiers_NS, L_classifiers_NNS = [], []
L_tests_NS, L_tests_NNS = [], []

# Names for the columns of feature_matrix, in positional order.
base_columns = ['readability', 'length', 'toxicity']
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    # DataFrame/Series objects are easier to hand to the sklearn models.
    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(base_columns)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    L_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    L_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, L_classifiers_NS),
                          (NNS_parts, L_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, L_classifiers_NS, L_tests_NS,
        L_classifiers_NNS, L_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.64
non-native speaker:
0.6
StrongNeutralLabeling.csv
native speaker:
0.43
non-native speaker:
0.39
WeakNeutralLabeling.csv
native speaker:
0.72
non-native speaker:
0.72
IntermediateLabeling.csv
native speaker:
0.61
non-native speaker:
0.57
PartitionsLabeling.csv
native speaker:
0.19
non-native speaker:
0.32


## Preliminary Observations

A naive hypothesis would assume higher accuracy for less expressive labeling schemes, but this does not always seem to be the case.

In terms of accuracy, we have our Weak Neutral with the highest and Strong Neutral at the lowest. What is interesting is that the Binary and Intermediate Labeling schemes have very similar accuracies, despite being farthest apart in terms of expressiveness.

### A big deciding factor of which labeling schema has the highest accuracy, appears to be how 'neutral' is expressed.

EDIT: after adding partitions-based labeling, it seems to have the lowest accuracy, decreasing as we move from the Naive Bayes Classifier to Random Forest.

## Adding Additional Features

### Adding in politeness score (from work by Prof. Danescu-Niculescu-Mizil)

We are importing code from another repo focused on measuring politeness on emails.

Possible issues with this approach:

- Does not give a singular value measuring both politeness and impoliteness. Splitting up the scoring of a text into a separate politeness and impoliteness score might skew model results.

- Words labeled as "negative" or "profane" can often be too generally applied, as they might be contained in the text but not in an offensive context. For example, the word "black" can be offensive in a racial context, but is often used just as a color for inanimate objects.

To remedy this, the features that go into this calculation have been extracted and added into the feedback:

- Please start: Sentence beginning with 'Please'.
- 1st person pl.: Sentence using the first-person plural after the start ('us' or 'we').
- Deference: Showing praise or compliment.
- SUBJUNCTIVE: Presence of subjunctive tense.
- 1st person start: Sentence starts with the first person pronoun.
- Factuality: Discussing of events in a factual manner (e.g. 'in fact').
- Hedges: Displaying hesitation or uncertainty (e.g. 'maybe' or 'suggest').
- HASNEGATIVE: Uses a negative adverb or adjective.
- Direct start: Directly delivers main topic or request (e.g. "So, can you do this for me?").
- 1st person: Using the first person after the start.
- Direct question: Addresses audience directly to ask something.
- Apologizing: Saying sorry or offering submissive sincerity.
- Indirect (greeting): Contains formal or informal greetings (e.g. 'Hey').
- 2nd person start: Sentence starts with a second person pronoun.
- HASPOSITIVE: Contains a very positive word.
- INDICATIVE: Presents a statement using indicative modals like 'can' or 'will'.
- Please: Using the word 'please' after the start.
- Gratitude: Expressing appreciation.
- HASHEDGE: Contains hedging (very similar to other feature).
- 2nd person: Contains second person pronouns after the start.

In [20]:
from Politeness_Feedback.utils import *
from Politeness_Feedback.politeness.api_util import get_scores_strategies_token_indices

# Fixed column order for the extracted strategy features. The training cells
# address these features by position in the feature matrix, so they must be
# inserted in the same order on every run; iterating a set would give an
# order that can change between interpreter sessions.
STRATEGY_ORDER = [
    'Please start', '1st person pl.', 'Deference', 'SUBJUNCTIVE',
    '1st person start', 'Factuality', 'Hedges', 'HASNEGATIVE',
    'Direct start', '1st person', 'Direct question', 'Apologizing',
    'Indirect (greeting)', '2nd person start', 'HASPOSITIVE', 'INDICATIVE',
    'Please', 'Gratitude', 'HASHEDGE', '2nd person',
]

fileobjs[0].seek(0)
next(readers[0], None)  # skip the header row
all_data = list(readers[0])

# Run the strategy extractor once per message and reuse the result for both
# the discovery pass and the feature pass.
row_strategies = [
    get_scores_strategies_token_indices(row[1])['strategies']
    for row in all_data
]

# Every strategy name that occurs anywhere in the dataset.
p_feat_set = set()
for s in row_strategies:
    p_feat_set.update(s)
print(p_feat_set)

# Known strategies first, in their fixed order; any strategy outside that
# list is appended afterwards in sorted order so the layout stays stable.
ordered_strategies = STRATEGY_ORDER + sorted(p_feat_set.difference(STRATEGY_ORDER))

for row, s in zip(all_data, row_strategies):
    r = score_text(row[1])
    entry = feature_data[row[0]]
    # Adding impolite and polite scores into model
    entry['score_impolite'] = r[1]
    entry['score_polite'] = r[2]
    # Adding extracted features into model (1 if the strategy is present)
    for strategy in ordered_strategies:
        entry[strategy] = 1 if strategy in s else 0

{'Please start', '1st person pl.', 'Deference', 'SUBJUNCTIVE', '1st person start', 'Factuality', 'Hedges', 'HASNEGATIVE', 'Direct start', '1st person', 'Direct question', 'Apologizing', 'Indirect (greeting)', '2nd person start', 'HASPOSITIVE', 'INDICATIVE', 'Please', 'Gratitude', 'HASHEDGE', '2nd person'}


In [21]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

feature_matrix = data_process(25)

P_classifiers_NS, P_classifiers_NNS = [], []
P_tests_NS, P_tests_NNS = [], []

# Names for columns 0..24 of feature_matrix, in positional order.
# NOTE(review): this assumes the features were inserted in exactly this order.
full_columns = [
    'readability', 'length', 'toxicity', 'score_impolite', 'score_polite',
    'Please start', '1st person pl.', 'Deference', 'SUBJUNCTIVE', '1st person start',
    'Factuality', 'Hedges', 'HASNEGATIVE', 'Direct start', '1st person', 'Direct question',
    'Apologizing', 'Indirect (greeting)', '2nd person start', 'HASPOSITIVE', 'INDICATIVE',
    'Please', 'Gratitude', 'HASHEDGE', '2nd person',
]
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    # All 25 features go into the frame handed to the models.
    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(full_columns)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    P_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    P_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, P_classifiers_NS),
                          (NNS_parts, P_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, P_classifiers_NS, P_tests_NS,
        P_classifiers_NNS, P_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.78
non-native speaker:
0.68
StrongNeutralLabeling.csv
native speaker:
0.54
non-native speaker:
0.48
WeakNeutralLabeling.csv
native speaker:
0.75
non-native speaker:
0.68
IntermediateLabeling.csv
native speaker:
0.56
non-native speaker:
0.54
PartitionsLabeling.csv
native speaker:
0.31
non-native speaker:
0.28


## Ablation Study

Given our current 4 features, we will be experimenting with taking them away one-at-a-time and retraining our models to see which ones are actually useful.

### Removing Reading Level

Because the subjects were adults with high levels of English comprehension (even non-native speakers), we hypothesize that removing this feature will not reduce model accuracy.

In [23]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

no_read_classifiers_NS, no_read_classifiers_NNS = [], []
no_read_tests_NS, no_read_tests_NNS = [], []

# Every feature except readability: columns 1..24 of feature_matrix.
# NOTE(review): this assumes the features were inserted in exactly this order.
no_read_columns = [
    'length', 'toxicity', 'score_impolite', 'score_polite', 'Please start',
    '1st person pl.', 'Deference', 'SUBJUNCTIVE', '1st person start', 'Factuality',
    'Hedges', 'HASNEGATIVE', 'Direct start', '1st person', 'Direct question',
    'Apologizing', 'Indirect (greeting)', '2nd person start', 'HASPOSITIVE', 'INDICATIVE',
    'Please', 'Gratitude', 'HASHEDGE', '2nd person',
]
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(no_read_columns, start=1)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    no_read_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    no_read_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, no_read_classifiers_NS),
                          (NNS_parts, no_read_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, no_read_classifiers_NS, no_read_tests_NS,
        no_read_classifiers_NNS, no_read_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.66
non-native speaker:
0.67
StrongNeutralLabeling.csv
native speaker:
0.54
non-native speaker:
0.58
WeakNeutralLabeling.csv
native speaker:
0.76
non-native speaker:
0.75
IntermediateLabeling.csv
native speaker:
0.56
non-native speaker:
0.56
PartitionsLabeling.csv
native speaker:
0.3
non-native speaker:
0.36


### Removing Document Length

We assume the experiment controlled, on average, for the attention spans of participants (both native and non-native). Coupled with our previous assumption of high levels of English comprehension among all participants, we hypothesize that document length has a negligible effect on politeness ratings.

In [24]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

no_len_classifiers_NS, no_len_classifiers_NNS = [], []
no_len_tests_NS, no_len_tests_NNS = [], []

# Readability and length are both left out: columns 2..24 of feature_matrix.
# NOTE(review): this assumes the features were inserted in exactly this order.
no_len_columns = [
    'toxicity', 'score_impolite', 'score_polite', 'Please start',
    '1st person pl.', 'Deference', 'SUBJUNCTIVE', '1st person start', 'Factuality',
    'Hedges', 'HASNEGATIVE', 'Direct start', '1st person', 'Direct question',
    'Apologizing', 'Indirect (greeting)', '2nd person start', 'HASPOSITIVE', 'INDICATIVE',
    'Please', 'Gratitude', 'HASHEDGE', '2nd person',
]
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(no_len_columns, start=2)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    no_len_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    no_len_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, no_len_classifiers_NS),
                          (NNS_parts, no_len_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, no_len_classifiers_NS, no_len_tests_NS,
        no_len_classifiers_NNS, no_len_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.73
non-native speaker:
0.63
StrongNeutralLabeling.csv
native speaker:
0.51
non-native speaker:
0.51
WeakNeutralLabeling.csv
native speaker:
0.69
non-native speaker:
0.7
IntermediateLabeling.csv
native speaker:
0.55
non-native speaker:
0.6
PartitionsLabeling.csv
native speaker:
0.35
non-native speaker:
0.38


### Politeness vs Toxicity

The APIs behind these two features should be measuring similar things. By comparing accuracy between models that use only one or the other, we can see how similar the two measures are. In addition, we take a finer-grained approach to our classifier by examining the extracted politeness features separately.

### Toxicity Only

In [27]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

justT_classifiers_NS, justT_classifiers_NNS = [], []
justT_tests_NS, justT_tests_NNS = [], []
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    # Toxicity (column 2 of feature_matrix) is the only predictor here.
    X = pd.DataFrame({'toxicity': feature_matrix[:, 2]})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    justT_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    justT_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, justT_classifiers_NS),
                          (NNS_parts, justT_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, justT_classifiers_NS, justT_tests_NS,
        justT_classifiers_NNS, justT_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.61
non-native speaker:
0.56
StrongNeutralLabeling.csv
native speaker:
0.42
non-native speaker:
0.36
WeakNeutralLabeling.csv
native speaker:
0.78
non-native speaker:
0.7
IntermediateLabeling.csv
native speaker:
0.62
non-native speaker:
0.54
PartitionsLabeling.csv
native speaker:
0.31
non-native speaker:
0.22


### Just Politeness

In [28]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

justP_classifiers_NS, justP_classifiers_NNS = [], []
justP_tests_NS, justP_tests_NNS = [], []

# Only the two politeness scores: columns 3 and 4 of feature_matrix.
politeness_columns = ['score_impolite', 'score_polite']
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(politeness_columns, start=3)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    justP_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    justP_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, justP_classifiers_NS),
                          (NNS_parts, justP_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, justP_classifiers_NS, justP_tests_NS,
        justP_classifiers_NNS, justP_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.66
non-native speaker:
0.62
StrongNeutralLabeling.csv
native speaker:
0.54
non-native speaker:
0.61
WeakNeutralLabeling.csv
native speaker:
0.73
non-native speaker:
0.66
IntermediateLabeling.csv
native speaker:
0.59
non-native speaker:
0.61
PartitionsLabeling.csv
native speaker:
0.31
non-native speaker:
0.3


### All Extracted Features

From here on, we want to ablate on these extracted features by category:

Directness:
- Please start
- 1st person start
- Direct start
- Direct question
- 2nd person start

Positive/Negative:
- HASPOSITIVE
- HASNEGATIVE
- Deference
- Gratitude
- Apologizing

(Un)certainty:
- SUBJUNCTIVE
- HASHEDGE
- INDICATIVE
- Factuality
- Hedges

Structural Ambiguity
- 1st person pl.
- 1st person
- Indirect (greeting)
- Please
- 2nd person

In [30]:
# Rewind every labeling file so each csv reader starts again at the header.
for handle in fileobjs:
    handle.seek(0)

extract_classifiers_NS, extract_classifiers_NNS = [], []
extract_tests_NS, extract_tests_NNS = [], []

# The 20 extracted strategy features: columns 5..24 of feature_matrix.
# Category of each feature is noted alongside:
#   D = directness, PN = positive/negative, U = (un)certainty,
#   S = structural ambiguity.
# NOTE(review): this assumes the features were inserted in exactly this order.
strategy_columns = [
    'Please start',         # D
    '1st person pl.',       # S
    'Deference',            # PN
    'SUBJUNCTIVE',          # U
    '1st person start',     # D
    'Factuality',           # U
    'Hedges',               # U
    'HASNEGATIVE',          # PN
    'Direct start',         # D
    '1st person',           # S
    'Direct question',      # D
    'Apologizing',          # PN
    'Indirect (greeting)',  # S
    '2nd person start',     # D
    'HASPOSITIVE',          # PN
    'INDICATIVE',           # U
    'Please',               # S
    'Gratitude',            # PN
    'HASHEDGE',             # U
    '2nd person',           # S
]
for reader in readers:
    next(reader, None)  # skip the header row
    rows = list(reader)

    X = pd.DataFrame({name: feature_matrix[:, col]
                      for col, name in enumerate(strategy_columns, start=5)})
    y_NS = pd.Series(numpy.array([row[2] for row in rows]), name='politeness')
    y_NNS = pd.Series(numpy.array([row[3] for row in rows]), name='politeness')

    # test_size=0.1 -> 90% training, 10% verification.
    # NOTE(review): no random_state is passed to train_test_split.
    NS_parts = train_test_split(X, y_NS, test_size=0.1)
    extract_tests_NS.append((NS_parts[1], NS_parts[3]))
    NNS_parts = train_test_split(X, y_NNS, test_size=0.1)
    extract_tests_NNS.append((NNS_parts[1], NNS_parts[3]))

    # Fit one forest per rater group on that group's training portion.
    for parts, bucket in ((NS_parts, extract_classifiers_NS),
                          (NNS_parts, extract_classifiers_NNS)):
        forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        forest.fit(parts[0], parts[2])
        bucket.append(forest)

# Report held-out accuracy for every labeling scheme.
for fname, ns_clf, ns_test, nns_clf, nns_test in zip(
        filenames, extract_classifiers_NS, extract_tests_NS,
        extract_classifiers_NNS, extract_tests_NNS):
    print(fname)
    print("native speaker:")
    print(ns_clf.score(ns_test[0], ns_test[1]))
    print("non-native speaker:")
    print(nns_clf.score(nns_test[0], nns_test[1]))

BinaryLabeling.csv
native speaker:
0.58
non-native speaker:
0.71
StrongNeutralLabeling.csv
native speaker:
0.5
non-native speaker:
0.44
WeakNeutralLabeling.csv
native speaker:
0.65
non-native speaker:
0.69
IntermediateLabeling.csv
native speaker:
0.54
non-native speaker:
0.6
PartitionsLabeling.csv
native speaker:
0.29
non-native speaker:
0.35
