# Experiments

In [44]:
import numpy as np
import pandas as pd
import sys
import mysql.connector
# Add the ptdraft folder path to the sys.path list
sys.path.append('../')
from labov import datasets, classifier, evaluation
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle
import spacy
nlp = spacy.load('en', disable=['parser'])

In [2]:
df = pd.read_json('../../data/labov01_b.json', lines=True)
df.sample(5)

Unnamed: 0,text,user_id,y
7382,If I can give this place a zero I would!!!! Ma...,4_XXK2SawW_gsdq9dF33Xg,1
54101,Dinner has always been excellent so no complai...,R4bOjG3V6UAD8w56_CjEJA,2
28654,"Best Margs EVER!!! Seriously. Cheap beer, grea...",gHQtfx1BP13ob8qTmm9hXQ,1
33259,Is it just me or are a lot of the restaurants ...,Iec0Qou1MH6zk7I4y_4nrA,2
16371,"I really love how clean it is here, they use f...",Bd0CpWVJbYCl0iI4gLqkuQ,2


In [None]:
def tag(text):
    tags = ' '.join((w.pos_ for w in nlp(text)))
    return tags

xs = shuffle(df.text, n_samples=10)

In [3]:
smallest = df.y.value_counts().min()
tmp = []
for label in set(df.y):
    tmp.append(df[df.y == label].sample(smallest, random_state=42))
df_balanced = pd.concat(tmp)

## Training and CV on balanced dataset

### Lexicalized model

In [68]:
%%time
x, y = shuffle(df_balanced.text, df_balanced.y, random_state=42)
# x = [tag(x) for x in xs]
print("running the baseline...")
random = cross_val_score(classifier.random, x, y, cv=5, n_jobs=-1)
print("done!")
print("Accuracy: %0.2f (+/- %0.2f)" % (random.mean(), random.std() * 2))

print("fitting model...")
model_lexi = classifier.simple.fit(x, y)
y_pred = cross_val_predict(classifier.simple, x, y, cv=10, n_jobs=-1)
print("done!")
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(evaluation.run(y_pred, y))

running the baseline...
done!
Accuracy: 0.24 (+/- 0.05)
fitting model...
done!
0.4765625
             precision    recall  f1-score   support

          1       0.55      0.51      0.53       139
          2       0.34      0.39      0.36       114
          3       0.46      0.48      0.47       124
          4       0.55      0.52      0.53       135

avg / total       0.48      0.48      0.48       512

[[71 40 13 15]
 [31 44 27 12]
 [12 22 59 31]
 [14 22 29 70]]
0
CPU times: user 906 ms, sys: 114 ms, total: 1.02 s
Wall time: 4.97 s


### Bleaching

In [93]:
from collections import Counter
wordCount = Counter(df_balanced.text.str.cat(sep='\n').split())

def bleach_0(text:str, word_dic=None)->str:
    shapes = ' '.join([w.shape_ for w in nlp(text)])
    return shapes

def bleach_1(text:str, word_dic=None)->str:
    bleached = ' '.join(['_'.join((w.shape_, 
                                   str(len(w)))) for w in nlp(text)])
    return bleached

def bleach_2(text:str, word_dic=None)->str:
    bleached = ' '.join(['_'.join((w.shape_, 
                                   str(len(w)),
                                   w.pos_)) for w in nlp(text)])
    return bleached


def bleach_3(text:str, word_dic=None)->str:
    bleached = ' '.join(['_'.join((w.shape_, 
                                   str(len(w)),
                                   str(w.is_alpha),
                                   approximate(w.text),
                                   str(wordCount[w.text]))) for w in nlp(text)])
    return bleached

def bleach_4(text:str, word_dic=None)->str:
    bleached = ' '.join(['_'.join((w.shape_, 
                                   str(len(w)),
                                   w.pos_,
                                   str(w.is_alpha))) for w in nlp(text)])
    return bleached

def lexibleach_0(text:str, word_dic=None)->str:
    bleached = ' '.join(['_'.join((w.shape_, 
                                   str(len(w)),
                                   w.pos_,
                                   w.text,
                                   str(w.is_alpha))) for w in nlp(text)])
    return bleached

def lexionly(text:str, word_dic=None)->str:
    bleached = ' '.join([''.join((w.text)) for w in nlp(text)])
    return bleached

import re

def approximate(word):
    vowels = '[aeiou]'
    consonants = '[^aeiou]'
    others = '[^a-Z]' # TODO
    tmp = re.sub(consonants, 'C', word.replace(" ",""))
    result = re.sub(vowels,'V', tmp)
    return result

In [94]:
%%time
x, y = shuffle(df_balanced.text.apply(lambda x: bleach_3(x)), 
               df_balanced.y, 
               random_state=42)

print("fitting model...")
model = classifier.simple.fit(x, y)
y_pred = cross_val_predict(classifier.simple, x, y, cv=10, n_jobs=-1)
print("done")
print(evaluation.run(y, y_pred))

fitting model...
done
0.443359375
             precision    recall  f1-score   support

          1       0.44      0.49      0.47       128
          2       0.32      0.30      0.31       128
          3       0.46      0.45      0.45       128
          4       0.53      0.54      0.53       128

avg / total       0.44      0.44      0.44       512

[[63 41 12 12]
 [46 38 26 18]
 [15 25 57 31]
 [18 13 28 69]]
0
CPU times: user 37.1 s, sys: 31.1 s, total: 1min 8s
Wall time: 22.3 s


### Train bleached model on all dataset

In [99]:
nonen = pd.read_json('../../data/non-english-author.json', lines=True)
allen = pd.concat([df[['text', 'y']], nonen])

In [100]:
allen_min = allen.y.value_counts().min()

allen_tmp = []
for label in set(allen.y):
    allen_tmp.append(allen[allen.y == label].sample(allen_min, random_state=42))
allen_corpus = pd.concat(allen_tmp)

In [103]:
print("fitting model...")
x = allen_corpus.text.apply(lambda x: bleach_3(x))
y = allen_corpus.y
y_pred = cross_val_predict(classifier.ngram, x, y, cv=10, n_jobs=-1)
print("done")
print(evaluation.run(y, y_pred))

fitting model...
done
0.4836409395973154
             precision    recall  f1-score   support

          1       0.50      0.51      0.50       596
          2       0.43      0.42      0.43       596
          3       0.45      0.43      0.44       596
          4       0.54      0.58      0.56       596

avg / total       0.48      0.48      0.48      2384

[[302 193  51  50]
 [210 253  87  46]
 [ 55  94 255 192]
 [ 35  45 173 343]]
0


## Testing

In [60]:
%%time
# here we test on the rest of the dataset
test = pd.concat([df, df_balanced]).drop_duplicates(keep=False)
test_smallest = test.y.value_counts().min()

test_tmp = []
for label in set(test.y):
    test_tmp.append(test[test.y == label].sample(test_smallest, random_state=42))
test_corpus = pd.concat(test_tmp)

print(test_corpus.shape)

y_test = model.predict(test_corpus.text.apply(lambda x: bleach_3(x)))

print(classification_report(test_corpus.y, y_test))
print(accuracy_score(test_corpus.y, y_test))

(5181, 3)
             precision    recall  f1-score   support

          1       0.52      0.55      0.53      1727
          2       0.41      0.30      0.35      1727
          3       0.65      0.50      0.57      1727
          4       0.00      0.00      0.00         0

avg / total       0.53      0.45      0.48      5181

0.45010615711252655
CPU times: user 7min 7s, sys: 5min 51s, total: 12min 58s
Wall time: 3min 49s


  'recall', 'true', average, warn_for)


## Multi-lingual testing

In [61]:
%%time
# testing the non lexicalized model
multilang = pd.read_json('../../data/non-english-author.json', lines=True)
multi_test_smallest = multilang.y.value_counts().min()

multi_tmp = []
for label in set(multilang.y):
    multi_tmp.append(multilang[multilang.y == label].sample(multi_test_smallest, random_state=42))
multi_corpus = pd.concat(multi_tmp)

print(multi_corpus.shape)

y_test = model.predict(multi_corpus.text.apply(lambda x: bleach_3(x)))

print(classification_report(multi_corpus.y, y_test))
print(accuracy_score(multi_corpus.y, y_test))

(1872, 2)
             precision    recall  f1-score   support

          1       0.55      0.07      0.13       468
          2       0.26      0.19      0.22       468
          3       0.27      0.75      0.40       468
          4       0.34      0.13      0.18       468

avg / total       0.36      0.29      0.24      1872

0.28739316239316237
CPU times: user 1min 34s, sys: 1min 21s, total: 2min 56s
Wall time: 50.6 s


In [66]:
%%time
# testing the lexicalized model
multilang = pd.read_json('../../data/non-english-author.json', lines=True)
multi_test_smallest = multilang.y.value_counts().min()

multi_tmp = []
for label in set(multilang.y):
    multi_tmp.append(multilang[multilang.y == label].sample(multi_test_smallest, random_state=42))
multi_corpus = pd.concat(multi_tmp)

print(multi_corpus.shape)

y_test = model_lexi.predict(multi_corpus.text)

print(classification_report(multi_corpus.y, y_test))
print(accuracy_score(multi_corpus.y, y_test))

(1872, 2)
             precision    recall  f1-score   support

          1       0.30      0.01      0.02       468
          2       0.00      0.00      0.00       468
          3       0.25      0.15      0.19       468
          4       0.26      0.87      0.40       468

avg / total       0.20      0.26      0.15      1872

0.25801282051282054
CPU times: user 2.5 s, sys: 776 µs, total: 2.5 s
Wall time: 2.5 s


## Feature analysis

In [104]:
model.named_steps['features'].get_feature_names()

['X',
 'X X',
 'X X_12_False_CCVCCVCCCCCC_1',
 'X X_3_False_CCC_1',
 'X Xx_5_False_CCCCC_1',
 'X Xxxxx_9_False_CCCCVCCVC_1',
 'X Xxxxx_9_False_CCCCVCVVC_1',
 'X _2_False_CC_1',
 'X _4_False_CCCC_0',
 'X _4_False_CCCC_1',
 'X _6_False_CCCCCC_1',
 'X dd_4_False_CCCC_0',
 'X x_14_False_CVCCVCVCCCCCCC_1',
 'X x_3_False_CCC_89',
 'X x_7_False_CVCCCCC_1',
 'X x_8_False_CVCCCCCC_1',
 'X xx_10_False_CVVCCCCCCV_1',
 'X xx_10_False_VVCCCCCCCV_1',
 'X xx_11_False_CCCVCCCCCCV_1',
 'X xx_11_False_CCVCVCCCCCV_1',
 'X xx_11_False_CCVVCCCCCCV_1',
 'X xx_12_False_CCVCVCCCCCCV_1',
 'X xx_14_False_CCCCCCCCCCCCCV_1',
 'X xx_15_False_CVCVCVVVCCCCCCV_1',
 'X xx_4_False_CCVV_1',
 'X xx_5_False_CCCCV_0',
 'X xxx_5_False_CCCVC_1',
 'X xxx_5_False_CCCVV_0',
 'X xxxx_9_False_CCVCVCCVC_0',
 'XX',
 'XX X_4_False_CCCC_1',
 'XX Xx_6_False_CCCCCV_1',
 'XXX',
 'XXX X_5_False_CCCCC_1',
 'XXX X_7_False_CCCCCCC_1',
 'XXX Xx_7_False_CCCCCCV_1',
 'XXX Xxxx_9_False_CCCCCCCVV_0',
 'XXX Xxxxx_11_False_CCCCCCVVCCC_1',
 'XXXX',

## Neural Experiments

