## Subtask B - Classify UNT / TIN

In [11]:
import pandas as pd
import utils.preprocessing
from utils.test_model import *
from utils.utils import make_submission
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('data/raw/offenseval-training-v1.tsv', sep='\t')
utils.preprocessing.clean(df)
df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,clean_tweets,tokens
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,ask nativ american,"[ask, nativ, american]"
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,home drunk manga trump,"[home, drunk, manga, trump]"
2,16820,Amazon is investigating Chinese employees who ...,NOT,,,amazon investig chines employe sell intern dat...,"[amazon, investig, chines, employe, sell, inte..."
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,someon retaken piec shit volcano,"[someon, retaken, piec, shit, volcano]"
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,obama want liber amp illeg move red state,"[obama, want, liber, amp, illeg, move, red, st..."


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

X = df["clean_tweets"]
df_test = pd.read_csv('data/test/task_b/testset-taskb.tsv', sep='\t')
utils.preprocessing.clean(df_test)
df_test.head()

Unnamed: 0,id,tweet,clean_tweets,tokens
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,whose wherestheserv dumpsit declasfisa democra...,"[whose, wherestheserv, dumpsit, declasfisa, de..."
1,60133,#NoPasaran: Unity demo to oppose the far-right...,nopasaran uniti demo oppos far right london an...,"[nopasaran, uniti, demo, oppos, far, right, lo..."
2,83681,. . . What the fuck did he do this time?,fuck time,"[fuck, time]"
3,65507,@USER Do you get the feeling he is kissing @US...,feel kiss behind humili later,"[feel, kiss, behind, humili, later]"
4,12588,@USER Nigga ware da hits at,nigga ware hit,"[nigga, ware, hit]"


### Preprocessing to remomve NaNs in labels

In [13]:
vec = CountVectorizer(ngram_range=(1,3), stop_words='english', max_features=38269)

y_train = df["subtask_b"].map({"UNT":0, "TIN":1})

all_tin = pd.Series(1, index=np.arange(len(y_train)))
y_fill = y_train.copy()
y_fill[np.isnan(y_fill)] = 0
print('F1 score ALL TIN', round(metrics.f1_score(y_fill,all_tin),4))

# remove rows with NaN as label in y_train and X then build X_train
(nan_idx,) = np.where(~np.isnan(y_train))
print('Nan proportion (removed): {}%'.format(100*round(1-len(nan_idx) / y_train.shape[0], 4)))

# clean from NaNs ans reset index to prevent errors in testing
y_train = y_train.dropna().reset_index()['subtask_b']
X_new = X.iloc[nan_idx].reset_index()['clean_tweets']

X_train = vec.fit_transform(X_new)
X_test = vec.transform(df_test["clean_tweets"])

print(y_train.value_counts())

print('Category split in cleaned dataset:')
print('\tTIN: {}%'.format(100* round(y_train.value_counts()[1] / y_train.count(), 4)))
print('\tUNT: {}%'.format(100* round(y_train.value_counts()[0] / y_train.count(), 4)))

F1 score ALL TIN 0.4529
Nan proportion (removed): 66.77%
1.0    3876
0.0     524
Name: subtask_b, dtype: int64
Category split in cleaned dataset:
	TIN: 88.09%
	UNT: 11.91%


### Logistic Regression

In [14]:
logreg = LogisticRegression(C=2, dual=False, class_weight='balanced', solver='sag')
print(logreg)
print('=================================================================')

# Train Model
print('=========================== Training ============================')
logreg.fit(X_train, y_train)

# Evaluate Model
print('========================== Evaluating ===========================')
print('Accuracy:\t{}'.format(round(logreg.score(X_train, y_train),4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_logreg, acc_logreg) = test_single_model(logreg, X_new, y_train, vec, n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_logreg,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_logreg,4)))

# Predict on test set and write submission
y_pred_proba_logreg = logreg.predict_proba(X_test)
y_pred_logreg = np.argmax(y_pred_proba_logreg,1)
make_submission(y_pred_logreg, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_logreg.csv")

LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='sag', tol=0.0001, verbose=0, warm_start=False)




Accuracy:	0.9711
Mean F1 score (3-fold CV):	0.5815
Mean Accuracy (3-fold CV):	0.8277


### Ridge Regression

In [15]:
ridge = RidgeClassifier(alpha=1e-2, solver='sag')
print(ridge)
print('=================================================================')

# Train Model
print('=========================== Training ============================')
ridge.fit(X_train, y_train)

# Evaluate Model
print('========================== Evaluating ===========================')
print('Accuracy:\t{}'.format(round(ridge.score(X_train, y_train),4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_ridge, acc_ridge) = test_single_model(ridge, X_new, y_train, vec, n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_ridge,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_ridge,4)))

# Predict on test set and write submission
y_pred_ridge = ridge.predict(X_test)
make_submission(y_pred_ridge, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_ridge.csv")

RidgeClassifier(alpha=0.01, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=None, solver='sag', tol=0.001)
Accuracy:	0.9891
Mean F1 score (3-fold CV):	0.5231
Mean Accuracy (3-fold CV):	0.8211


### Linear SVC

In [29]:
from sklearn.svm import SVC

svc = SVC(decision_function_shape='ovo', probability=True, gamma='auto')
print(svc)
print('=================================================================')

# Train Model
print('=========================== Training ============================')
svc.fit(X_train, y_train)

# Evaluate Model
print('========================== Evaluating ===========================')
print('Accuracy:\t{}'.format(round(svc.score(X_train, y_train),4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_svc, acc_svc) = test_single_model(svc, X_new, y_train, vec, n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_svc,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_svc,4)))

# Predict on test set and write submission
y_pred_proba_svc = svc.predict_proba(X_test)
y_pred_svc = np.argmax(y_pred_proba_svc,1)
make_submission(y_pred_svc, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_svc.csv")

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy:	0.8809
Mean F1 score (3-fold CV):	0.4683
Mean Accuracy (3-fold CV):	0.8809


### Random Forest

##### data “augmentation“

In [30]:
from scipy.sparse import hstack

word_vec = CountVectorizer(ngram_range=(1,3), stop_words='english',analyzer='word', max_features=32500)
train_word_features = word_vec.fit_transform(X_new)
test_word_features = word_vec.fit_transform(df_test["clean_tweets"])

char_vec = CountVectorizer(ngram_range=(1,3), stop_words='english',analyzer='char', max_features=32500)
train_char_features = char_vec.fit_transform(X_new)
test_char_features = char_vec.fit_transform(df_test["clean_tweets"])

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [31]:
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini',
            max_depth=100, max_features=1000, max_leaf_nodes=50,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
print(rfc)
print('=================================================================')

# Train Model
print('=========================== Training ============================')
rfc.fit(train_features, y_train)

# Evaluate Model
print('========================== Evaluating ===========================')
print('Accuracy:\t{}'.format(round(rfc.score(train_features, y_train),4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_rfc, acc_rfc) = test_single_model(rfc, X_new, y_train, [word_vec, char_vec], 
                                      n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_rfc,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_rfc,4)))

# Predict on test set and write submission
y_pred_proba_rfc = rfc.predict_proba(X_test)
y_pred_rfc = np.argmax(y_pred_proba_rfc,1)
make_submission(y_pred_rfc, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_rfc.csv")

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features=1000,
            max_leaf_nodes=50, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy:	0.7882
Mean F1 score (3-fold CV):	0.5697
Mean Accuracy (3-fold CV):	0.7664


In [32]:
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini',
            max_depth=100, max_features=1000, max_leaf_nodes=50,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
print(rfc)
print('=================================================================')

# Train Model
print('=========================== Training ============================')
rfc.fit(X_train, y_train)

# Evaluate Model
print('========================== Evaluating ===========================')
print('Accuracy:\t{}'.format(round(rfc.score(X_train, y_train),4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_rfc, acc_rfc) = test_single_model(rfc, X_new, y_train, vec, 
                                      n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_rfc,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_rfc,4)))

# Predict on test set and write submission
y_pred_proba_rfc = rfc.predict_proba(X_test)
y_pred_rfc = np.argmax(y_pred_proba_rfc,1)
make_submission(y_pred_rfc, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_rfc.csv")

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features=1000,
            max_leaf_nodes=50, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy:	0.7302
Mean F1 score (3-fold CV):	0.5609
Mean Accuracy (3-fold CV):	0.717


### Ensemble : RF + Logistic Regression

In [34]:
y_pred_proba_ens = (y_pred_proba_logreg + y_pred_proba_rfc) / 2.0
y_pred_ens = np.argmax(y_pred_proba_ens, 1)
make_submission(y_pred_ens, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_ens_rf_lr.csv")

f1_lr_rf, acc_lf_rf = test_voting_model([logreg, rfc], X_new, y_train, {'sklearn':vec}, n_splits=3, random_state=1)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_lr_rf,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_lf_rf,4)))



Mean F1 score (3-fold CV):	0.5933
Mean Accuracy (3-fold CV):	0.8193


### Imports & Manip for Deep Learning methods

In [6]:
from keras.utils import to_categorical
from keras.preprocessing import sequence
from utils.keras_utils import f1_loss
from keras.models import Sequential
from keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, Dense
from keras.preprocessing import text
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
from keras.utils.vis_utils import plot_model
import graphviz

tokenizer = text.Tokenizer(num_words=15000)
tokenizer.fit_on_texts(X_new)

X_test = df_test["clean_tweets"]
list_tokenized_train = tokenizer.texts_to_sequences(X_new)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

y_cat = to_categorical(y_train)

X_tr = sequence.pad_sequences(list_tokenized_train, maxlen=100)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=100)

### CNN

In [9]:
# Define Model
cnn = Sequential()
cnn.add(Embedding(15000, 16, input_length=100))
cnn.add(Conv1D(32, 5, activation='relu'))
cnn.add(MaxPooling1D(4))
cnn.add(Flatten())
cnn.add(Dropout(rate=0.25))
cnn.add(Dense(units=16, activation='relu'))
cnn.add(Dropout(rate=0.15))
cnn.add(Dense(units=8, activation='relu'))
cnn.add(Dense(units=2, activation='softmax'))
cnn.compile(loss=f1_loss, optimizer='adam', metrics=['accuracy'])
print(cnn.summary())
print('=================================================================')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 16)           240000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 96, 32)            2592      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 24, 32)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 768)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                12304     
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
__________

In [None]:
# Train Model
print('=========================== Training ============================')
cnn.fit(X_tr, y_cat, epochs=10,  batch_size=64, verbose=0)

# Evaluate Model
print('========================== Evaluating ===========================')
evaluation_cnn = cnn.evaluate(X_tr, y_cat, batch_size=64, verbose=0)
print('{}:\t{}'.format(cnn.metrics_names[0], round(evaluation_cnn[0],4)))
print('{}:\t{}'.format(cnn.metrics_names[1], round(evaluation_cnn[1],4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_cnn, acc_cnn) = test_single_model(cnn, X_new, y_train, tokenizer, epochs=10,
                                      n_splits=3, random_state=1, verbose=0)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_cnn,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_cnn,4)))

# Predict on test set and write submission
y_pred_proba_cnn = cnn.predict(X_te)
y_pred_cnn = np.argmax(y_pred_proba_cnn, axis=1)
make_submission(y_pred_cnn, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_cnn.csv")

### LSTM

In [9]:
# Define Model
lstm = Sequential()
lstm.add(Embedding(15000, 16, input_length=100))
lstm.add(Bidirectional(LSTM(16, return_sequences=True)))
lstm.add(GlobalMaxPool1D())
lstm.add(Dropout(0.15))
lstm.add(Dense(8, activation="relu"))
lstm.add(Dense(2, activation="softmax"))
lstm.compile(loss=f1_loss, optimizer='adam', metrics=['accuracy'])
print(lstm.summary())
print('=================================================================')

# Train Model
print('=========================== Training ============================')
lstm.fit(X_tr, y_cat, epochs=10,  batch_size=64, verbose=0)

# Evaluate Model
print('========================== Evaluating ===========================')
evaluation_lstm = lstm.evaluate(X_tr, y_cat, batch_size=64, verbose=0)
print('{}:\t{}'.format(lstm.metrics_names[0], round(evaluation_lstm[0],4)))
print('{}:\t{}'.format(lstm.metrics_names[1], round(evaluation_lstm[1],4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_lstm, acc_lstm) = test_single_model(lstm, X_new, y_train, tokenizer, epochs=10,
                                        n_splits=3, random_state=1, verbose=0)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_lstm,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_lstm,4)))

# Predict on test set and write submission
y_pred_proba_lstm = lstm.predict(X_te)
y_pred_lstm = np.argmax(y_pred_proba_lstm, axis=1)
make_submission(y_pred_lstm, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_lstm.csv")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 16)           240000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 32)           4224      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 18        
Total params: 244,506
Trainable params: 244,506
Non-trainable params: 0
_________________________________________________________________
None

### Bi-LSTM + ConvLayer

In [36]:
blc = Sequential()
blc.add(Embedding(15000, 8, input_length=100, trainable=True))
blc.add(Bidirectional(LSTM(4, return_sequences=True, dropout=0.15, recurrent_dropout=0.15)))
blc.add(Conv1D(8, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'))
blc.add(GlobalMaxPool1D())
blc.add(Dropout(0.5))
blc.add(Dense(16, activation='relu'))
blc.add(Dense(2, activation='softmax'))
blc.compile(loss=f1_loss, optimizer='adam', metrics=['accuracy'])
print(blc.summary())
print('=================================================================')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 100, 8)            120000    
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 100, 8)            416       
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 98, 8)             200       
_________________________________________________________________
global_max_pooling1d_13 (Glo (None, 8)                 0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 8)                 0         
_________________________________________________________________
dense_39 (Dense)             (None, 16)                144       
_________________________________________________________________
dense_40 (Dense)             (None, 2)                 34        
Total para

In [None]:
blc = Sequential()
blc.add(Embedding(15000, 16, input_length=100, trainable=True))
blc.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.15, recurrent_dropout=0.15)))
blc.add(Conv1D(16, kernel_size=5))
blc.add(Dropout(0.25))
blc.add(Conv1D(8, kernel_size=5))
blc.add(Dropout(0.25))
blc.add(Conv1D(8, kernel_size=5))
blc.add(Flatten())
blc.add(Dropout(0.5))
blc.add(Dense(8, activation='relu'))
blc.add(Dense(2, activation='softmax'))
blc.compile(loss=f1_loss, optimizer='adam', metrics=['accuracy'])
print(blc.summary())
print('=================================================================')

In [43]:
# Train Model
print('=========================== Training ============================')
blc.fit(X_tr, y_cat, epochs=4,  batch_size=64, verbose=0)

# Evaluate Model
print('========================== Evaluating ===========================')
evaluation_blc = blc.evaluate(X_tr, y_cat, batch_size=64, verbose=0)
print('{}:\t{}'.format(blc.metrics_names[0], round(evaluation_blc[0],4)))
print('{}:\t{}'.format(blc.metrics_names[1], round(evaluation_blc[1],4)))

# K-Fold Cross Validation
print('========================== Validating ===========================')
(f1_blc, acc_blc) = test_single_model(blc, X_new, y_train, tokenizer, epochs=4,
                                      n_splits=3, random_state=1, verbose=0)
print('Mean F1 score (3-fold CV):\t{}'.format(round(f1_blc,4)))
print('Mean Accuracy (3-fold CV):\t{}'.format(round(acc_blc,4)))

# Predict on test set and write submission
y_pred_proba_blc = blc.predict(X_te)
y_pred_blc = np.argmax(y_pred_proba_blc, axis=1)
make_submission(y_pred_blc, {0:"UNT", 1:"TIN"}, df_test, "submissions/taskb_blc.csv")

loss:	0.3864
acc:	0.8845
Mean F1 score (3-fold CV):	0.6554
Mean Accuracy (3-fold CV):	0.845


## Semi Supervised Learning ?