In [1]:
import pandas as pd 
import utils.preprocessing
from utils.utils import make_submission
import numpy as np

# Load the OffensEval training set (tab-separated) and pass it through the
# project's cleaning step. The preview rendered below this cell shows
# `clean_tweets` and `tokens` columns next to the raw columns.
TRAIN_PATH = 'data/raw/offenseval-training-v1.tsv'
df = pd.read_csv(TRAIN_PATH, sep='\t')
utils.preprocessing.clean(df)  # return value is not used; `df` is not reassigned
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/barthelemyduthoit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c,clean_tweets,tokens
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,,ask nativ american,"[ask, nativ, american]"
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,home drunk manga trump,"[home, drunk, manga, trump]"
2,16820,Amazon is investigating Chinese employees who ...,NOT,,,amazon investig chines employe sell intern dat...,"[amazon, investig, chines, employe, sell, inte..."
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,someon retaken piec shit volcano,"[someon, retaken, piec, shit, volcano]"
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,obama want liber amp illeg move red state,"[obama, want, liber, amp, illeg, move, red, st..."


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Cleaned training tweets: the text input shared by the models below.
X = df["clean_tweets"]

# Bag-of-words vectorizer: unigrams through trigrams, English stop words
# dropped, vocabulary capped at 32,500 features.
vectorizer_options = dict(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=32500,
)
vec = CountVectorizer(**vectorizer_options)

# Subtask A test set, passed through the same cleaning step as the training data.
TEST_PATH_TASK_A = 'data/test/task_a/testset-taska.tsv'
df_test = pd.read_csv(TEST_PATH_TASK_A, sep='\t')
utils.preprocessing.clean(df_test)

## Subtask A

In [3]:
# Fit the vocabulary on the training tweets only, then reuse it for the test tweets.
X_train = vec.fit_transform(X)
X_test = vec.transform(df_test["clean_tweets"])

# Encode the subtask A labels as integers: OFF -> 0, NOT -> 1.
label_to_id = {"OFF": 0, "NOT": 1}
y_train = df["subtask_a"].map(label_to_id)

### Logistic Regression

In [4]:
# Logistic regression baseline on the bag-of-words counts.
# dual=True is only implemented by the liblinear solver (with the L2 penalty),
# so the solver is named explicitly instead of relying on the library default,
# which is not liblinear in recent scikit-learn releases.
clf = LogisticRegression(C=4, dual=True, solver="liblinear")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
#make_submission(y_pred, {0:"OFF", 1:"NOT"}, df_test, "submissions/taska_logreg.csv")



### LSTM

In [None]:
from keras.utils import to_categorical
from keras.preprocessing import sequence
from utils.keras_utils import f1_loss
from keras.models import Sequential
from keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, Dense
from keras.preprocessing import text

# Hyperparameters shared by the tokenizer, the padding step and the embedding
# layer; keeping them in one place prevents the three from drifting apart.
MAX_WORDS = 15000    # tokenizer `num_words` and embedding input dimension
MAX_LEN = 100        # length every token sequence is padded/truncated to
EMBEDDING_DIM = 16   # size of each learned word vector

tokenizer = text.Tokenizer(num_words=MAX_WORDS)

# Embedding -> bidirectional LSTM -> max-pool over time -> small dense head
# with a 2-way softmax output.
model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN))
model.add(Bidirectional(LSTM(6, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(6, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(2, activation="softmax"))

# One-hot encode the integer labels to match the 2-unit softmax output.
y_cat = to_categorical(y_train)

# Build the word index from the training tweets only, then convert both the
# training and the test tweets to integer sequences with it.
tokenizer.fit_on_texts(X)
list_tokenized_train = tokenizer.texts_to_sequences(X)
# Use a dedicated name for the raw test text: rebinding `X_test` here would
# replace the vectorized test matrix that the scikit-learn cells predict on.
test_texts = df_test["clean_tweets"]
list_tokenized_test = tokenizer.texts_to_sequences(test_texts)
X_tr = sequence.pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=MAX_LEN)

model.compile(loss=f1_loss, optimizer='adam', metrics=['accuracy'])
model.fit(X_tr, y_cat, epochs=4,  batch_size=64)

# Column index of the larger softmax output is the predicted integer label.
y_pred_proba = model.predict(X_te)
y_pred = np.argmax(y_pred_proba, axis=1)
#make_submission(y_pred, {0:"OFF", 1:"NOT"}, df_test, "submissions/taska_lstm.csv")

### Ensemble RF + Logistic Regression

In [12]:
# Soft-voting ensemble: add the class probabilities predicted by a logistic
# regression and a random forest, then pick the class with the larger total.
SEED = 42  # fixes the random forest so the submission file is reproducible

# Vectorize the test tweets locally so this cell does not depend on whatever
# `X_test` currently refers to in the kernel.
X_test_counts = vec.transform(df_test["clean_tweets"])

# dual=True is only implemented by the liblinear solver, so name it explicitly.
logreg = LogisticRegression(C=4, dual=True, solver="liblinear")
logreg.fit(X_train, y_train)
y_pred_proba_logreg = logreg.predict_proba(X_test_counts)

rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(X_train, y_train)
y_pred_proba_rf = rf.predict_proba(X_test_counts)

# Both models were fit on the same 0/1 labels, so their probability columns
# line up and the argmax column index is the predicted label.
y_pred_proba = y_pred_proba_logreg + y_pred_proba_rf
y_pred = np.argmax(y_pred_proba, axis=1)
make_submission(y_pred, {0:"OFF", 1:"NOT"}, df_test, "submissions/taska_logreg+rf.csv")

