In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR, SVC
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import gensim
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

In [3]:
def read_df(filename, names):
    df = pd.read_csv(filename, sep='\t', header=None, names=names)
    df["s_sub_token_len"] = [len(s.split()) for s in df["sub"]]
    df["s_sub_char_len"] = [len(s) for s in df["sub"]]
    df["s_capitalized"] = [1 if s[0].isupper() else 0 for s in df["sub"]]
    return df

In [4]:
COL_NAMES = ["idx", "text", "l", "r", "sub", "n1", "n2", "c1", "c2", "p"]
train_df = read_df('data/train_full.txt', COL_NAMES)
train_df.describe()

Unnamed: 0,idx,l,r,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_capitalized
count,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0
mean,7001.5,83.727753,92.100486,10.0,10.0,0.902014,0.860591,0.08813,1.220968,8.372732,0.236323
std,4042.17357,66.602408,66.819266,0.0,0.0,1.949611,1.894848,0.181183,0.630302,5.086451,0.424838
min,1.0,0.0,2.0,10.0,10.0,0.0,0.0,0.0,1.0,2.0,0.0
25%,3501.25,32.0,40.0,10.0,10.0,0.0,0.0,0.0,1.0,5.0,0.0
50%,7001.5,71.0,79.0,10.0,10.0,0.0,0.0,0.0,1.0,7.0,0.0
75%,10501.75,120.0,129.0,10.0,10.0,1.0,1.0,0.1,1.0,9.0,0.0
max,14002.0,647.0,656.0,10.0,10.0,10.0,10.0,1.0,11.0,49.0,1.0


In [5]:
train_df.sample(50)

Unnamed: 0,idx,text,l,r,sub,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_capitalized
5287,5288,Sunni gunmen stand in the middle of Syria Stre...,99,107,northern,10,10,0,0,0.0,1,8,0
7515,7516,The shooting happened at a house in Port St. J...,45,49,John,10,10,0,0,0.0,1,4,1
1290,1291,Unless she left a note somewhere or told someb...,36,40,told,10,10,0,0,0.0,1,4,0
5667,5668,Mladic reportedly gave a thumbs-up and clapped...,68,73,court,10,10,0,0,0.0,1,5,0
3585,3586,Manila and Beijing contest sovereignty over th...,27,38,sovereignty,10,10,8,8,0.8,1,11,0
6647,6648,"You don’t need to sign agreements, you need to...",104,111,Mujahid,10,10,0,0,0.0,1,7,1
7003,7004,The Philippine Department of Foreign Affairs s...,103,110,sighted,10,10,1,2,0.15,1,7,0
13894,13895,Successive waves of bank sector clean-ups have...,25,31,sector,10,10,1,0,0.05,1,6,0
10129,10130,Ben-Dor said that Levanon's recently named suc...,76,81,leave,10,10,0,0,0.0,1,5,0
9175,9176,The Afghan people will understand that the Uni...,126,139,signing table,10,10,0,0,0.0,2,13,0


In [7]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin', binary=True) 
print("Loaded model!")

Loaded model!


In [8]:
POS_TAG_CACHE = {}

def pos_tag(text):
    global POS_TAG_CACHE
    if text not in POS_TAG_CACHE:
        POS_TAG_CACHE[text] = nltk.pos_tag(
            nltk.word_tokenize(text),
            # tagset='universal',
        )
    return POS_TAG_CACHE[text]

In [9]:
DATA = {}

In [10]:
def get_syn(df, data, include_pos=True, testing=False):
    cols = []
    for col in df.columns:
        if col.startswith("s_"):
            cols.append(df[col].values)
    ret = np.column_stack(cols)
    
    if include_pos:
        pos_tags = []
        for idx in range(len(df)):
            target = nltk.word_tokenize(df['sub'][idx])[0]
            tag = ""
            for w, t in pos_tag(df['text'][idx]):
                if w == target:
                    tag = t
                    break
            pos_tags.append(tag)
        pos_tags = np.array(pos_tags).reshape(len(pos_tags), 1)
        # print(pos_tags)
        
        if not testing:
            enc = OneHotEncoder()
            enc.fit(pos_tags)
            data['syn'] = {
                'pos_enc': enc,
            }
        enc = data['syn']['pos_enc']
        print(enc.categories_)
        
        pos_tags_onehot = enc.transform(pos_tags)
        
        if not testing:
            pca = TruncatedSVD(n_components=3)
            pca.fit(pos_tags_onehot.toarray())
            data["syn"]["pos_pca"] = pca
        pca = data["syn"]["pos_pca"]
        
        pos_tags_enc = pca.transform(pos_tags_onehot.toarray())
        # pos_tags_enc = pos_tags_onehot.toarray()

        ret = np.hstack([ret, pos_tags_enc])
    return ret

X_syn = get_syn(train_df, DATA)
print(X_syn[10])
print(X_syn.shape)

[array(['', 'CC', 'CD', 'DT', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN',
       'NNP', 'NNPS', 'NNS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP',
       'TO', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WP'], dtype='<U4')]
[ 1.00000000e+00  3.00000000e+00  0.00000000e+00  1.00000000e+00
 -3.59690910e-16 -1.16335119e-16]
(14002, 6)


In [12]:
def get_emb(df, data, window_size=20, use_tfidf=True, testing=False):
    def preprocess_text(text):
        return nltk.word_tokenize(text)
    
    if not testing:
        dataset = [preprocess_text(text) 
                   for text in np.unique(df['text'])]
        dct = gensim.corpora.Dictionary(dataset)
        corpus = [dct.doc2bow(line) for line in dataset]
        tfidf = gensim.models.TfidfModel(corpus) 
        data['emb'] = {
            "tfidf": tfidf,
            "dct": dct,
        }
        
    X_emb = np.zeros((len(df), 600))
    for idx in range(len(df)):
        sub = df['sub'][idx]
        text = df['text'][idx]
        
        l, r = df['l'][idx], df['r'][idx]
        text_window = text[max(0, l - window_size):(r + window_size)]
        text_window = text_window.split(' ', 1)[1]
        text_window = text_window.rsplit(' ', 1)[0]
        
        now_emb = np.zeros((300,))
        cnt = 0
        
        tfidf = data['emb']['tfidf']
        dct = data['emb']['dct']
        
        for token in nltk.word_tokenize(sub):
            try:
                emb = w2v_model[token]
                now_emb += emb
                cnt += 1
            except KeyError:
                #print(f"{token} not found")
                #print(text_window)
                pass
        if cnt > 0:
            now_emb /= cnt
            
        X_emb[idx, :300] = now_emb
        
        now_emb = np.zeros((300,))
        cnt = 0
        
        text_coefs = dict(tfidf[dct.doc2bow(preprocess_text(text))])
        for token in nltk.word_tokenize(text_window):
#             if token in sub:
#                 continue     
            if not use_tfidf:
                coef = 1.
            else:
                if token in dct.token2id:
                    token_id = dct.token2id[token]
                    if token_id not in text_coefs:
                        print(f"Skipping word: {token}")
                        print(text_window)
                        continue
                    coef = text_coefs[token_id]
                else:
                    continue
            try:
                emb = w2v_model[token]
                now_emb += emb * coef
                cnt += 1
            except KeyError:
                pass
        if cnt > 0:
            now_emb /= cnt
        
        X_emb[idx, 300:] = now_emb
    return X_emb

X_emb = get_emb(train_df, DATA)
print(X_emb[5])
print(X_emb.shape)

Skipping word: St
attempt to hold an unsanctioned rally in St.
Skipping word: St
of Russians who rallied in Moscow and St.
Skipping word: a.m
murders happened around 4:30 a.m.
Skipping word: Mr
former President Bill Clinton touted Mr.
Skipping word: a.m
neighbor heard shots at 4:50 a.m.
Skipping word: a.m
a text from Thomas at about 3 a.m.
Skipping word: St
of Russians have rallied in Moscow and St.
Skipping word: Co
companies such as Samsung Electronics Co.
Skipping word: Mr
arriving, about midnight local time, Mr.
Skipping word: St
of Russians rallied in Moscow and St.
Skipping word: a.m
home to mostly foreigners at about 6:15 a.m.
Skipping word: a.m
been hitting the town since three a.m.
Skipping word: said
that killed Osama bin Laden,'' he said.
Skipping word: .
that killed Osama bin Laden,'' he said.
Skipping word: said
killed Osama bin Laden,'' he said.
Skipping word: .
killed Osama bin Laden,'' he said.
Skipping word: Ms
reported having heard gun shots from Ms.
Skipping word: Ms

In [21]:
def get_grams(df, data, testing=False):

    def preprocess_sub(sub):
        sub = sub.lower()
        tokenized = nltk.word_tokenize(sub)
        ngrams = []
        for token in tokenized:
            ngrams.extend(nltk.everygrams(token, 3, 4))
        ngrams = " ".join(["".join(ngram) for ngram in ngrams])
        return ngrams
    
    corpus = list(map(preprocess_sub, df["sub"].values))
    
    if not testing:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(corpus)
        data['grams'] = {
            'vectorizer': vectorizer,
        }
    vectorizer = data['grams']['vectorizer']
    
    X_grams = vectorizer.transform(corpus)
    
    if not testing:
        pca = TruncatedSVD(n_components=100)
        pca.fit(X_grams)
        data['grams']["pca"] = pca
    pca = data['grams']['pca']
    
    X_grams = pca.transform(X_grams)
    
    return X_grams

X_grams = get_grams(train_df, DATA)
print(X_grams[5])
print(X_grams.shape)

[ 4.20994460e-03  2.41875527e-03  1.45912071e-03  1.18640497e-03
  6.93149702e-03  1.28309318e-02 -9.50120082e-04  3.27232859e-04
  2.24984707e-03 -1.97380126e-04  7.22735488e-04  2.46352006e-03
  6.13570363e-03 -1.00471580e-03  2.93746752e-02 -2.31674008e-03
  5.45249728e-03 -2.28555090e-02  1.26885347e-02 -2.02274460e-02
 -5.75864206e-03 -5.84090412e-02  4.14391021e-02  2.16054999e-02
  9.66581042e-03 -2.13307335e-03 -1.82379976e-03 -4.98562694e-03
  4.37075790e-03  3.39416911e-03 -4.04889637e-03 -4.15266584e-03
  8.87028033e-04 -4.05616019e-03 -5.01468341e-03 -2.27706021e-03
 -2.17869112e-03  4.19123843e-03  2.00424557e-03  1.05889863e-03
  2.18452243e-03  9.61164113e-04 -4.24813612e-03  1.12775108e-03
  4.84566365e-03  5.53398615e-03  8.86778526e-03 -6.98947963e-03
  6.45702942e-03  9.05866736e-04 -5.68522997e-03 -4.67949041e-03
  5.61149084e-03 -2.96899019e-03 -7.30236172e-03 -1.00448028e-03
  3.48220088e-03 -1.35930277e-03  2.32022880e-03 -8.27001653e-03
 -5.77110950e-03  8.83283

In [22]:
def scale(X, data, testing=False):
    if not testing:
        scaler = StandardScaler()
        scaler.fit(X)
        data["scaler"] = scaler
    scaler = data["scaler"]
    
    return scaler.transform(X)

In [23]:
X = np.hstack([X_emb, X_grams])
# X = scale(X, DATA)
y = train_df["p"].values
# idx = (y > 0)
# X = X[idx]
# y = y[idx]
# print(idx)
print(X.shape)
print(y.shape)

(14002, 700)
(14002,)


In [24]:
def loss(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [25]:
def abs_loss(y_true, y_pred):
    ret = tf.math.abs(y_true - y_pred)
    return tf.math.maximum(ret, 0)

In [62]:
def get_model(syn_dim, emb_dim, lstm_shape):
    syn_i = tfkl.Input(shape=(syn_dim,), name='syn_i')
    emb_i = tfkl.Input(shape=(emb_dim,), name='emb_i')
    lstm_i = tfkl.Input(shape=lstm_shape, name='lstm_i')
    
    syn_o = syn_i
    emb_o = tfkl.Dropout(0.7, name='emb_dropout')(emb_i)
    lstm_o = tfkl.Bidirectional(tfkl.LSTM(64), name='bilstm')(lstm_i)
    
    x = tfkl.Concatenate()([syn_o, emb_o, lstm_o])
    x = tfkl.Dense(300, activation='relu')(x)
    pred = tfkl.Dense(1)(x)
    
    return tfk.Model([syn_i, emb_i, lstm_i], pred)

In [63]:
def get_lstm_repr(df, data, training=False):
    if not training:
        char_enc = LabelEncoder()
        chars = list(set([c for s in df["sub"].values for c in s.lower()]))
        char_enc.fit(chars)
        data["lstm"] = {
            "char_enc": char_enc,
        }
    char_enc = data["lstm"]["char_enc"]
    
    X_lstm = np.zeros((len(df), 50, len(char_enc.classes_)))
    for idx, s in enumerate(df["sub"].values):
        s = s.lower()
        while len(s) + 1 < 50:
            s = " " + s + " "
        if len(s) < 50:
            s = s + " "
            
        chars = np.array(list(s.lower()))
        enc = char_enc.transform(chars)
        for jdx, x in enumerate(enc):
            X_lstm[idx][jdx][x] = 1
            
    return X_lstm
        
X_lstm = get_lstm_repr(train_df, DATA)
print(X_lstm.shape)

(14002, 50, 41)


In [None]:
model = get_model(X_syn.shape[1], X_emb.shape[1], X_lstm.shape[1:])
optimizer = tfk.optimizers.Adam(learning_rate=1e-3)
model.compile(loss=abs_loss, optimizer=optimizer)

print(model.summary())

reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=np.sqrt(0.1),
    patience=5, 
    min_lr=1e-5,
    verbose=1
)

model.fit(
    [X_syn, X_emb, X_lstm], y, 
    batch_size=32,
    epochs=1000,
    validation_split=0.2,
    callbacks=[reduce_lr],
)





Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
emb_i (InputLayer)              [(None, 600)]        0                                            
__________________________________________________________________________________________________
lstm_i (InputLayer)             [(None, 50, 41)]     0                                            
__________________________________________________________________________________________________
syn_i (InputLayer)              [(None, 6)]          0                                            
__________________________________________________________________________________________________
emb_dropout (Dropout)           (None, 600)          0           emb_i[0][0]                      
____________________________________________________________________________________________







Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000

In [None]:
test_df = read_df('data/test.txt', COL_NAMES[:-3])
test_df.describe()

In [None]:
test_df.head()

In [None]:
X_test = np.hstack([
    get_syn(test_df, DATA, testing=True), 
    get_emb(test_df, DATA, testing=True), 
    get_grams(test_df, DATA, testing=True),
])
print(X_test.shape)

In [None]:
predictions = model.predict(X_test).ravel()
predictions = np.clip(predictions, 0, 1)
print(predictions.shape)
print(predictions[:5])

In [None]:
with open('submission.txt', 'w') as file:
    lines = ["id,label"]
    for idx, pred in zip(test_df["idx"], predictions):
        lines.append(f"{idx},{pred:.1f}")
    file.write('\n'.join(lines))