In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import nltk
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR, SVC
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import gensim
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import gensim.downloader as api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [55]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
api.load('text8')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/roboself/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/roboself/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


<text8.Dataset at 0x1556f6910>

In [56]:
def read_df(filename, names):
    df = pd.read_csv(filename, sep='\t', header=None, names=names)
    df["s_sub_token_len"] = [len(s.split()) for s in df["sub"]]
    df["s_sub_char_len"] = [len(s) for s in df["sub"]]
    df["s_sub_mean_word_len"] = df["s_sub_char_len"] / df["s_sub_token_len"]
    df["s_capitalized"] = [len([c for c in s if c.isupper()]) for s in df["sub"]]
    return df

In [57]:
COL_NAMES = ["idx", "text", "l", "r", "sub", "n1", "n2", "c1", "c2", "p"]
df = read_df('data/train_full.txt', COL_NAMES)
df.describe()

Unnamed: 0,idx,l,r,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_sub_mean_word_len,s_capitalized
count,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0
mean,7001.5,83.727753,92.100486,10.0,10.0,0.902014,0.860591,0.08813,1.220968,8.372732,6.816589,0.282745
std,4042.17357,66.602408,66.819266,0.0,0.0,1.949611,1.894848,0.181183,0.630302,5.086451,2.225215,0.577635
min,1.0,0.0,2.0,10.0,10.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0
25%,3501.25,32.0,40.0,10.0,10.0,0.0,0.0,0.0,1.0,5.0,5.0,0.0
50%,7001.5,71.0,79.0,10.0,10.0,0.0,0.0,0.0,1.0,7.0,7.0,0.0
75%,10501.75,120.0,129.0,10.0,10.0,1.0,1.0,0.1,1.0,9.0,8.0,0.0
max,14002.0,647.0,656.0,10.0,10.0,10.0,10.0,1.0,11.0,49.0,21.0,9.0


In [58]:
df.sample(50)

Unnamed: 0,idx,text,l,r,sub,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_sub_mean_word_len,s_capitalized
7921,7922,His speech came as an already difficult relati...,77,85,strained,10,10,6,1,0.35,1,8,8.0,0
422,423,"In northern Lebanon, meanwhile, residents said...",119,125,Sunday,10,10,0,0,0.0,1,6,6.0,1
4133,4134,The Taliban claimed responsibility for the ass...,58,65,heavily,10,10,0,0,0.0,1,7,7.0,0
4227,4228,While the government has taken steps to suppor...,21,36,has taken steps,10,10,0,1,0.05,3,15,5.0,0
6909,6910,Even a majority of Republicans hold a negative...,59,67,conflict,10,10,2,4,0.3,1,8,8.0,0
4174,4175,The attack on Rastan came after Syrian forces ...,142,147,homes,10,10,0,0,0.0,1,5,5.0,0
4664,4665,Israel's outgoing ambassador to Egypt arrived ...,155,161,forced,10,10,0,0,0.0,1,6,6.0,0
1440,1441,Regulators in the US and elsewhere have stress...,198,214,unfair advantage,10,10,2,2,0.2,2,16,8.0,0
9113,9114,U.S. military's future in Afghanistan Issued t...,231,240,increased,10,10,0,0,0.0,1,9,9.0,0
13956,13957,This trip to Afghanistan is an attempt to shor...,176,183,defense,10,10,0,0,0.0,1,7,7.0,0


In [59]:
#w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
#    './GoogleNews-vectors-negative300.bin', binary=True) 
w2v_model = api.load("glove-wiki-gigaword-100")
W2V_EMB_SIZE = 100
print("Loaded model!")

Loaded model!


In [82]:
df_train, df_val = train_test_split(df, test_size=0.1, shuffle=False)

In [83]:
POS_TAG_CACHE = {}

def pos_tag(text):
    global POS_TAG_CACHE
    if text not in POS_TAG_CACHE:
        POS_TAG_CACHE[text] = nltk.pos_tag(
            nltk.word_tokenize(text),
            tagset='universal',
        )
    return POS_TAG_CACHE[text]

In [84]:
DATA = {}

In [85]:
def get_syn(df, data, include_pos=True, testing=False):
    cols = []
    for col in df.columns:
        if col.startswith("s_"):
            cols.append(df[col].values)
    ret = np.column_stack(cols)
    
    if include_pos:
        pos_tags = []
        for idx in range(len(df)):
            
            tag = ""
            tokens = nltk.word_tokenize(df['sub'].values[idx])
            if len(tokens) == 1:
                target = tokens[0]
                for w, t in pos_tag(df['text'].values[idx]):
                    if w == target:
                        tag = t
                        break
                if tag != "NOUN" and tag != "ADJ" and tag != "VERB":
                    tag = ""
            pos_tags.append(tag)
        
        pos_tags = np.array(pos_tags).reshape(len(pos_tags), 1)
        unique, counts = np.unique(pos_tags, return_counts=True)
        print(list(zip(unique, counts)))
        # print(pos_tags)
        
        if not testing:
            enc = OneHotEncoder()
            enc.fit(pos_tags)
            data['syn'] = {
                'pos_enc': enc,
            }
        enc = data['syn']['pos_enc']
        print(enc.categories_)
        
        pos_tags_onehot = enc.transform(pos_tags)
        # pos_tags_enc = pos_tags_onehot.toarray()

        ret = np.hstack([ret, pos_tags_onehot.toarray()])
    return ret

X_syn_train = get_syn(df_train, DATA)
X_syn_val = get_syn(df_val, DATA, testing=True)
print(X_syn_train[10])
print(X_syn_train.shape)

[('', 2175), ('ADJ', 1192), ('NOUN', 7129), ('VERB', 2105)]
[array(['', 'ADJ', 'NOUN', 'VERB'], dtype='<U4')]
[('', 246), ('ADJ', 146), ('NOUN', 781), ('VERB', 228)]
[array(['', 'ADJ', 'NOUN', 'VERB'], dtype='<U4')]
[1. 3. 3. 0. 0. 0. 1. 0.]
(12601, 8)


In [108]:
def get_phon(df, data, testing=False):
    arpabet = nltk.corpus.cmudict.dict()
    phones = []
    for idx, sub in enumerate(df["sub"].values):
        phs = []
        for word in nltk.word_tokenize(sub.replace('-', ' ')):
            p = arpabet.get(word.lower())
            if not p:
                print("SKIPPING: ", word)
                continue
            phs.extend(p[0])
        phones.append(" ".join(phs))
    
    if not testing:
        tfidf = TfidfVectorizer().fit(phones)
        data['phon'] = {
            'tfidf': tfidf   
        }
    
    tfidf = data['phon']['tfidf']
    X_phon = tfidf.transform(phones)
    
    if not testing:
        pca = TruncatedSVD(n_components=15)
        pca.fit(X_phon)
        data['phon']['pca'] = pca
    pca = data['phon']['pca']
    
    X_phon = pca.transform(X_phon)
    return X_phon
        
X_phon_train = get_phon(df_train, DATA)
X_phon_val = get_phon(df_val, DATA, testing=True)
print(X_phon_train[5])
print(X_phon_train.shape)
print(X_phon_val.shape)

SKIPPING:  outcrops
SKIPPING:  outcrops
SKIPPING:  shoal
SKIPPING:  euros
SKIPPING:  n't
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  Bankia
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  Cloverhill
SKIPPING:  Tallaght
SKIPPING:  Inchicore
SKIPPING:  Gardai
SKIPPING:  Ballyfermot
SKIPPING:  Hayaleen
SKIPPING:  gunbattles
SKIPPING:  Alawite
SKIPPING:  Alawite
SKIPPING:  “
SKIPPING:  “
SKIPPING:  Twitter
SKIPPING:  Karzai
SKIPPING:  ,
SKIPPING:  've
SKIPPING:  're
SKIPPING:  Qaeda
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Qaeda
SKIPPING:  Qaeda
SKIPPING:  UK
SKIPPING:  GMT
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  neighbouring
SKIPPING:  Jazeera
SKIPPING:  armoured
SKIPPING:  armoured
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  NNA
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Hajar
SKIPPING:  Daraa
SKIPPING:  ,
SKIPPING:  Idleb
SKIPPING:  Daraa
SKIPPING:  Idleb
SKIPPING:  n't
SKIPPING:  LCC
SK

SKIPPING:  Putin
SKIPPING:  traumatised
SKIPPING:  traumatised
SKIPPING:  ambassadoe
SKIPPING:  ambassadoe
SKIPPING:  've
SKIPPING:  Qaida
SKIPPING:  U.S.
SKIPPING:  Qaeda
SKIPPING:  ’
SKIPPING:  Barack
SKIPPING:  Bagram
SKIPPING:  Bagram
SKIPPING:  Jaxs
SKIPPING:  Jazlin
SKIPPING:  Jaxs
SKIPPING:  Jaxs
SKIPPING:  Keqing
SKIPPING:  Shoal
SKIPPING:  Google
SKIPPING:  AFP
SKIPPING:  Google
SKIPPING:  smartphone
SKIPPING:  Google
SKIPPING:  Google
SKIPPING:  Fla
SKIPPING:  WFTV
SKIPPING:  Jazlin
SKIPPING:  Jaxs
SKIPPING:  WFTV
SKIPPING:  Fla
SKIPPING:  WKMG
SKIPPING:  euros
SKIPPING:  Guindos
SKIPPING:  Bankia
SKIPPING:  Bagram
SKIPPING:  AFP
SKIPPING:  eurozone
SKIPPING:  EU
SKIPPING:  EU
SKIPPING:  eurozone
SKIPPING:  EU
SKIPPING:  Olli
SKIPPING:  Olli
SKIPPING:  AP
SKIPPING:  Barack
SKIPPING:  Levanon
SKIPPING:  Levanon
SKIPPING:  EU
SKIPPING:  EU
SKIPPING:  EU
SKIPPING:  Gunbattles
SKIPPING:  EU
SKIPPING:  EU
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  Deir
SKIPPING:  Ezzor
SKIPPIN

In [87]:
def get_emb(df, data, window_size=20, vector_size=16, use_tfidf=True, testing=False):
    def preprocess_text(text):
        return nltk.word_tokenize(text.lower())
    
    if not testing:
        print("Loading dataset")
        dataset = list(api.load("text8")) + [preprocess_text(text) 
                   for text in np.unique(df['text'])] 
        print("Dataset loaded")
        dct = gensim.corpora.Dictionary(dataset)
        corpus = [dct.doc2bow(line) for line in dataset]
        tfidf = gensim.models.TfidfModel(corpus) 
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset)]
        print("Training Doc2Vec")
        d2v_model = Doc2Vec(documents, vector_size=vector_size, window=2, min_count=1, workers=4)
        print("DONE")
        data['emb'] = {
            "tfidf": tfidf,
            "dct": dct,
            "d2v": d2v_model,
        }
    emb_size = W2V_EMB_SIZE
    
    X_emb = np.zeros((len(df), 2 * emb_size + vector_size))
    for idx in range(len(df)):
        sub = df['sub'].values[idx]
        text = df['text'].values[idx]
        
        l, r = df['l'].values[idx], df['r'].values[idx]
        text_window = text[max(0, l - window_size):(r + window_size)]
        text_window = text_window.split(' ', 1)[1]
        text_window = text_window.rsplit(' ', 1)[0]
        
        now_emb = np.zeros((emb_size,))
        cnt = 0
        
        tfidf = data['emb']['tfidf']
        dct = data['emb']['dct']
        d2v_model = data['emb']['d2v']
        
        for token in nltk.word_tokenize(sub):
            try:
                emb = w2v_model[token]
                now_emb += emb
                cnt += 1
            except KeyError:
                #print(f"{token} not found")
                #print(text_window)
                pass
        if cnt > 0:
            now_emb /= cnt
            
        X_emb[idx, :emb_size] = now_emb
        
        now_emb = np.zeros((emb_size,))
        cnt = 0
        
        tokens = preprocess_text(text)
        text_coefs = dict(tfidf[dct.doc2bow(tokens)])
        X_emb[idx, -vector_size:] = d2v_model.infer_vector(tokens)
        
        for token in nltk.word_tokenize(text_window):
#             if token in sub:
#                 continue     
            if not use_tfidf:
                coef = 1.
            else:
                if token in dct.token2id:
                    token_id = dct.token2id[token]
                    if token_id not in text_coefs:
                        print(f"Skipping word: {token}")
                        print(text_window)
                        continue
                    coef = text_coefs[token_id]
                else:
                    continue
            try:
                emb = w2v_model[token]
                now_emb += emb * coef
                cnt += 1
            except KeyError:
                pass
        if cnt > 0:
            now_emb /= cnt
        
        X_emb[idx, emb_size:2*emb_size] = now_emb
    
    return X_emb

X_emb_train = get_emb(df_train, DATA)
X_emb_val = get_emb(df_val, DATA, testing=True)
print(X_emb_train[5])
print(X_emb_train.shape)
print(X_emb_val.shape)

Loading dataset
Dataset loaded
Training Doc2Vec
DONE
Skipping word: a.m
murders happened around 4:30 a.m.
Skipping word: a.m
neighbor heard shots at 4:50 a.m.
Skipping word: a.m
a text from Thomas at about 3 a.m.
Skipping word: a.m
home to mostly foreigners at about 6:15 a.m.
Skipping word: a.m
been hitting the town since three a.m.
Skipping word: said
that killed Osama bin Laden,'' he said.
Skipping word: .
that killed Osama bin Laden,'' he said.
Skipping word: said
killed Osama bin Laden,'' he said.
Skipping word: .
killed Osama bin Laden,'' he said.
Skipping word: a.m
message to her neighbor around 3 a.m.
[-9.35790002e-01  4.37780008e-01 -4.34281987e-02  1.74949914e-02
 -7.34104998e-02  7.34750181e-03 -5.98880008e-01  4.17229995e-01
 -3.14904988e-01  1.24536999e-01  4.04024988e-01  5.85644990e-01
  4.58794996e-01  1.49545498e-01  6.66410021e-01 -1.33249983e-02
  6.19604990e-01 -6.49274990e-01  3.49040002e-01  4.01990011e-01
  1.40910000e-02  2.07582496e-01  1.79164995e-01  5.1995009

In [88]:
def get_grams(df, data, testing=False):

    def preprocess_sub(sub):
        sub = sub.lower()
        tokenized = nltk.word_tokenize(sub)
        ngrams = []
        for token in tokenized:
            ngrams.extend(nltk.everygrams(token, 3, 4))
        ngrams = " ".join(["".join(ngram) for ngram in ngrams])
        return ngrams
    
    corpus = list(map(preprocess_sub, df["sub"].values))
    
    if not testing:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(corpus)
        data['grams'] = {
            'vectorizer': vectorizer,
        }
    vectorizer = data['grams']['vectorizer']
    
    X_grams = vectorizer.transform(corpus)
    
    if not testing:
        pca = TruncatedSVD(n_components=100)
        pca.fit(X_grams)
        data['grams']["pca"] = pca
    pca = data['grams']['pca']
    
    X_grams = pca.transform(X_grams)
    
    return X_grams

X_grams_train = get_grams(df_train, DATA)
X_grams_val = get_grams(df_val, DATA, testing=True)
print(X_grams_train[5])
print(X_grams_train.shape)
print(X_grams_val.shape)

[ 0.00489803 -0.00043715  0.00333822  0.00147761  0.00691381  0.01425793
 -0.00134855 -0.00022967  0.00058849  0.00267774  0.00026387  0.00391977
 -0.00162324  0.03954918 -0.00781259 -0.02051351 -0.00030832 -0.01968504
 -0.00118979  0.06359536  0.02754606 -0.01616442  0.00779618  0.01197732
 -0.00783731  0.01139048 -0.00472575  0.00146781  0.00214375  0.00157406
  0.00051824 -0.00086279  0.00981891  0.00290875 -0.00437111 -0.00338223
 -0.0007209  -0.00093276 -0.00100657  0.00759088  0.00617698  0.00014938
  0.00225338  0.0011104   0.00429761 -0.0093427   0.00434835  0.0056751
  0.01270168  0.00029909 -0.003898    0.0018445   0.00298269 -0.00061546
  0.00344711 -0.00455679  0.01002158 -0.00609832  0.00067433  0.0069031
  0.00393161 -0.00713831 -0.0005772  -0.00144176  0.00962234 -0.00525464
  0.00103503 -0.00223094 -0.00205426 -0.00157509  0.00178215  0.00323537
 -0.00219188 -0.00689994 -0.00127121  0.001787   -0.00684304  0.00757854
 -0.00236467 -0.01011315  0.00190429 -0.00111932  0.0

In [89]:
import re 

def get_lstm_repr(df, data, size=15, testing=False):
    if not testing:
        char_enc = LabelEncoder()
        chars = list(set([c for s in df["sub"].values for c in s.lower()]))
        chars = [c for c in chars if c.isalpha()] + [' ']
        print(chars)
        char_enc.fit(chars)
        data["lstm"] = {
            "char_enc": char_enc,
        }
    char_enc = data["lstm"]["char_enc"]
    
    X_lstm = np.zeros((len(df), size, len(char_enc.classes_)))
    for idx, s in enumerate(df["sub"].values):
        s = s.lower()
        s = re.sub('[^a-zA-Z]+', ' ', s)
        if len(s) > size:
            s = s[:size//2] + ' ' + s[-size//2:]
        s = s[:size]
        while len(s) + 1 < size:
            s = " " + s + " "
        if len(s) < size:
            s = s + " "
            
        chars = np.array(list(s.lower()))
        enc = char_enc.transform(chars)
        for jdx, x in enumerate(enc):
            X_lstm[idx][jdx][x] = 1
            
    return X_lstm

X_lstm_train = get_lstm_repr(df_train, DATA)
X_lstm_val = get_lstm_repr(df_val, DATA, testing=True)
print(X_lstm_train.shape)
print(X_lstm_val.shape)

['g', 'u', 'd', 'o', 'í', 'j', 'a', 't', 'w', 'k', 'm', 'p', 'x', 'q', 'y', 's', 'n', 'f', 'b', 'e', 'r', 'v', 'i', 'h', 'l', 'z', 'c', ' ']
(12601, 15, 28)
(1401, 15, 28)


In [90]:
y_train = df_train["p"].values
y_val = df_val["p"].values

In [109]:
def scale(X_syn, X_emb, X_grams, data, testing=False):
    if not testing:
        scaler_syn = StandardScaler()
        scaler_syn.fit(X_syn)
        scaler_emb = StandardScaler()
        scaler_emb.fit(X_emb)
        scaler_grams = StandardScaler()
        scaler_grams.fit(X_grams)
        data["scaler"] = {
            "syn": scaler_syn,
            "emb": scaler_emb,
            "grams": scaler_grams,
        }
    scaler_syn = data["scaler"]["syn"]
    X_syn = scaler_syn.transform(X_syn)
    scaler_emb = data["scaler"]["emb"]
    X_emb = scaler_emb.transform(X_emb)
    scaler_grams = data["scaler"]["grams"]
    X_grams = scaler_grams.transform(X_grams)
    
    return X_syn, X_emb, X_grams

#X_syn_train, X_emb_train, X_grams_train = scale(X_syn_train, X_emb_train, X_grams_train, DATA)
#X_syn_val, X_emb_val, X_grams_val = scale(X_syn_val, X_emb_val, X_grams_val, DATA, testing=True)

In [110]:
def loss(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [111]:
def abs_loss(y_true, y_pred):
    y_pred = tf.clip_by_value(y_pred, 0, 1)
    return tf.math.abs(y_true - y_pred)

In [112]:
def get_model(syn_dim, phon_dim,emb_dim, gram_dim, lstm_shape, 
              use_syn=True, use_phon=True, use_emb=True, use_grams=True, use_lstm=True):
    syn_i = tfkl.Input(shape=(syn_dim,), name='syn_i')
    phon_i = tfkl.Input(shape=(phon_dim,), name='phon_i')
    emb_i = tfkl.Input(shape=(emb_dim,), name='emb_i')
    gram_i = tfkl.Input(shape=(gram_dim,), name='gram_i')
    lstm_i = tfkl.Input(shape=lstm_shape, name='lstm_i')
    
    syn_o = syn_i
    phon_o = phon_i
    emb_o = tfkl.Dropout(0.2, name='emb_dropout')(emb_i)
    #emb_o = emb_i
    gram_o = gram_i
    lstm_o = tfkl.Bidirectional(tfkl.LSTM(256), name='bilstm')(lstm_i)
    
    use = []
    if use_syn: use.append(syn_o)
    if use_phon: use.append(phon_o)
    if use_emb: use.append(emb_o)
    if use_grams: use.append(gram_o)
    if use_lstm: use.append(lstm_o)
    
    x = tfkl.Concatenate()(use)
    x = tfkl.Dense(512, activation='relu')(x)
    #x = tfkl.Dropout(0.3)(x)
    pred = tfkl.Dense(1)(x)
    
    return tfk.Model([syn_i, phon_i, emb_i, gram_i, lstm_i], pred)

In [None]:
model = get_model(
    X_syn_train.shape[1], 
    X_phon_train.shape[1],
    X_emb_train.shape[1], 
    X_grams_train.shape[1],
    X_lstm_train.shape[1:],
    #use_lstm=False,
)
optimizer = tfk.optimizers.Adam(learning_rate=1e-4)
model.compile(loss=tfk.losses.MAE, optimizer=optimizer, metrics=[abs_loss])

print(model.summary())

reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=np.sqrt(0.1),
    patience=5, 
    min_lr=1e-6,
    verbose=1
)
model.fit([X_syn_train, X_phon_train, X_emb_train, 
           X_grams_train, X_lstm_train], y_train, 
    batch_size=32,
    epochs=1000,
    validation_data=([X_syn_val, X_phon_val, X_emb_val, 
                      X_grams_val, X_lstm_val], y_val),
    callbacks=[reduce_lr],
)

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
emb_i (InputLayer)              [(None, 216)]        0                                            
__________________________________________________________________________________________________
lstm_i (InputLayer)             [(None, 15, 28)]     0                                            
__________________________________________________________________________________________________
syn_i (InputLayer)              [(None, 8)]          0                                            
__________________________________________________________________________________________________
phon_i (InputLayer)             [(None, 15)]         0                                            
___________________________________________________________________________________________

In [96]:
preds = np.zeros(X_syn_val.shape[0])
preds = model.predict([X_syn_val, X_phon_val, X_emb_val, X_grams_val, X_lstm_val]).ravel()
preds = np.clip(preds, 0, 1)
#preds = np.round(preds, 1)
print(loss(y_val, preds))

0.052389806133198366


In [None]:
#optimizer.learning_rate = optimizer.learning_rate * 10
#model.fit([X_syn, X_emb, X_grams, X_lstm], y, batch_size=32, epochs=20)

In [97]:
test_df = read_df('data/test.txt', COL_NAMES[:-3])
test_df.describe()

Unnamed: 0,idx,l,r,n1,n2,s_sub_token_len,s_sub_char_len,s_sub_mean_word_len,s_capitalized
count,1764.0,1764.0,1764.0,1764.0,1764.0,1764.0,1764.0,1764.0,1764.0
mean,14884.5,81.154762,89.397959,10.0,10.0,1.221088,8.243197,6.714957,0.277211
std,509.367255,57.635872,57.726803,0.0,0.0,0.627312,4.921544,2.078128,0.54593
min,14003.0,0.0,2.0,10.0,10.0,1.0,2.0,2.0,0.0
25%,14443.75,34.0,42.0,10.0,10.0,1.0,5.0,5.0,0.0
50%,14884.5,72.0,82.0,10.0,10.0,1.0,7.0,6.75,0.0
75%,15325.25,119.0,128.0,10.0,10.0,1.0,9.0,8.0,0.0
max,15766.0,272.0,284.0,10.0,10.0,6.0,42.0,15.0,5.0


In [98]:
test_df.head()

Unnamed: 0,idx,text,l,r,sub,n1,n2,s_sub_token_len,s_sub_char_len,s_sub_mean_word_len,s_capitalized
0,14003,Syrian troops shelled a rebel-held town on Mon...,7,13,troops,10,10,1,6,6.0,0
1,14004,Syrian troops shelled a rebel-held town on Mon...,0,6,Syrian,10,10,1,6,6.0,1
2,14005,Syrian troops shelled a rebel-held town on Mon...,14,21,shelled,10,10,1,7,7.0,0
3,14006,Syrian troops shelled a rebel-held town on Mon...,24,34,rebel-held,10,10,1,10,10.0,0
4,14007,Syrian troops shelled a rebel-held town on Mon...,51,59,sparking,10,10,1,8,8.0,0


In [99]:
X_syn_test = get_syn(test_df, DATA, testing=True)
X_phon_test = get_phon(test_df, DATA, testing=True)
X_emb_test = get_emb(test_df, DATA, testing=True)
X_grams_test = get_grams(test_df, DATA, testing=True)
X_lstm_test = get_lstm_repr(test_df, DATA, testing=True)

[('', 304), ('ADJ', 170), ('NOUN', 979), ('VERB', 311)]
[array(['', 'ADJ', 'NOUN', 'VERB'], dtype='<U4')]
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  Rastan
SKIPPING:  NBC
SKIPPING:  NBC
SKIPPING:  WESH
SKIPPING:  Ruqayya
SKIPPING:  Ruqayya
SKIPPING:  U.N.
SKIPPING:  U.N.
SKIPPING:  n't
SKIPPING:  Jaxs
SKIPPING:  Jazlin
SKIPPING:  kilometres
SKIPPING:  n't
SKIPPING:  BBs
SKIPPING:  BBs
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  Rajoy
SKIPPING:  Bankia
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  refocussed
SKIPPING:  refocussed
SKIPPING:  eurozone
SKIPPING:  BBVA
SKIPPING:  euros
SKIPPING:  BBVA
SKIPPING:  BBVA
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  Bankia
SKIPPING:  euros
SKIPPING:  Bankia
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  Facebook
SKIPPING:  Kolosvetov
SKIPPING:  U.S.
SKIPPING:  Karzai
SKIPPING:  Qaida
SKIPPING:  Adem
SKIPPING:  Ozkose
SKIPPING:  Hamit
SKIPPING:  Coskun
SKIPPING:  Ozkose
SKIPPING:  Coskun
SKIPPING:  Google
SKIP

In [None]:
mask_test = gate_clf.predict(np.hstack([X_syn_test, X_emb_test, X_grams_test])) == 1
print(mask_test)

In [102]:
y_pred_test = np.zeros(X_syn_test.shape[0])
y_pred_test = model.predict([X_syn_test, X_phon_test, X_emb_test, 
                             X_grams_test, X_lstm_test]).ravel()
y_pred_test = np.clip(y_pred_test, 0, 1)

print(y_pred_test.shape)
print(y_pred_test[:5])

(1764,)
[0.00240639 0.         0.64978576 0.38083166 0.4716571 ]


In [104]:
with open('new_submission.txt', 'w') as file:
    lines = ["id,label"]
    for idx, pred in zip(test_df["idx"], y_pred_test):
        lines.append(f"{idx},{pred}")
    file.write('\n'.join(lines))

In [105]:
def compare(f1, f2):
    preds1 = []
    preds2 = []
    with open(f1, 'r') as l1:
        for line in list(l1)[1:]:
            preds1.append(float(line.split(',')[1]))
    with open(f2, 'r') as l2:
        for line in list(l2)[1:]:
            preds2.append(float(line.split(',')[1]))
    
    diff = 0.
    for idx in range(len(preds1)):
        diff += abs(preds1[idx] - preds2[idx])
    return diff / len(preds1)

In [107]:
compare('new_submission.txt', 'submission_prob_050.txt')

0.038867770108239136