In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR, SVC
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import gensim
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [38]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('cmudict')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/roboself/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/roboself/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/roboself/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [41]:
def read_df(filename, names):
    df = pd.read_csv(filename, sep='\t', header=None, names=names)
    df["s_sub_token_len"] = [len(s.split()) for s in df["sub"]]
    df["s_sub_char_len"] = [len(s) for s in df["sub"]]
    df["s_capitalized"] = [len([c for c in s if c.isupper()]) for s in df['sub']]
    return df

In [42]:
COL_NAMES = ["idx", "text", "l", "r", "sub", "n1", "n2", "c1", "c2", "p"]
df = read_df('data/train_full.txt', COL_NAMES)
df.describe()

Unnamed: 0,idx,l,r,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_capitalized
count,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0,14002.0
mean,7001.5,83.727753,92.100486,10.0,10.0,0.902014,0.860591,0.08813,1.220968,8.372732,0.282745
std,4042.17357,66.602408,66.819266,0.0,0.0,1.949611,1.894848,0.181183,0.630302,5.086451,0.577635
min,1.0,0.0,2.0,10.0,10.0,0.0,0.0,0.0,1.0,2.0,0.0
25%,3501.25,32.0,40.0,10.0,10.0,0.0,0.0,0.0,1.0,5.0,0.0
50%,7001.5,71.0,79.0,10.0,10.0,0.0,0.0,0.0,1.0,7.0,0.0
75%,10501.75,120.0,129.0,10.0,10.0,1.0,1.0,0.1,1.0,9.0,0.0
max,14002.0,647.0,656.0,10.0,10.0,10.0,10.0,1.0,11.0,49.0,9.0


In [43]:
df.sample(50)

Unnamed: 0,idx,text,l,r,sub,n1,n2,c1,c2,p,s_sub_token_len,s_sub_char_len,s_capitalized
11556,11557,The Taliban said it was in response to Obama's...,64,73,strategic,10,10,1,3,0.2,1,9,0
11551,11552,The Taliban claimed responsibility for the att...,196,206,passers-by,10,10,0,0,0.0,1,10,0
4887,4888,The incident followed the killing in August of...,106,115,militants,10,10,6,2,0.4,1,9,0
6500,6501,"The other children -- Joel Johnson, 12, Jazlin...",105,111,Thomas,10,10,0,0,0.0,1,6,1
12380,12381,"PRESIDENT Barack Obama, speaking to a US telev...",78,82,Base,10,10,0,0,0.0,1,4,1
5328,5329,Rastan has in the past been a major source of ...,0,6,Rastan,10,10,0,1,0.05,1,6,1
2241,2242,Officials and witnesses said a suicide car bom...,72,81,disguised,10,10,5,9,0.7,1,9,0
1177,1178,"Another man and a woman, both aged 28, were ar...",8,11,man,10,10,0,0,0.0,1,3,0
2309,2310,Part of the plan includes the deployment in fl...,75,86,UN military,10,10,0,0,0.0,2,11,2
9742,9743,He credited the Afghan security forces for q...,205,212,members,10,10,0,0,0.0,1,7,0


In [36]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin', binary=True) 
print("Loaded model!")

Loaded model!


In [44]:
df_train, df_val = train_test_split(df, test_size=0.2, shuffle=False)

In [45]:
POS_TAG_CACHE = {}

def pos_tag(text):
    global POS_TAG_CACHE
    if text not in POS_TAG_CACHE:
        POS_TAG_CACHE[text] = nltk.pos_tag(
            nltk.word_tokenize(text),
            tagset='universal',
        )
    return POS_TAG_CACHE[text]

In [46]:
DATA = {}

In [47]:
def get_syn(df, data, include_pos=True, testing=False):
    cols = []
    for col in df.columns:
        if col.startswith("s_"):
            cols.append(df[col].values)
    ret = np.column_stack(cols)
    
    if include_pos:
        pos_tags = []
        for idx in range(len(df)):
            target = nltk.word_tokenize(df['sub'].values[idx])[0]
            tag = ""
            for w, t in pos_tag(df['text'].values[idx]):
                if w == target:
                    tag = t
                    break
            pos_tags.append(tag)
        pos_tags = np.array(pos_tags).reshape(len(pos_tags), 1)
        # print(pos_tags)
        
        if not testing:
            enc = OneHotEncoder()
            enc.fit(pos_tags)
            data['syn'] = {
                'pos_enc': enc,
            }
        enc = data['syn']['pos_enc']
        print(enc.categories_)
        
        pos_tags_onehot = enc.transform(pos_tags)
        
        if not testing:
            pca = TruncatedSVD(n_components=3)
            pca.fit(pos_tags_onehot.toarray())
            data["syn"]["pos_pca"] = pca
        pca = data["syn"]["pos_pca"]
        
        pos_tags_enc = pca.transform(pos_tags_onehot.toarray())
        # pos_tags_enc = pos_tags_onehot.toarray()

        ret = np.hstack([ret, pos_tags_enc])
    return ret

X_syn_train = get_syn(df_train, DATA)
X_syn_val = get_syn(df_val, DATA, testing=True)
print(X_syn_train[10])
print(X_syn_train.shape)

[array(['', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON',
       'PRT', 'VERB', 'X'], dtype='<U4')]
[array(['', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON',
       'PRT', 'VERB', 'X'], dtype='<U4')]
[ 1.00000000e+00  3.00000000e+00  0.00000000e+00  1.00000000e+00
 -5.76321607e-18 -1.11022302e-16]
(11201, 6)


In [86]:
def get_phon(df, data, testing=False):
    arpabet = nltk.corpus.cmudict.dict()
    phones = []
    for idx, sub in enumerate(df["sub"].values):
        phs = []
        for word in nltk.word_tokenize(sub):
            p = arpabet.get(word.lower())
            if not p:
                print("SKIPPING: ", word)
                continue
            phs.extend(p[0])
        phones.append(" ".join(phs))
    
    if not testing:
        tfidf = TfidfVectorizer(max_features=15).fit(phones)
        data['phon'] = {
            'tfidf': tfidf   
        }
    
    tfidf = data['phon']['tfidf']
    X_phon = tfidf.transform(phones).todense()
    return X_phon
        
X_phon_train = get_phon(df_train, DATA)
X_phon_val = get_phon(df_val, DATA, testing=True)
print(X_phon_train[5])
print(X_phon_train.shape)
print(X_phon_val.shape)

SKIPPING:  outcrops
SKIPPING:  outcrops
SKIPPING:  shoal
SKIPPING:  euros
SKIPPING:  n't
SKIPPING:  London-based
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  euros
SKIPPING:  Bankia
SKIPPING:  long-time
SKIPPING:  Rastan
SKIPPING:  Britain-based
SKIPPING:  Britain-based
SKIPPING:  rebel-held
SKIPPING:  rebel-held
SKIPPING:  Rastan
SKIPPING:  al-Assad
SKIPPING:  al-Assad
SKIPPING:  Cloverhill
SKIPPING:  Tallaght
SKIPPING:  Inchicore
SKIPPING:  Gardai
SKIPPING:  Ballyfermot
SKIPPING:  Ex-Soviet
SKIPPING:  Hayaleen
SKIPPING:  gunbattles
SKIPPING:  Alawite
SKIPPING:  Alawite
SKIPPING:  “
SKIPPING:  “
SKIPPING:  Twitter
SKIPPING:  Karzai
SKIPPING:  ,
SKIPPING:  've
SKIPPING:  're
SKIPPING:  al-Qaeda
SKIPPING:  Putin
SKIPPING:  pro-Putin
SKIPPING:  pro-Putin
SKIPPING:  Putin
SKIPPING:  Putin
SKIPPING:  Afghan-US
SKIPPING:  commander-in-chief
SKIPPING:  election-year
SKIPPING:  Al-Qaeda
SKIPPING:  Al-Qaeda
SKIPPING:  decade-long
SKIPPING:  UK-based
SKIPPING:  GMT
SKIPPING:  Rastan
SKIPPING:  

SKIPPING:  Jazlin
SKIPPING:  Jaxs
SKIPPING:  ex-Soviet
SKIPPING:  Putin
SKIPPING:  two-thirds
SKIPPING:  vote-rigging
SKIPPING:  ballot-box
SKIPPING:  ballot-box
SKIPPING:  ballot-box
SKIPPING:  n't
SKIPPING:  Zabiullah
SKIPPING:  Mujahid
SKIPPING:  Zabiullah
SKIPPING:  Mujahid
SKIPPING:  Karzai
SKIPPING:  Karzai
SKIPPING:  n't
SKIPPING:  Jaxs
SKIPPING:  Jazzlyn
SKIPPING:  n't
SKIPPING:  DFA
SKIPPING:  BRP
SKIPPING:  PF-15
SKIPPING:  Palawan
SKIPPING:  Zhonggou
SKIPPING:  Shoal
SKIPPING:  Haijian
SKIPPING:  Zhonggou
SKIPPING:  Haijian
SKIPPING:  PF-15
SKIPPING:  al-Qaeda
SKIPPING:  war-weary
SKIPPING:  war-weary
SKIPPING:  Post-ABC
SKIPPING:  decade-long
SKIPPING:  Post-ABC
SKIPPING:  two-thirds
SKIPPING:  decade-long
SKIPPING:  Keqing
SKIPPING:  Shoal
SKIPPING:  shoal
SKIPPING:  north-western
SKIPPING:  Zambales
SKIPPING:  shoal
SKIPPING:  Spratly
SKIPPING:  BRP
SKIPPING:  five-year
SKIPPING:  still-troubled
SKIPPING:  still-troubled
SKIPPING:  clean-up
SKIPPING:  clean-up
SKIPPING:  

SKIPPING:  pro-Putin
SKIPPING:  pro-Putin
SKIPPING:  Ex-Soviet
SKIPPING:  part-nationalise
SKIPPING:  part-nationalise
SKIPPING:  Bankia
SKIPPING:  Bankia
SKIPPING:  fourth-largest
SKIPPING:  Jaxs
SKIPPING:  n't
SKIPPING:  Rockledge
SKIPPING:  Fla
SKIPPING:  n't
SKIPPING:  Guindos
SKIPPING:  Jazeera
SKIPPING:  euros
SKIPPING:  decade-long
SKIPPING:  decade-long
SKIPPING:  eurozone
SKIPPING:  eurozone
SKIPPING:  euros
SKIPPING:  clean-ups
SKIPPING:  war-zone
SKIPPING:  war-zone
SKIPPING:  campaign-related
[[0.         0.         0.46783728 0.         0.88381462 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]]
(11201, 15)
(2801, 15)


In [87]:
def get_emb(df, data, window_size=20, use_tfidf=True, testing=False):
    def preprocess_text(text):
        return nltk.word_tokenize(text)
    
    if not testing:
        dataset = [preprocess_text(text) 
                   for text in np.unique(df['text'])]
        dct = gensim.corpora.Dictionary(dataset)
        corpus = [dct.doc2bow(line) for line in dataset]
        tfidf = gensim.models.TfidfModel(corpus) 
        data['emb'] = {
            "tfidf": tfidf,
            "dct": dct,
        }
        
    X_emb = np.zeros((len(df), 600))
    for idx in range(len(df)):
        sub = df['sub'].values[idx]
        text = df['text'].values[idx]
        
        l, r = df['l'].values[idx], df['r'].values[idx]
        text_window = text[max(0, l - window_size):(r + window_size)]
        text_window = text_window.split(' ', 1)[1]
        text_window = text_window.rsplit(' ', 1)[0]
        
        now_emb = np.zeros((300,))
        cnt = 0
        
        tfidf = data['emb']['tfidf']
        dct = data['emb']['dct']
        
        for token in nltk.word_tokenize(sub):
            try:
                emb = w2v_model[token]
                now_emb += emb
                cnt += 1
            except KeyError:
                #print(f"{token} not found")
                #print(text_window)
                pass
        if cnt > 0:
            now_emb /= cnt
            
        X_emb[idx, :300] = now_emb
        
        now_emb = np.zeros((300,))
        cnt = 0
        
        text_coefs = dict(tfidf[dct.doc2bow(preprocess_text(text))])
        for token in nltk.word_tokenize(text_window):
#             if token in sub:
#                 continue     
            if not use_tfidf:
                coef = 1.
            else:
                if token in dct.token2id:
                    token_id = dct.token2id[token]
                    if token_id not in text_coefs:
                        print(f"Skipping word: {token}")
                        print(text_window)
                        continue
                    coef = text_coefs[token_id]
                else:
                    continue
            try:
                emb = w2v_model[token]
                now_emb += emb * coef
                cnt += 1
            except KeyError:
                pass
        if cnt > 0:
            now_emb /= cnt
        
        X_emb[idx, 300:] = now_emb
    return X_emb

X_emb_train = get_emb(df_train, DATA)
X_emb_val = get_emb(df_val, DATA, testing=True)
print(X_emb_train[5])
print(X_emb_train.shape)
print(X_emb_val.shape)

Skipping word: St
attempt to hold an unsanctioned rally in St.
Skipping word: St
of Russians who rallied in Moscow and St.
Skipping word: a.m
murders happened around 4:30 a.m.
Skipping word: Mr
former President Bill Clinton touted Mr.
Skipping word: a.m
neighbor heard shots at 4:50 a.m.
Skipping word: a.m
a text from Thomas at about 3 a.m.
Skipping word: St
of Russians have rallied in Moscow and St.
Skipping word: Co
companies such as Samsung Electronics Co.
Skipping word: Mr
arriving, about midnight local time, Mr.
Skipping word: St
of Russians rallied in Moscow and St.
Skipping word: a.m
home to mostly foreigners at about 6:15 a.m.
Skipping word: a.m
been hitting the town since three a.m.
Skipping word: said
that killed Osama bin Laden,'' he said.
Skipping word: .
that killed Osama bin Laden,'' he said.
Skipping word: said
killed Osama bin Laden,'' he said.
Skipping word: .
killed Osama bin Laden,'' he said.
Skipping word: Ms
reported having heard gun shots from Ms.
Skipping word: Ms

In [88]:
def get_grams(df, data, testing=False):

    def preprocess_sub(sub):
        sub = sub.lower()
        tokenized = nltk.word_tokenize(sub)
        ngrams = []
        for token in tokenized:
            ngrams.extend(nltk.everygrams(token, 3, 4))
        ngrams = " ".join(["".join(ngram) for ngram in ngrams])
        return ngrams
    
    corpus = list(map(preprocess_sub, df["sub"].values))
    
    if not testing:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(corpus)
        data['grams'] = {
            'vectorizer': vectorizer,
        }
    vectorizer = data['grams']['vectorizer']
    
    X_grams = vectorizer.transform(corpus)
    
    if not testing:
        pca = TruncatedSVD(n_components=300)
        pca.fit(X_grams)
        data['grams']["pca"] = pca
    pca = data['grams']['pca']
    
    X_grams = pca.transform(X_grams)
    
    return X_grams

X_grams_train = get_grams(df_train, DATA)
X_grams_val = get_grams(df_val, DATA, testing=True)
print(X_grams_train[5])
print(X_grams_train.shape)
print(X_grams_val.shape)

[ 5.68464175e-03  3.65903864e-04  5.14053923e-04  4.06450197e-03
  5.85791469e-03  1.49643251e-02 -2.44619311e-03  3.20479226e-03
  3.19487528e-03 -7.64507253e-04  2.00284016e-03  4.66683134e-03
 -2.68313051e-03  3.13842289e-02  9.90042090e-03  3.33393338e-03
  3.83956314e-02 -2.73281188e-02 -9.78660702e-03 -3.86733837e-03
  5.18673064e-02  4.14641186e-02  1.18302935e-02  5.55241006e-03
 -5.73246729e-03  1.22192292e-02 -2.90223010e-03  3.13566649e-03
  8.55245669e-04  6.99988262e-04  3.89572345e-04 -5.45310874e-03
 -1.26023194e-03  4.56785985e-03  6.79204436e-04 -2.66888028e-03
 -8.34357879e-04 -2.14457753e-03 -4.56859748e-03  5.01982281e-04
  2.88203247e-03  7.20423536e-03 -4.64491386e-03 -2.39016018e-03
 -2.08460759e-03  9.83078783e-04  2.11373650e-03 -6.66563613e-04
  5.96244790e-03 -4.46345847e-03 -5.22437238e-03  4.89143089e-03
 -4.96537609e-03  1.20619810e-02  5.79279757e-04 -8.08210633e-03
  1.79185149e-03  2.44107417e-03 -9.11098441e-04  2.03612901e-03
  2.98630768e-03 -7.23641

In [89]:
def scale(X, data, testing=False):
    if not testing:
        scaler = StandardScaler()
        scaler.fit(X)
        data["scaler"] = scaler
    scaler = data["scaler"]
    
    return scaler.transform(X)

In [90]:
def loss(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [91]:
def abs_loss(y_true, y_pred):
    ret = tf.math.abs(y_true - y_pred)
    return tf.math.maximum(ret, 0)

In [92]:
X_train = np.hstack([X_syn_train, X_phon_train, X_emb_train, X_grams_train])
y_train = df_train["p"].values
X_train = scale(X_train, DATA)

X_val = np.hstack([X_syn_val, X_phon_val, X_emb_val, X_grams_val])
y_val = df_val["p"].values
X_val = scale(X_val, DATA, testing=True)

# idx = (y > 0)
# X = X[idx]
# y = y[idx]
# print(idx)
print(X_train.shape)
print(y_train.shape)

(11201, 921)
(11201,)


In [93]:
print(X_train.mean())
print(X_train.std())
print(X_val.mean())
print(X_val.std())

2.2993874044919776e-17
0.9999999999999999
-0.0025842353106073435
1.0335617417418765


In [84]:
def get_model(input_dim, keep_dim):
    inp = tfkl.Input(shape=(input_dim,))
    inp_drop = tfkl.Lambda(lambda t: t[:, keep_dim:])(inp)
    inp_drop = tfkl.Dropout(0.3)(inp_drop)
    inp_keep = tfkl.Lambda(lambda t: t[:, :keep_dim])(inp)
    inp_conc = tfkl.Concatenate()([inp_drop, inp_keep])
    h = tfkl.Dense(300, activation='relu')(inp_conc)
    pred = tfkl.Dense(1)(h)
    return tfk.Model(inp, pred)

In [85]:
model = get_model(X_train.shape[1], X_syn_train.shape[1] + X_phon_train.shape[1])
optimizer = tfk.optimizers.Adam(learning_rate=1e-3)
model.compile(loss=abs_loss, optimizer=optimizer)

print(model.summary())

reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=np.sqrt(0.1),
    patience=5, 
    min_lr=1e-5,
    verbose=1
)

model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=1000,
    callbacks=[reduce_lr],
)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 916)]        0                                            
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 900)          0           input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 900)          0           lambda_6[0][0]                   
__________________________________________________________________________________________________
lambda_7 (Lambda)               (None, 16)           0           input_4[0][0]                    
____________________________________________________________________________________________

Epoch 00051: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000


_NotOkStatusException: InvalidArgumentError: Error while reading CompositeTensor._type_spec.

In [None]:
test_df = read_df('data/test.txt', COL_NAMES[:-3])
test_df.describe()

In [None]:
test_df.head()

In [None]:
X_test = np.hstack([
    get_syn(test_df, DATA, testing=True), 
    get_emb(test_df, DATA, testing=True), 
    get_grams(test_df, DATA, testing=True),
])
print(X_test.shape)

In [None]:
predictions = model.predict(X_test).ravel()
predictions = np.clip(predictions, 0, 1)
print(predictions.shape)
print(predictions[:5])

In [None]:
with open('submission.txt', 'w') as file:
    lines = ["id,label"]
    for idx, pred in zip(test_df["idx"], predictions):
        lines.append(f"{idx},{pred:.1f}")
    file.write('\n'.join(lines))