In [0]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [0]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

--2019-03-25 05:28:45--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.6.166, 104.20.22.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.6.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2019-03-25 05:30:40 (12.8 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]



In [0]:
!ls

crawl-300d-2M.vec  gdrive  sample_data


In [0]:
!unzip crawl-300d-2M.vec.zip

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [0]:
!rm crawl-300d-2M.vec.zip

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
#!ls "/content/gdrive/My Drive/Kaggle"
!cp "/content/gdrive/My Drive/Kaggle/sample_submission.csv" sample_submission.csv

In [0]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'

# !cd "/content/gdrive/My Drive"
train = pd.read_csv("/content/gdrive/My Drive/Kaggle/train.csv")
test = pd.read_csv("/content/gdrive/My Drive/Kaggle/test.csv")
submission = pd.read_csv("/content/gdrive/My Drive/Kaggle/sample_submission.csv")
# !cd

In [0]:
X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values


max_features = 30000
maxlen = 100
embed_size = 300

In [0]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [0]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [0]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [0]:
batch_size = 32
epochs = 2


In [0]:
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('submission.csv', index=False)

In [0]:
from keras.models import load_model

model = load_model('/content/gdrive/My Drive/Kaggle/gru_fasttext.h5')


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [0]:
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_base.csv', index=False)
!cp submission_base.csv "/content/gdrive/My Drive/Kaggle/submission_base.csv"

In [0]:
reddit_comments = pd.read_csv("/content/gdrive/My Drive/Kaggle/comment_stream_117k.csv")


In [0]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [0]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [0]:

reddit_tokens = tokenizer.texts_to_sequences(reddit_comments["comment"])
reddit_sequences = sequence.pad_sequences(reddit_tokens, maxlen=maxlen)


reddit_pred = model.predict(reddit_sequences, batch_size = 1024)

In [0]:
reddit_pred_new = reddit_pred>0.5
model_new = model

In [0]:
np.shape(reddit_sequences)

(117242, 100)

In [0]:
model_new.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     9000000     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 160)     182880      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

In [0]:
reddit_sequences_temp = reddit_sequences[:30000]
reddit_pred_new_temp = reddit_pred_new[:30000]

In [0]:
newhist = model_new.fit(reddit_sequences, reddit_pred_new, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks=[RocAuc], verbose=1) 

Train on 117242 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986182 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.984886 



In [0]:
model_new.save('pseudo_label_full.h5')

!cp pseudo_label_full.h5 "/content/gdrive/My Drive/Kaggle/pseudo_label_full.h5"

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create & upload a file.
uploaded = drive.CreateFile({'title': 'pseudo_label.csv'})
uploaded.SetContentFile('pseudo_label.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

In [0]:
from keras.models import load_model

model = load_model('/content/gdrive/My Drive/pseudo_label.h5')

OSError: ignored

In [0]:
new_preds = model_new.predict(x_test, batch_size = 1024)

In [0]:

submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = new_preds
submission.to_csv('submission.csv', index=False)

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create & upload a file.
uploaded = drive.CreateFile({'title': 'submission.csv'})
uploaded.SetContentFile('submission.csv')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

[?25l[K    1% |▎                               | 10kB 4.4MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.3MB/s eta 0:00:01[K    3% |█                               | 30kB 2.0MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.5MB/s eta 0:00:01[K    5% |█▋                              | 51kB 1.9MB/s eta 0:00:01[K    6% |██                              | 61kB 2.2MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.5MB/s eta 0:00:01[K    8% |██▋                             | 81kB 2.9MB/s eta 0:00:01[K    9% |███                             | 92kB 3.2MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.6MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.7MB/s eta 0:00:01[K    12% |████                            | 122kB 4.1MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.1MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.5MB/s eta 0:00:01[K