In [1]:
#!/usr/bin/env python
# coding: utf-8

from __future__ import division, print_function
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import random
from copy import deepcopy
from datetime import datetime
import time
import gc
import os

from sklearn import model_selection, preprocessing, metrics, linear_model
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score
from sklearn.utils import shuffle
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim.optimizer import Optimizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import re
import string
from unidecode import unidecode
from gensim.models import KeyedVectors
import gensim
import nltk
from wordcloud import STOPWORDS
import operator
from collections import Counter

begin_time = datetime.now()

embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use
batch_size = 512
seed = 1029

val = False
gpu_device_count = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_device_count   #指定第一块GPU可用

def set_seed(seed=2018):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    tf.set_random_seed(seed)
set_seed(seed)



puncts=['ɖ', '✔', ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '谢', '六', '佬', '|', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', 'é', '&amp;', '₹', 'á', '²', 'ế', '청', '하', '¨', '‘', '√', '\xa0', '高', '端', '大', '气', '上', '档', '次', '_', '½', 'π', '#', '小', '鹿', '乱', '撞', '成', '语', 'ë', 'à', 'ç', '@', 'ü', 'č', 'ć', 'ž', 'đ', '°', 'द', 'े', 'श', '्', 'र', 'ो', 'ह', 'ि', 'प', 'स', 'थ', 'त', 'न', 'व', 'ा', 'ल', 'ं', '林', '彪', '€', '\u200b', '˚', 'ö', '~', '—', '越', '人', 'च', 'म', 'क', 'ु', 'य', 'ी', 'ê', 'ă', 'ễ', '∞', '抗', '日', '神', '剧', '，', '\uf02d', '–', 'ご', 'め', 'な', 'さ', 'い', 'す', 'み', 'ま', 'せ', 'ん', 'ó', 'è', '£', '¡', 'ś', '≤', '¿', 'λ', '魔', '法', '师', '）', 'ğ', 'ñ', 'ř', '그', '자', '식', '멀', '쩡', '다', '인', '공', '호', '흡', '데', '혀', '밀', '어', '넣', '는', '거', '보', '니', 'ǒ', 'ú', '️', 'ش', 'ه', 'ا', 'د', 'ة', 'ل', 'ت', 'َ', 'ع', 'م', 'ّ', 'ق', 'ِ', 'ف', 'ي', 'ب', 'ح', 'ْ', 'ث', '³', '饭', '可', '以', '吃', '话', '不', '讲', '∈', 'ℝ', '爾', '汝', '文', '言', '∀', '禮', 'इ', 'ब', 'छ', 'ड', '़', 'ʒ', '有', '「', '寧', '錯', '殺', '一', '千', '絕', '放', '過', '」', '之', '勢', '㏒', '㏑', 'ू', 'â', 'ω', 'ą', 'ō', '精', '杯', 'í', '生', '懸', '命', 'ਨ', 'ਾ', 'ਮ', 'ੁ', '₁', '₂', 'ϵ', 'ä', 'к', 'ɾ', '\ufeff', 'ã', '©', '\x9d', 'ū', '™', '＝', 'ù', 'ɪ', 'ŋ', 'خ', 'ر', 'س', 'ن', 'ḵ', 'ā']
def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def token_text(x):
    x = str(x)
    return nltk.word_tokenize(x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10
        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)


class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma ** (x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs







def load_and_prec():
    train_df = pd.read_csv("../input/train.csv")
    #     test_df = pd.read_csv("../input/test.csv")
    if val:
        test_df = train_df[:130000].reset_index(drop=True)
        train_df = train_df[130000:].reset_index(drop=True)
    else:
        test_df = pd.read_csv("../input/test.csv")
    train_df["question_text"] = train_df["question_text"].str.lower()
    test_df["question_text"] = test_df["question_text"].str.lower()
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))
    train_df["question_text"].fillna("_##_")
    test_df["question_text"].fillna("_##_")

    train_df["question_text"] = train_df["question_text"].apply(lambda x: token_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: token_text(x))
    train_X = train_df["question_text"].values
    test_X = test_df["question_text"].values

    pos_df = train_df[(train_df['target'] == 1)]
    pos_X = train_df['question_text'].values
    counts = Counter()

    for sentence in pos_X:
        counts.update(sentence)
        counts.update(sentence)

    for sentence in train_X:
        sentence = list(set(sentence))
        counts.update(sentence)
    counts = sorted(counts.items(), key=operator.itemgetter(1))
    counts.reverse()
    counts = dict(counts)
    
    
    ##mispell words
    mis_words_dic={'britebart': 'breitbart', 'earg': 'urge', 'houthies': 'houthis', 'bipan': 'bipin', 'hags': 'hag', 'bullyboy': 'bully', 'fyck': 'fuck', 'dwelve': 'delve', 'radivals': 'radical', 'winy': 'whiny', 'mysoginists': 'misogynist', 'catergorized': 'categorized', 'consecrated': 'consecrate', 'agancy': 'agency', 'cuckservatives': 'conservatives', 'gerrymanders': 'gerrymander', 'canadia': 'canadian', 'babaric': 'barack', 'schoolteachers': 'teachers', 'kebba': 'kaaba', 'ladys': 'ladies', 'gunarathne': 'gunaratne', 'howmother': 'how', 'cooties': 'cock', 'teeenage': 'teenage', 'owaesi': 'owaisi', 'supplants': 'supplant', 'terming': 'term', 'mongolianz': 'mongolians', 'extorts': 'extort', 'tertullians': 'tertullian', 'paraniod': 'paranoid', 'wakandan': 'wakanda', 'transgeneric': 'transgender', 'himen': 'hymen', 'illbred': 'bred', 'transgeder': 'transgender', 'goofle': 'google', 'persecutor': 'prosecutor', 'fucktard': 'fuck', 'emboldens': 'embolden', 'jalikatu': 'jallikattu', 'fashist': 'fascist', 'earf': 'earth', 'causians': 'caucasians', 'dishornable': 'dishonorable', 'antimodi': 'modi', 'detoxifying': 'detoxing', 'preodential': 'presidential', 'ccausing': 'causing', 'lyncing': 'lynching', 'exonorated': 'exonerated', 'woemn': 'women', 'cinhese': 'chinese', 'holoucast': 'holocaust', 'kenpeitai': 'kempeitai', 'fonies': 'phonies', 'bibicial': 'biblical', 'antiscience': 'anti', 'grothesque': 'grotesque', 'stokholm': 'stockholm', 'homophilia': 'homosexuality', 'gorrilas': 'gorillas', 'chaibala': 'chaiwala', 'teluguians': 'telugu', 'hadrat': 'hazrat', 'ᗰoᖇe': 'more', 'nonpracticing': 'practicing', 'statistucs': 'statistics', 'unacceptabing': 'unacceptable', 'globed': 'globe', 'sommons': 'summons', 'ussually': 'usually', 'centimiters': 'centimeters', 'phoniness': 'phony', 'overral': 'overall', "apist": "rapist", "pachabottu": "pacha", "mycaster": "code", "loundly": "loudly", "fetxh": "data", "clickbait": "click", "tigernut": "tiger", "indiaqr": "code", "narcist": "narcissist", "quorreled": "quarreled", "bharathnatyam": "bharatnatyam", "sinnister": "sinister", "careamics": "ceramics", "diffferently": "differently", "kubernetes": "software", "maintanable": "maintainable", "nikodym": "radon", "thunderstike": "thunderstrike", "thaads": "thaad", "arichtecture": "architecture", "simlarity": "similarity", "walmartlabs": "walmart", "padmaavat": "padmavati", "swagatham":"program","cleanshot": "clean", "bandhup": "bhandup", "ramk": "rank", "examinaton": "examination", "imucet": "cet", "mongorestore": "linux", "mongodump": "linux", "pakkstani": "pakistani", "venuas": "venus", "savegely": "savagely", "redmi": "iphone", "anizara": "aniraza", "nanodegree": "degree", "calead": "leap"}
    mis_words = set(list(mis_words_dic.keys()))

    oov_words = []
    all_words_dic = {}
    word_dict = {}
    j = 1

    for word, i in counts.items():
        if word in mis_words:
            oov_words.append(word)
            continue
        word_dict[word] = j
        all_words_dic[word] = j
        j += 1
        if j == max_features:
            break
    for word in oov_words:
        try:
            all_words_dic[word] = word_dict[mis_words_dic[word]]
        except KeyError:
            pass

    def pad_sentence(sentence):
        ####把问号放在最后面
        lenth = len(sentence)
        list1 = []
        temp = []
        for i in range(lenth):
            word = sentence[i]
            temp.append(word)
            if word == '?':
                list1 = temp + list1
                temp = []
        list1 = temp + list1
        ###把问号放在最后面
        list1.reverse()
        list2 = []
        list3 = []
        for word in list1:
            try:
                list3.append(all_words_dic[word])
            except KeyError:
                pass
        l1 = len(list3)
        for i in range(maxlen):
            if i < l1:
                list2.append(list3[i])
            else:
                list2.append(0)
        list2.reverse()
        return list2

    if val:
        train_x = []
        for sentence in train_X:
            train_x.append(pad_sentence(sentence))
        test_x = []
        for sentence in test_X:
            test_x.append(pad_sentence(sentence))
        train_y = train_df['target'].values
        test_y = test_df['target'].values
        train_x = np.array(train_x)
        test_x = np.array(test_x)
        return train_x, test_x, train_y, test_y, word_dict
    else:
        train_x = []
        for sentence in train_X:
            train_x.append(pad_sentence(sentence))
        test_x = []
        for sentence in test_X:
            test_x.append(pad_sentence(sentence))
        train_y = train_df['target'].values
        train_x = np.array(train_x)
        test_x = np.array(test_x)
        return train_x, test_x, train_y, word_dict


# **Load embeddings**
def load_glove_fast(word_index):
    up_dict = {}
    mask = np.zeros((max_features,))
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    emb_mean, emb_std = -0.005838499, 0.48782197
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8") as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                if word.lower() not in word_index:
                    continue
                embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
                if len(embedding_vector) == 300:
                    up_dict[word.lower()] = embedding_vector
                continue
            i = word_index[word]
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
                mask[i] = 1
    return embedding_matrix, mask

def load_para_fast(word_index):
    up_dict = {}
    mask = np.zeros((max_features,))
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    emb_mean, emb_std = -0.005838499, 0.48782197
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                if word.lower() not in word_index:
                    continue
                embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
                if len(embedding_vector) == 300:
                    up_dict[word.lower()] = embedding_vector
                continue
            i = word_index[word]
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
                mask[i] = 1
    return embedding_matrix, mask

def load_fast_fast(word_index):
    up_dict = {}
    mask = np.zeros((max_features,))
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    emb_mean, emb_std = -0.005838499, 0.48782197
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                if word.lower() not in word_index:
                    continue
                embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
                if len(embedding_vector) == 300:
                    up_dict[word.lower()] = embedding_vector
                continue
            i = word_index[word]
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
                mask[i] = 1
    return embedding_matrix, mask




'''split-data'''
submit_mode = 0
if val:
    path = './spnpy_val'
else:
    path = './spnpy_sub'
if submit_mode == 0:
    start_time = time.time()
    if val:
        train_X, test_X, train_y, test_y, word_index = load_and_prec()
    else:
        train_X, test_X, train_y, word_index = load_and_prec()
    print("time :", time.time() - start_time)
    print(train_X.shape, test_X.shape, train_y.shape, len(word_index))

    start_time = time.time()
    embedding_matrix_g, mask_g = load_glove_fast(word_index)
    embedding_matrix_p, mask_p = load_para_fast(word_index)
    embedding_matrix_f, mask_f = load_fast_fast(word_index)
    embedding_matrix_gp = np.mean([embedding_matrix_g, embedding_matrix_p, embedding_matrix_f], axis=0)
    print("time :", time.time() - start_time)

elif submit_mode == 1:
    start_time = time.time()
    if val:
        train_X, test_X, train_y, test_y, word_index = load_and_prec()
    else:
        train_X, test_X, train_y, word_index = load_and_prec()
    print("time :", time.time() - start_time)
    print(train_X.shape, test_X.shape, train_y.shape, len(word_index))

    start_time = time.time()
    embedding_matrix_g, mask_g = load_glove_fast(word_index)
    embedding_matrix_p, mask_p = load_para_fast(word_index)
    embedding_matrix_f, mask_f = load_fast_fast(word_index)
    embedding_matrix_gp = np.mean([embedding_matrix_g, embedding_matrix_p, embedding_matrix_f], axis=0)
    print("time :", time.time() - start_time)

    if not os.path.isdir(path):
        os.mkdir(path)

    np.save(path + "/X_train.npy", train_X)
    np.save(path + "/X_test.npy", test_X)
    np.save(path + "/y_train.npy", train_y)
    if val:
        np.save(path + "/y_test.npy", test_y)
    np.save(path + "/vocab.npy", word_index)

    np.save(path + "/embedding_matrix_g.npy", embedding_matrix_g)
    np.save(path + "/embedding_matrix_p.npy", embedding_matrix_p)
    np.save(path + "/embedding_matrix_gp.npy", embedding_matrix_gp)

elif submit_mode == 2:
    train_X = np.load(path + "/X_train.npy")
    test_X = np.load(path + "/X_test.npy")
    train_y = np.load(path + "/y_train.npy")
    if val:
        test_y = np.load(path + "/y_test.npy")
    word_index = np.load(path + "/vocab.npy")
    print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

    embedding_matrix_g = np.load(path + "/embedding_matrix_g.npy")
    embedding_matrix_p = np.load(path + "/embedding_matrix_p.npy")
    embedding_matrix_gp = np.load(path + "/embedding_matrix_gp.npy")


if val:
    x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
    y_test_cuda = torch.tensor(test_y[:, np.newaxis], dtype=torch.float32).cuda()
    test = torch.utils.data.TensorDataset(x_test_cuda, y_test_cuda)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    x_train_fold = torch.tensor(train_X, dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[:, np.newaxis], dtype=torch.float32).cuda()
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
else:
    x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
    test = torch.utils.data.TensorDataset(x_test_cuda)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    x_train_fold = torch.tensor(train_X, dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[:, np.newaxis], dtype=torch.float32).cuda()
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)


# ****************************** model 1 ******************************
def model1(epochs=4, embedding_matrix=None, clip=False):
    hidden_size = 60
    gru_len = hidden_size
    Routings = 4
    Num_capsule = 5
    Dim_capsule = 5
    T_epsilon = 1e-7

    
    class Caps_Layer(nn.Module):
        def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                     routings=Routings, kernel_size=(9, 1), share_weights=True,
                     activation='default', **kwargs):
            super(Caps_Layer, self).__init__(**kwargs)

            self.num_capsule = num_capsule
            self.dim_capsule = dim_capsule
            self.routings = routings
            self.kernel_size = kernel_size  # 暂时没用到
            self.share_weights = share_weights
            if activation == 'default':
                self.activation = self.squash
            else:
                self.activation = nn.ReLU(inplace=True)

            if self.share_weights:
                self.W = nn.Parameter(
                    nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
            else:
                self.W = nn.Parameter(
                    torch.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))

        def forward(self, x):
            if self.share_weights:
                u_hat_vecs = torch.matmul(x, self.W)
            else:
                print('add later')
            batch_size = x.size(0)
            input_num_capsule = x.size(1)
            u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                          self.num_capsule, self.dim_capsule))
            u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)
            b = torch.zeros_like(u_hat_vecs[:, :, :, 0])

            for i in range(self.routings):
                b = b.permute(0, 2, 1)
                c = F.softmax(b, dim=2)
                c = c.permute(0, 2, 1)
                b = b.permute(0, 2, 1)
                outputs = self.activation(torch.einsum('bij,bijk->bik', (c, u_hat_vecs)))
                
                if i < self.routings - 1:
                    b = torch.einsum('bik,bijk->bij', (outputs, u_hat_vecs))
            return outputs  # (batch_size, num_capsule, dim_capsule)

        def squash(self, x, axis=-1):
            s_squared_norm = (x ** 2).sum(axis, keepdim=True)
            scale = torch.sqrt(s_squared_norm + T_epsilon)
            return x / scale


    class NeuralNet(nn.Module):
        def __init__(self):
            super(NeuralNet, self).__init__()

            linear_unit = 16
            linear_unit1 = 16

            self.embedding = nn.Embedding(max_features, embed_size)
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            self.embedding.weight.requires_grad = False

            self.embedding_dropout = nn.Dropout2d(0.1)
            self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
            self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
            self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

            self.lstm_atten = Attention(hidden_size * 2, maxlen)
            self.gru_atten = Attention(hidden_size * 2, maxlen)

            self.linear = nn.Linear(hidden_size * 8 + 1, linear_unit1)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(0.1)
            self.fc = nn.Linear(linear_unit ** 2, linear_unit)
            self.out = nn.Linear(linear_unit, 1)
            self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
            self.caps_layer = Caps_Layer()

        def forward(self, x):

            embeds = self.embedding(x)
            embeds = torch.squeeze(self.embedding_dropout(torch.unsqueeze(embeds, 0)))

            lstm_out, _ = self.lstm(embeds)
            gru_out, _ = self.gru(lstm_out)

            caps_out = self.caps_layer(gru_out)
            caps_out = self.dropout(caps_out)
            caps_out = caps_out.view(caps_out.size(0), -1)
            caps_out = self.relu(self.lincaps(caps_out))

            lstm_atten = self.lstm_atten(lstm_out)
            gru_atten = self.gru_atten(gru_out)

            avg_pool = torch.mean(gru_out, 1)
            max_pool, _ = torch.max(gru_out, 1)
            
            conc = torch.cat((lstm_atten, gru_atten, caps_out, avg_pool, max_pool), 1)
            conc = self.relu(self.linear(conc))
            conc = self.dropout(conc)
            out = self.out(conc)
            return out


    model = NeuralNet()
    model.cuda()

    class_weight = torch.FloatTensor([1.3]).cuda()
    loss_func = torch.nn.BCEWithLogitsLoss(reduction='sum', pos_weight=class_weight).cuda()

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.003)
    scheduler = CyclicLR(optimizer, base_lr=0.001, max_lr=0.003, step_size=300, mode='exp_range', gamma=0.99994)

    for epoch in range(epochs):
        test_preds = np.zeros((len(test_X)))
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        for x_batch, y_batch in train_loader:
            y_pred = model(x_batch)
            if scheduler:
                scheduler.batch_step()
            loss = loss_func(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            if clip:
                nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        model.eval()
        test_preds_epoch = np.zeros(len(test_X))
        avg_val_loss = 0.
        if val:
            for step, (x_batch, y_batch) in enumerate(test_loader):
                y_pred = model(x_batch).detach()
                loss = loss_func(y_pred, y_batch)
                avg_val_loss += loss.item() / len(test_loader)
                test_preds_epoch[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            test_preds += test_preds_epoch
            
            point = int(len(list(test_preds_epoch)) * 0.93065)
            thresh = sorted(list(test_preds_epoch))[point]
            pred_y = (test_preds_epoch > thresh).astype(int)
            score = f1_score(y_pred=pred_y, y_true=test_y)
            print('Epoch [{}/{}] - time: {:.2f}s - loss: {:.4f} - val_loss: {:.4f} - f1 score: {}'.format(epoch+1, epochs, time.time() - start_time, avg_loss, avg_val_loss, score))
        else:
            for step, (x_batch,) in enumerate(test_loader):
                y_pred = model(x_batch).detach()
                test_preds_epoch[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            test_preds += test_preds_epoch
            print('Epoch [{}/{}] - time: {:.2f}s'.format(epoch+1, epochs, time.time() - start_time))
        
    return test_preds
        

# ****************************** model 2 ******************************
def model2(epochs=4, embedding_matrix=None, clip=False):
    
    class NeuralNet(nn.Module):
        def __init__(self):
            super(NeuralNet, self).__init__()
            
            hidden_size = 120
            
            self.embedding = nn.Embedding(max_features, embed_size)
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            self.embedding.weight.requires_grad = False
            
            self.embedding_dropout = nn.Dropout2d(0.1)
            self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
            self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
            
            self.lstm_attention = Attention(hidden_size * 2, maxlen)
            self.gru_attention = Attention(hidden_size * 2, maxlen)
            
            self.linear = nn.Linear(hidden_size * 8, 64)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(0.1)
            self.out = nn.Linear(64, 1)

        def forward(self, x):
            
            embeds = self.embedding(x)
            embeds = torch.squeeze(self.embedding_dropout(torch.unsqueeze(embeds, 0)))
            
            lstm_out, _ = self.lstm(embeds)
            gru_out, _ = self.gru(lstm_out)
            
            lstm_atten = self.lstm_attention(lstm_out)
            gru_atten = self.gru_attention(gru_out)
            
            avg_pool = torch.mean(gru_out, 1)
            max_pool, _ = torch.max(gru_out, 1)
            
            conc = torch.cat((lstm_atten, gru_atten, avg_pool, max_pool), 1)
            conc = self.relu(self.linear(conc))
            conc = self.dropout(conc)
            out = self.out(conc)
            return out

    model = NeuralNet()
    model.cuda()
    
    class_weight = torch.FloatTensor([1.55]).cuda()
    loss_func = torch.nn.BCEWithLogitsLoss(reduction='sum', pos_weight=class_weight).cuda()
    
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.003)
    scheduler = CyclicLR(optimizer, base_lr=0.001, max_lr=0.003, step_size=300, mode='exp_range', gamma=0.99994)

    for epoch in range(epochs):
        test_preds = np.zeros((len(test_X)))
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        for x_batch, y_batch in train_loader:
            y_pred = model(x_batch)
            if scheduler:
                scheduler.batch_step()
            loss = loss_func(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            if clip:
                nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        model.eval()
        test_preds_epoch = np.zeros(len(test_X))
        avg_val_loss = 0.
        if val:
            for step, (x_batch, y_batch) in enumerate(test_loader):
                y_pred = model(x_batch).detach()
                loss = loss_func(y_pred, y_batch)
                avg_val_loss += loss.item() / len(test_loader)
                test_preds_epoch[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            test_preds += test_preds_epoch
            
            point = int(len(list(test_preds_epoch)) * 0.93065)
            thresh = sorted(list(test_preds_epoch))[point]
            pred_y = (test_preds_epoch > thresh).astype(int)
            score = f1_score(y_pred=pred_y, y_true=test_y)
            print('Epoch [{}/{}] - time: {:.2f}s - loss: {:.4f} - val_loss: {:.4f} - f1 score: {}'.format(epoch+1, epochs, time.time() - start_time, avg_loss, avg_val_loss, score))
        else:
            for step, (x_batch,) in enumerate(test_loader):
                y_pred = model(x_batch).detach()
                test_preds_epoch[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            test_preds += test_preds_epoch
            print('Epoch [{}/{}] - time: {:.2f}s'.format(epoch+1, epochs, time.time() - start_time))
        
    return test_preds


print('>>> 1: ')
pred0 = model1(4, embedding_matrix_gp)

print('>>> 2: ')
pred1 = model1(4, embedding_matrix_g)

print('>>> 3: ')
pred2 = model2(4, embedding_matrix_gp)

print('>>> 4: ')
pred3 = model2(4, embedding_matrix_g)

end_time = datetime.now()
print(">>>Total runtime %s." % (end_time - begin_time))
# ****************************** submit ******************************
if val:
    # test_preds = pred0 + pred1 + pred2 + pred3
    test_preds = 0.22598016 * pred0 + 0.17527643 * pred1 + 0.25782847 * pred2 + 0.28216464 * pred3
    point = int(len(list(test_preds)) * 0.93065)
    thresh = sorted(list(test_preds))[point]
    pred_y = (test_preds > thresh).astype(int)
    score = f1_score(y_pred=pred_y, y_true=test_y)
    print("  f1 score :", score)

else:
    df_test = pd.read_csv("../input/test.csv")
    submission = df_test[['qid']].copy()
    # test_preds = pred0 + pred1 + pred2 + pred3
    test_preds = 0.22598016 * pred0 + 0.17527643 * pred1 + 0.25782847 * pred2 + 0.28216464 * pred3
    point = int(len(list(test_preds)) * 0.9305246930441406)
    thresh = sorted(list(test_preds))[point]
    print(" 阀值:",thresh)
    submission['prediction'] = (test_preds > thresh).astype(int)
    submission.to_csv('submission.csv', index=False)

Using TensorFlow backend.


time : 459.18351459503174
(1306122, 70) (56370, 70) (1306122,) 94999
time : 67.73985266685486
>>> 1: 
Epoch [1/4] - time: 284.54s
Epoch [2/4] - time: 284.70s
Epoch [3/4] - time: 285.39s
Epoch [4/4] - time: 285.49s
>>> 2: 
Epoch [1/4] - time: 284.87s
Epoch [2/4] - time: 285.41s
Epoch [3/4] - time: 285.51s
Epoch [4/4] - time: 285.46s
>>> 3: 
Epoch [1/4] - time: 390.00s
Epoch [2/4] - time: 390.06s
Epoch [3/4] - time: 390.14s
Epoch [4/4] - time: 390.05s
>>> 4: 
Epoch [1/4] - time: 390.15s
Epoch [2/4] - time: 390.07s
Epoch [3/4] - time: 390.06s
Epoch [4/4] - time: 390.07s
>>>Total runtime 1:38:56.990126.
 阀值: 0.3807620794631004
