### Setting seeds

In [1]:
import torch
import numpy as np
import random
import os

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def set_seeds(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

SEED = 21937
set_seeds(SEED)

SVD_QTY = 150

### Data extraction

In [2]:
import pandas as pd

pd.set_option('max_colwidth', 120)

train_dset = pd.read_csv("../input/google-quest-challenge/train.csv", index_col='qa_id')
test_dset = pd.read_csv("../input/google-quest-challenge/test.csv", index_col='qa_id')

Selection of columns based of possibility of feature extraction:

In [3]:
free_text_columns = ['question_title', 'question_body', 'answer']

category_columns = ['host', 'category']

discard_columns = ['question_user_name', 'question_user_page',  'answer_user_name', 'answer_user_page', 'url']

target_columns = ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
                  'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
                  'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
                  'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
                  'question_type_compare', 'question_type_consequence', 'question_type_definition',
                  'question_type_entity', 'question_type_instructions', 'question_type_procedure',
                  'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
                  'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
                  'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
                  'answer_type_reason_explanation', 'answer_well_written']

In [4]:
y_train = train_dset[target_columns].copy()
train_dset = train_dset.drop(target_columns+discard_columns, axis=1)

test_ids = test_dset.index
test_dset = test_dset.drop(discard_columns, axis=1)

### Preprocess

NLP Tools class

In [5]:
from nltk import (sent_tokenize,
                  word_tokenize,
                  pos_tag)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import re


class NlpUtils:

    def __init__(self):
        self.html_regex = re.compile('<.*?>')
        self.stopwords_set = set(stopwords.words('english'))

    def nlp_text(self, corpus: str) -> list:
        """
        Processes a text by transforming it into a cleaned and lemmatized str, also counting PoSTags
        """
        lemmatizer = WordNetLemmatizer()
        corpus = self._text_cleanup(corpus.strip())
        pos_tag_counter = defaultdict(int)
        corpus_tokens = []
        if len(corpus) > 0:
            corpus, pos_tag_counter, corpus_tokens = self._stopword_postag_lemma(corpus, lemmatizer)
        return corpus, pos_tag_counter, corpus_tokens

    def _text_cleanup(self, corpus: str) -> str:
        """ General purpose text cleaner (removes html code and unnecessary punctuation for NLP) """
        corpus = corpus.lower()
        if corpus != ' ' and re.search(r'\w', corpus):
            corpus = re.sub(self.html_regex, '', corpus)    # Remove html code blocks
            corpus = re.sub(r'[;:,.!?\n\r]', '.', corpus).strip()   # Replace all sentence split punctuation by dot
            if re.search(r'\.+[a-z]', corpus):    # Sentence punctuation followed directly by char separation
                corpus = re.sub(r'\.+[a-z]', '. ', corpus)
            corpus = re.sub(r'[\"\'-()#@<>{}`+=~|\[\]]', ' ', corpus)   # Clean rest of usual symbols
            if not corpus.endswith(".") or corpus.endswith(" ."):
                corpus += "."
        else:
            corpus = ""
        return corpus

    def _split_delete_stopwords(self, text: str) -> (list, list):
        """
        English stopword removal from a text and word tokenizer
        """
        text_words = word_tokenize(text)
        text_tokens_filtered = [word for word in text_words if word not in self.stopwords_set]
        text_words = list(filter(lambda x: x != '.', text_words))
        return text_tokens_filtered, text_words

    def _stopword_postag_lemma(self, text: str, lemmatizer: WordNetLemmatizer) -> (str, dict, list):
        """ Word Tokenizes a text, removing stopwords and then lemmatizes by PoSTagging (keeps ADJ, ADV, NOUN, VERB)
            Returns cleaned text (no stopwords and lemmatized), PoSTag stats and word tokenized text """
        text_tokens_filtered, text_tokens = self._split_delete_stopwords(text)

        sentence_pos_tag = pos_tag(text_tokens_filtered)
        sentence_pos_tag_lemmas = []
        pos_tag_counter = defaultdict(int)
        for word, tag in sentence_pos_tag:
            pos_tag_counter[tag] += 1
            wordnet_tag = tag[0].lower()
            wordnet_tag = wordnet_tag if wordnet_tag in ['a', 'r', 'n', 'v'] else None
            if not wordnet_tag:
                lemma = word
            else:
                lemma = lemmatizer.lemmatize(word, wordnet_tag)
            sentence_pos_tag_lemmas.append(lemma)
        result = ' '.join(sentence_pos_tag_lemmas)
        return result, pos_tag_counter, text_tokens

Feature engineering tools

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from category_encoders.one_hot import OneHotEncoder


nlp_utils = NlpUtils()
POS_TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS',
            'PDT', 'POS', 'PRP', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
            'VBZ', 'WDT', 'WP', 'WP', 'WRB']
STOPWORDS_SET = set(stopwords.words('english'))
PUNCTUATION_SET = {';', ':', ',', '.', '!', '?', '\n', '\r', '-', '(', ')', '`', '$', '<', '>', '=', '+'}
QUESTION_WORDS = {'who', 'what', 'why', 'how', 'where', 'when', 'with', 'whose', 'whom', 'if', 'or'}


def preprocess_text(text):
    return text.apply(lambda x: pd.Series(nlp_utils.nlp_text(x)))


def oh_encoder() -> Pipeline:
    return Pipeline([('OHE', OneHotEncoder(drop_invariant=True))], verbose=True)


def tfidf_pipeline() -> Pipeline:
    return Pipeline([
        ('Text-TF-IDF', TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=2, max_df=0.7, lowercase=False)),
        ('Text-SVD', TruncatedSVD(n_components=SVD_QTY))], verbose=True)


def custom_transformer(method) -> Pipeline:
    return Pipeline([
        ('Custom Function', FunctionTransformer(method, validate=False)),
        ], verbose=True)


@custom_transformer
def char_count(text: pd.Series) -> int:
    return pd.DataFrame(text.apply(lambda row: len(row)))


@custom_transformer
def word_count(text: pd.Series) -> int:
    """ Given a word tokenized text it returns the quantity of words"""
    return pd.DataFrame(text.apply(lambda row: len(row)))


@custom_transformer
def unique_word_count(text: pd.Series) -> int:
    """ Given a preprocessed text it returns the quantity of unique words """
    return pd.DataFrame(text.apply(lambda row: len(set(word_tokenize(row)))))


@custom_transformer
def sentence_count(text: pd.Series) -> int:
    return pd.DataFrame(text.apply(lambda row: len(sent_tokenize(row))))


def postag_ratio_calc(postag_dict: dict) -> pd.DataFrame:
    total_tags = sum(postag_dict.values())
    postag_counter = {}
    for tag in POS_TAGS:
        try:
            postag_counter[tag] = postag_dict.get(tag, 0) / total_tags
        except ZeroDivisionError:
            postag_counter[tag] = 0.0
    return pd.DataFrame(postag_counter, index=[0]).iloc[0]


@custom_transformer
def postag_ratio(postag_dict: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(postag_dict.apply(lambda row: postag_ratio_calc(row)))


def match_word_calc(text_1: str, text_2: str) -> (float, float):
    words_text_1 = set(text_1.replace('.', '').split(' '))
    words_text_2 = set(text_2.replace('.', '').split(' '))
    match_len = len(words_text_1 & words_text_2)
    try:
        text_1_ratio = match_len / len(words_text_1)
        text_2_ratio = match_len / len(words_text_2)
    except ZeroDivisionError:
        text_1_ratio = text_2_ratio = 0.0
    return text_1_ratio, text_2_ratio


@custom_transformer
def matching_words(text_cols: pd.DataFrame) -> pd.DataFrame:
    return text_cols.apply(lambda row: match_word_calc(row[0], row[1]), result_type='expand', axis=1)


def stopword_ratio_calc(tokenized_text: str) -> float:
    word_count = len(tokenized_text)
    stopword_count = sum([1 if word.lower() in STOPWORDS_SET else 0 for word in tokenized_text])
    try:
        return stopword_count/word_count
    except ZeroDivisionError:
        return 0.0


@custom_transformer
def stopword_ratio(text: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(text.apply(lambda row: stopword_ratio_calc(row)))


def uppercase_ratio_calc(text: str) -> (float, float):
    tokens = word_tokenize(text)
    word_qty = len(tokens)
    word_uppercase_count = sum([1 if word[0].isupper() else 0 for word in tokens])
    char_qty = len(text)
    total_uppercase_count = len(re.findall(r'[A-Z]', text))
    try:
        word_ratio = word_uppercase_count / word_qty
        char_ratio = total_uppercase_count / char_qty
    except ZeroDivisionError:
        word_ratio = char_ratio = 0.0
    return word_ratio, char_ratio


@custom_transformer
def uppercase_ratio(text: pd.Series) -> pd.DataFrame:
    """ Given a word tokenized text it returns the ratio of words that begin with uppercase
        and the ratio of uppercase letters"""
    return text.to_frame().apply(lambda row: uppercase_ratio_calc(row[0]), result_type='expand', axis=1)


def punctuation_count_calc(text: str) -> pd.DataFrame:
    punct_counter = defaultdict(int)
    for i, punctuation in enumerate(PUNCTUATION_SET):
        punct_counter['key'+str(i)] = len(re.findall('[{}]'.format(punctuation), text))
    return pd.DataFrame(punct_counter, index=[0]).iloc[0]


@custom_transformer
def punctuation_count(text: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(text.apply(lambda row: punctuation_count_calc(row)))


def qwords_count_calc(text: str) -> pd.DataFrame:
    text = text.lower()
    qword_counter = defaultdict(int)
    for i, question in enumerate(QUESTION_WORDS):
        qword_counter['key'+str(i)] = len(re.findall('[{}]'.format(question), text))
    return pd.DataFrame(qword_counter, index=[0]).iloc[0]


@custom_transformer
def qwords_count(text: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(text.apply(lambda row: qwords_count_calc(row)))


@custom_transformer
def number_count(text: pd.Series) -> pd.DataFrame:
    return pd.DataFrame(text.apply(lambda row: len(re.findall(r'[0-9]', row))))

Dataset Cleanup

In [7]:
for col in free_text_columns:
    train_dset[['clean_'+col, 'postags_'+col, 'tokens_'+col]] = preprocess_text(train_dset[col])
    test_dset[['clean_'+col, 'postags_'+col, 'tokens_'+col]] = preprocess_text(test_dset[col])

train_dset['qt_qb'] = train_dset['question_title'] + '. ' + train_dset['question_body']
train_dset['qb_a'] = train_dset['question_body'] + '. ' + train_dset['answer']
train_dset['clean_qt_qb'] = train_dset['clean_question_title'] + '. ' + train_dset['clean_question_body']
train_dset['clean_qb_a'] = train_dset['clean_question_body'] + '. ' + train_dset['clean_answer']

test_dset['qt_qb'] = test_dset['question_title'] + '. ' + test_dset['question_body']
test_dset['qb_a'] = test_dset['question_body'] + '. ' + test_dset['answer']
test_dset['clean_qt_qb'] = test_dset['clean_question_title'] + '. ' + test_dset['clean_question_body']
test_dset['clean_qb_a'] = test_dset['clean_question_body'] + '. ' + test_dset['clean_answer']

Features creation

In [8]:
from sklearn.compose import ColumnTransformer


text_encoder = tfidf_pipeline()
ohe = oh_encoder()

preprocess = ColumnTransformer([
    ('qt_encoded', text_encoder, 'question_title'),
    ('qb_encoded', text_encoder, 'question_body'),
    ('qtb_encoded', text_encoder, 'qt_qb'),
    ('a_encoded', text_encoder, 'answer'),
    ('qa_encoded', text_encoder, 'qt_qb'),

    ('clean_qt_encoded', text_encoder, 'clean_question_title'),
    ('clean_qb_encoded', text_encoder, 'clean_question_body'),
    ('clean_qtb_encoded', text_encoder, 'clean_qt_qb'),
    ('clean_a_encoded', text_encoder, 'clean_answer'),
    ('clean_qa_encoded', text_encoder, 'clean_qt_qb'),

    ('qt_char_count', char_count, 'question_title'),
    ('qb_char_count', char_count, 'question_body'),
    ('a_char_count', char_count, 'answer'),
    ('clean_qt_char_count', char_count, 'clean_question_title'),
    ('clean_qb_char_count', char_count, 'clean_question_body'),
    ('clean_a_char_count', char_count, 'clean_answer'),

    ('qt_word_count', word_count, 'tokens_question_title'),
    ('qb_word_count', word_count, 'tokens_question_body'),
    ('a_word_count', word_count, 'tokens_answer'),

    ('qt_unique_word_count', unique_word_count, 'clean_question_title'),
    ('qb_unique_word_count', unique_word_count, 'clean_question_body'),
    ('a_unique_word_count', unique_word_count, 'clean_answer'),

    ('qt_sentence_count', sentence_count, 'clean_question_title'),
    ('qb_sentence_count', sentence_count, 'clean_question_body'),
    ('a_sentence_count', sentence_count, 'clean_answer'),

    ('qt_postag_ratio', postag_ratio, 'postags_question_title'),
    ('qb_postag_ratio', postag_ratio, 'postags_question_body'),
    ('a_postag_ratio', postag_ratio, 'postags_answer'),

    ('qtb_match_ratio', matching_words, ['clean_question_title', 'clean_question_body']),
    ('qta_match_ratio', matching_words, ['clean_question_title', 'clean_answer']),
    ('qba_match_ratio', matching_words, ['clean_question_body', 'clean_answer']),

    ('qt_stopword_ratio', stopword_ratio, 'tokens_question_title'),
    ('qb_stopword_ratio', stopword_ratio, 'tokens_question_body'),
    ('a_stopword_ratio', stopword_ratio, 'tokens_answer'),

    ('qt_uppercase_ratio', uppercase_ratio, 'question_title'),
    ('qb_uppercase_ratio', uppercase_ratio, 'question_body'),
    ('a_uppercase_ratio', uppercase_ratio, 'answer'),

    ('qt_punctuation_count', punctuation_count, 'question_title'),
    ('qb_punctuation_count', punctuation_count, 'question_body'),
    ('a_punctuation_count', punctuation_count, 'answer'),

    ('qt_qwords_count', qwords_count, 'question_title'),
    ('qb_qwords_count', qwords_count, 'question_body'),
    ('a_qwords_count', qwords_count, 'answer'),

    ('qt_number_count', number_count, 'question_title'),
    ('qb_number_count', number_count, 'question_body'),
    ('a_number_count', number_count, 'answer'),

    ('host_ohe', ohe, 'host'),
    ('category_ohe', ohe, 'category')
    ], verbose=True)

In [9]:
x_train = preprocess.fit_transform(train_dset)
y_train = y_train.values
x_test = preprocess.transform(test_dset)

[Pipeline] ....... (step 1 of 2) Processing Text-TF-IDF, total=   0.5s
[Pipeline] .......... (step 2 of 2) Processing Text-SVD, total=   2.4s
[ColumnTransformer] ... (1 of 48) Processing qt_encoded, total=   2.9s
[Pipeline] ....... (step 1 of 2) Processing Text-TF-IDF, total=   7.3s
[Pipeline] .......... (step 2 of 2) Processing Text-SVD, total=  31.7s
[ColumnTransformer] ... (2 of 48) Processing qb_encoded, total=  38.9s
[Pipeline] ....... (step 1 of 2) Processing Text-TF-IDF, total=   7.9s
[Pipeline] .......... (step 2 of 2) Processing Text-SVD, total=  31.9s
[ColumnTransformer] .. (3 of 48) Processing qtb_encoded, total=  39.8s
[Pipeline] ....... (step 1 of 2) Processing Text-TF-IDF, total=  10.4s
[Pipeline] .......... (step 2 of 2) Processing Text-SVD, total=  10.8s
[ColumnTransformer] .... (4 of 48) Processing a_encoded, total=  21.1s
[Pipeline] ....... (step 1 of 2) Processing Text-TF-IDF, total=   7.8s
[Pipeline] .......... (step 2 of 2) Processing Text-SVD, total=  32.3s
[Colum

In [10]:
from sklearn.preprocessing import StandardScaler, Normalizer

whitening_preprocess = Pipeline([
                                ('Standarization', StandardScaler()),
                                ('Normalizer', Normalizer()),
                        ], verbose=True)

x_train = whitening_preprocess.fit_transform(x_train)
x_test = whitening_preprocess.transform(x_test)

[Pipeline] .... (step 1 of 2) Processing Standarization, total=   0.3s
[Pipeline] ........ (step 2 of 2) Processing Normalizer, total=   0.1s


### NN Model definition

In [11]:
import torch.nn as nn

from torch.nn import Sequential
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn.utils.weight_norm import weight_norm

from torch.nn import BCELoss
from torch.optim import Adam


class PyTorch:
    
    def __init__(self, in_features, out_features, n_epochs, patience):
        self.in_features = in_features
        self.out_features = out_features
        self.n_epochs = n_epochs
        self.patience = patience
    
    
    def init_model(self):
        
        # define a model
        self.model = Sequential(
            weight_norm(Linear(self.in_features, 128)),
            ReLU(),
            weight_norm(Linear(128, 128)),
            ReLU(),
            weight_norm(Linear(128, self.out_features)),
            Sigmoid()
        )
        
        # initialize model
        for t in self.model:
            if isinstance(t, Linear):
                nn.init.kaiming_normal_(t.weight_v)
                nn.init.kaiming_normal_(t.weight_g)
                nn.init.constant_(t.bias, 0)
        
        # define loss function
        self.loss_func = BCELoss()
        
        # define optimizer
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)
    
    
    def fit(self, x_train, y_train, x_valid, y_valid):
        
        validate = (x_valid is not None) & (y_valid is not None)
        
        self.init_model()
        
        x_train_tensor = torch.as_tensor(x_train, dtype=torch.float32)
        y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32)
        
        if validate:
            x_valid_tensor = torch.as_tensor(x_valid, dtype=torch.float32)
            y_valid_tensor = torch.as_tensor(y_valid, dtype=torch.float32)
        
        min_loss = np.inf
        max_spear = 0
        counter = 0
        
        for epoch in range(self.n_epochs):
            
            self.model.train()
            y_pred = self.model(x_train_tensor)
            loss = self.loss_func(y_pred, y_train_tensor)
            
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
            
            current_loss = loss.item()

            oof_part = self.predict(x_valid)
    
            spear, rho_cols = compute_spearmanr(oof_part, y_valid)
            print(f'epoch = {epoch}, train_loss = {current_loss}, valid_spearman = {spear}')
            
            # early stopping
            # if current_loss < min_loss:
            if spear >= max_spear:
                # min_loss = current_loss
                max_spear = spear
                counter = 0
            else:
                counter += 1
                # print('Early stopping: %i / %i' % (counter, self.patience))
                if counter >= self.patience:
                    # print('Early stopping at epoch', epoch + 1)
                    break
        return estimator, rho_cols 
    
    def predict(self, x):
        x_tensor = torch.as_tensor(x, dtype=torch.float32)
        self.model.eval()
        with torch.no_grad():
            return self.model(x_tensor).numpy()

In [12]:
from scipy.stats import spearmanr, rankdata


def compute_spearmanr(preds, trues):
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        rhos.append(
            spearmanr(col_trues, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation)
    return np.nanmean(rhos), rhos


### Model training

In [13]:
from sklearn.model_selection import GroupKFold
import math


n_splits = 5
scores = []

gkf = GroupKFold(n_splits=n_splits).split(X=train_dset.question_body, groups=train_dset.question_body)

pytorch_params = {
    'in_features': x_train.shape[1],
    'out_features': y_train.shape[1],
    'n_epochs': 2500,
    'patience': 2
}

trained_estimators = []

for fold, (train_idx, valid_idx) in enumerate(gkf):
    x_train_train = x_train[train_idx]
    y_train_train = y_train[train_idx]
    x_train_valid = x_train[valid_idx]
    y_train_valid = y_train[valid_idx]

    print(f'fold = {fold}')
    estimator = PyTorch(**pytorch_params)
    estimator, rho_cols = estimator.fit(x_train_train, y_train_train, x_train_valid, y_train_valid)

    trained_estimators.append(estimator)
    rho_print = [print(target_columns[i] + " rho: " + str(rho_cols[i]) ) for i in range(0, len(target_columns))]

y_pred = []
for estimator in trained_estimators:
    y_pred.append(estimator.predict(x_test))

fold = 0
epoch = 0, train_loss = 0.6956472396850586, valid_spearman = 0.012171345817698048
epoch = 1, train_loss = 0.6926380395889282, valid_spearman = 0.021133478186705953
epoch = 2, train_loss = 0.6897417902946472, valid_spearman = 0.02798886970732094
epoch = 3, train_loss = 0.6869376301765442, valid_spearman = 0.033754439036839966
epoch = 4, train_loss = 0.6841873526573181, valid_spearman = 0.03858777260272057
epoch = 5, train_loss = 0.6814516186714172, valid_spearman = 0.042014357280886414
epoch = 6, train_loss = 0.678670346736908, valid_spearman = 0.04462086615486972
epoch = 7, train_loss = 0.6758150458335876, valid_spearman = 0.047135798605950456
epoch = 8, train_loss = 0.6728708744049072, valid_spearman = 0.04920068295520463
epoch = 9, train_loss = 0.6697971224784851, valid_spearman = 0.05154788223960116
epoch = 10, train_loss = 0.6666020154953003, valid_spearman = 0.05390074758550809
epoch = 11, train_loss = 0.6632542014122009, valid_spearman = 0.056534353193097776
epoch = 12, 

### Submission

In [14]:
submission = pd.read_csv("../input/google-quest-challenge/sample_submission.csv", index_col='qa_id')

out = pd.DataFrame(index=submission.index)

for column_idx,column in enumerate(target_columns):
    column_data = pd.DataFrame(index=submission.index)
    for prediction_idx,prediction in enumerate(y_pred):
        column_data[str(prediction_idx)] = prediction[:, column_idx]
    
    out[column] = np.average(column_data, axis=1)

out.to_csv("submission.csv")