In [None]:
import random
import re
import typing as t
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import typing_extensions as t_ext
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.utils.validation import check_is_fitted
from tqdm.notebook import tqdm

In [None]:
# Register tqdm's pandas integration (progress_apply etc.); the visible cells
# do not call it, so this is presumably kept for interactive use.
tqdm.pandas()
# Download the NLTK resources used below: presumably 'punkt' backs
# nltk.tokenize.word_tokenize, 'wordnet' backs WordNetLemmatizer and
# 'stopwords' backs stopwords.words('english').
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' - confirm
# against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
def seed_everything(seed: int) -> None:
    """Seed the global RNGs of ``random`` and ``numpy`` with the same value.

    Args:
        seed: Integer seed passed to ``random.seed`` and ``np.random.seed``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Fix the seed once so that sampling/shuffling further down is repeatable.
seed_everything(42)

In [None]:
# --- Run switches -----------------------------------------------------------
IS_KAGGLE = False        # selects the Kaggle paths instead of the local ones
RUN_VALIDATION = True    # gates the validation cell
RUN_SUBMISSION = True    # gates the submission cell

# --- Data locations ---------------------------------------------------------
# NOTE(review): the non-Kaggle branch uses absolute paths of one machine.
if IS_KAGGLE:
    TRAIN_DATASET_ROOT = Path('/kaggle/input/jt-combined')
    TEST_DATASET_ROOT = Path('/kaggle/input/jigsaw-toxic-severity-rating')
    SUBMISSION_CSV_PATH = Path('/kaggle/working/submission.csv')
else:
    TRAIN_DATASET_ROOT = Path('/home/jovyan/jigsaw-toxic/data/datasets/combined')
    TEST_DATASET_ROOT = Path('/home/jovyan/jigsaw-toxic/data/jigsaw-toxic-severity-rating')
    SUBMISSION_CSV_PATH = Path('submission.csv')

TRAIN_CSV_PATH = TRAIN_DATASET_ROOT / 'train_comment_classification_challenge_2017.csv'
VALID_CSV_PATH = TRAIN_DATASET_ROOT / 'valid.csv'
INFER_CSV_PATH = TEST_DATASET_ROOT / 'comments_to_score.csv'

# --- Target construction ----------------------------------------------------
# Per-label weights that build_y() combines into the regression target ``y``.
WEIGHT_DICT = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

# --- Subsampling ------------------------------------------------------------
# Fraction of positive rows kept, and the negatives-per-positive multiplier,
# both passed to subsample().
POSITIVE_FRAC = 1.0
NEGATIVE_FRAC_SCALER = 1.2

In [None]:
# Shared tokenizer for the cleaners and the token-based features; the cast only
# narrows the static type of nltk's word_tokenize, it does not wrap it.
tokenizer = t.cast(t.Callable[[str], t.List[str]], nltk.tokenize.word_tokenize)
# Kept as a set: NumStopWordsFeature declares ``t.Set[str]`` and tests one
# membership per token, which would be a linear scan on the raw list.
stop_words = set(stopwords.words('english'))

In [None]:
def build_y(df: pd.DataFrame, weight_dict: t.Dict[str, float]) -> pd.DataFrame:
    """Return a copy of ``df`` with a max-normalised weighted target column ``y``.

    ``y`` is the sum of ``df[tag] * weight`` over ``weight_dict``, divided by its
    maximum so that the largest value becomes 1.

    Args:
        df: Frame containing one column per key of ``weight_dict``.
        weight_dict: Mapping from label column name to its weight.

    Returns:
        A new frame; the input frame is left untouched.

    Raises:
        KeyError: If a key of ``weight_dict`` is not a column of ``df``.
    """
    df = df.copy()
    df['y'] = sum(df[tag] * weight for tag, weight in weight_dict.items())
    max_score = df['y'].max()
    # An all-zero target (or an empty weight_dict) has max 0; dividing would
    # turn every entry into NaN, so normalisation is skipped in that case.
    if max_score != 0:
        df['y'] /= max_score  # type: ignore
    return df

In [None]:
def subsample(df: pd.DataFrame, positive_frac: float, negative_frac_scaler: float) -> pd.DataFrame:
    """Subsample positives and negatives of ``df`` and return them shuffled.

    Rows with ``y > 0`` are sampled with ``frac=positive_frac``. Rows with
    ``y == 0`` are sampled to ``int(n_positive * positive_frac *
    negative_frac_scaler)`` rows, where ``n_positive`` is the number of all
    positive rows, capped at the number of negatives actually available.

    Sampling uses pandas' default random state, so results depend on the global
    numpy seed.

    Args:
        df: Frame with a numeric ``y`` column.
        positive_frac: Fraction of positive rows to keep.
        negative_frac_scaler: Negatives requested per kept positive.

    Returns:
        The concatenated sample in random row order.
    """
    positive_df = df[df.y > 0]
    negative_df = df[df.y == 0]
    # Without the cap, DataFrame.sample raises ValueError as soon as more
    # negatives are requested than exist (sampling is without replacement).
    negative_count = min(
        int(len(positive_df) * positive_frac * negative_frac_scaler),
        len(negative_df))
    return pd.concat([
        positive_df.sample(frac=positive_frac),
        negative_df.sample(n=negative_count),
    ], axis=0).sample(frac=1.0)

In [None]:
def clean_url(text: str) -> str:
    """Replace URLs with the token "url" and strip the "#" from hashtags.

    A URL is any whitespace-free run starting with ``www.``, ``http://`` or
    ``https://``. A ``#`` directly followed by non-space characters is dropped
    while the characters after it are kept.
    """
    # Raw strings: in a plain literal "\." and "\s" are invalid escape
    # sequences and trigger a warning on current Python versions.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text


def clean_abbrev(text: str) -> str:
    """Expand a fixed list of English contractions (input is matched as-is)."""
    # Applied top to bottom; order matters ("can't" before the generic "n't",
    # "'scuse" before the generic "'s").
    expansion_table = (
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
    )
    for pattern, expansion in expansion_table:
        text = re.sub(pattern, expansion, text)
    return text


def clean_unicode(text: str) -> str:
    """Blank out textual unicode escapes and every non-ASCII character.

    A literal backslash followed by "u" and hex digits is replaced by a single
    space; afterwards each character outside the 0x00-0x7f range is replaced by
    a space.
    """
    without_escapes = re.sub(r'(\\u[0-9A-Fa-f]+)', r' ', text)
    return re.sub(r'[^\x00-\x7f]', r' ', without_escapes)


def clean_repeat_pattern(text: str) -> str:
    """Shorten runs of a repeated letter and collapse runs of spaces.

    Three or more identical letters at the end of a word become two; four or
    more identical letters followed by a non-boundary position become three;
    two or more consecutive spaces become one.
    """
    substitutions = (
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def clean_at_user(text: str) -> str:
    """Replace every "@" plus the following non-space run with "atUser"."""
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'@[^\s]+', 'atUser', text)


def clean_multi_toxic_words(text: str) -> str:
    """Normalise obfuscated or concatenated spellings of a few toxic words.

    The substitutions are applied in order to the text as given.
    """
    # NOTE(review): inside a character class "|" is a literal pipe, so e.g.
    # "[u|*]" also matches "|" - confirm whether that is intended.
    # NOTE(review): the "shit" pattern allows spaces between letters and can
    # therefore match across word boundaries (e.g. "...s hit") - confirm.
    rewrite_rules = (
        (r'(fuckfuck)', 'fuck fuck '),
        (r'(f+)( *)([u|*]+)( *)([c|*]+)( *)(k)+', 'fuck'),
        (r'(haha)', 'ha ha '),
        (r'(s+ *h+ *i+ *t+)', 'shit'),
        (r'([a|@][$|s][s|$])', 'ass'),
        (r'(\bfuk\b)', 'fuck'),
    )
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)
    return text


def clean_numbers(text: str) -> str:
    """Replace standalone digit runs with a single space.

    A digit run is replaced only when it starts the text or follows a non-word
    character; that preceding character is consumed by the replacement too.
    Digits glued to a word (e.g. "a1") are left alone.
    """
    return re.sub(r"(^|\W)\d+", " ", text)


def clean_multi_punc(text: str) -> str:
    """Space out "!", "?" and "'" and collapse repeated "*", "_" and ":".

    Runs of one of "!", "?", "'" are first reduced to a padded pair; then every
    remaining such character is padded with spaces individually. Finally runs of
    "*", "_" or ":" are reduced to a single character.
    """
    punctuation_rules = (
        (r'([!?\'])\1+', r' \1\1 '),
        (r'([!?\'])', r' \1 '),
        (r'([*_:])\1+', r'\1'),
    )
    for pattern, replacement in punctuation_rules:
        text = re.sub(pattern, replacement, text)
    return text


class Lemmatizer(t_ext.Protocol):
    """Structural type for objects exposing a ``lemmatize`` method.

    LemmatizeCleaner accepts any object matching this protocol; the notebook
    passes an nltk ``WordNetLemmatizer``.
    """

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Return the lemma of ``word``.

        ``pos`` defaults to ``"n"`` (presumably a part-of-speech tag - confirm
        against the concrete lemmatizer).
        """
        ...


class ReplaceTokenCleaner:
    """Cleaner that replaces every occurrence of the given substrings."""

    def __init__(self, token_set: t.Set[str], replace_with: str):
        """Store the substrings to replace and their common replacement."""
        self._token_set = token_set
        self._replace_with = replace_with

    def __call__(self, text: str) -> str:
        """Return ``text`` with each stored substring replaced."""
        cleaned = text
        replacement = self._replace_with
        for needle in self._token_set:
            cleaned = cleaned.replace(needle, replacement)
        return cleaned


class RemoveStopWordsCleaner:
    """Cleaner that drops stop-word tokens and re-joins the rest with spaces."""

    def __init__(self, tokenizer: t.Callable[[str], t.List[str]], stop_words: t.Optional[t.List[str]] = None):
        """
        Args:
            tokenizer: Callable splitting a text into tokens.
            stop_words: Tokens to drop; when ``None`` the nltk English
                stop-word list is used.
        """
        self._tokenizer = tokenizer
        if stop_words is None:
            stop_words = stopwords.words('english')
        # Frozen set: one membership test runs per token, and a list would make
        # each of them a linear scan.
        self._stop_words = frozenset(stop_words)

    def __call__(self, text: str) -> str:
        """Return ``text`` without stop words (comparison is case-sensitive)."""
        return ' '.join(token for token in self._tokenizer(text) if token not in self._stop_words)


class LemmatizeCleaner:
    """Cleaner that lemmatizes every token and re-joins them with spaces."""

    def __init__(self, tokenizer: t.Callable[[str], t.List[str]], lemmatizer: Lemmatizer):
        """Store the tokenizer and the lemmatizer applied to each token."""
        self._tokenizer = tokenizer
        self._lemmatizer = lemmatizer

    def __call__(self, text: str) -> str:
        """Tokenize ``text``, lemmatize each token with the default ``pos``, join."""
        lemma_list = []
        for token in self._tokenizer(text):
            lemma_list.append(self._lemmatizer.lemmatize(token))
        return ' '.join(lemma_list)


class TextCleanerList:
    """Composite cleaner that applies a list of cleaners in order."""

    def __init__(self, cleaner_list: t.List[t.Callable[[str], str]]):
        """Store the cleaners; each one receives the previous one's output."""
        self._cleaner_list = cleaner_list

    def __call__(self, text: str) -> str:
        """Run ``text`` through every cleaner and return the final result."""
        current = text
        for step in self._cleaner_list:
            current = step(current)
        return current

In [None]:
class _HandCraftedFeature:
    """Base class for a scalar feature computed from a single text.

    Concrete features in this notebook assign ``name`` and override
    ``__call__``.
    """
    # Identifier of the feature; only declared here, assigned by subclasses.
    name: str

    def __call__(self, text: str) -> float:
        """Return the feature value for ``text``; must be overridden."""
        raise NotImplementedError()


class _TokenBasedHandCraftedFeature(_HandCraftedFeature):
    """Base class for features that are computed from the tokens of a text."""

    def __init__(self, tokenizer: t.Callable[[str], t.List[str]]):
        """Store the callable used to split a text into tokens."""
        self._tokenizer = tokenizer

    def _tokenize(self, text: str) -> t.List[str]:
        """Return the tokens of ``text`` as produced by the stored tokenizer."""
        return self._tokenizer(text)


class CharLenFeature(_HandCraftedFeature):
    """Feature: number of characters in the text."""
    name: str = 'char_len'

    def __call__(self, text: str) -> float:
        """Return ``len(text)`` (an int, despite the float annotation)."""
        return len(text)


class TokenLenFeature(_TokenBasedHandCraftedFeature):
    """Feature: number of tokens produced by the tokenizer."""
    name: str = 'token_len'

    def __call__(self, text: str) -> float:
        """Return the token count of ``text`` (an int, despite the float annotation)."""
        return len(self._tokenize(text))


class AvgTokenLenFeature(_TokenBasedHandCraftedFeature):
    """Feature: mean character length of the tokens of a text."""
    name: str = 'avg_token_len'

    def __call__(self, text: str) -> float:
        """Return the mean token length, or 0.0 when there are no tokens."""
        token_list = self._tokenize(text)
        # np.mean([]) would return NaN (with a RuntimeWarning) and put a
        # non-finite value into the feature matrix.
        if not token_list:
            return 0.0
        return float(np.mean([len(token) for token in token_list]))


class NumStopWordsFeature(_TokenBasedHandCraftedFeature):
    """Feature: number of tokens whose lower-cased form is a stop word."""
    name: str = 'num_stop_words'

    def __init__(self, tokenizer: t.Callable[[str], t.List[str]], stop_words: t.Set[str]):
        """
        Args:
            tokenizer: Callable splitting a text into tokens.
            stop_words: Collection of stop words to count.
        """
        super().__init__(tokenizer)
        # The notebook passes a list; normalise to a frozenset so each
        # per-token membership test is a hash lookup rather than a scan.
        self._stop_words = frozenset(stop_words)

    def __call__(self, text: str) -> float:
        """Return how many tokens of ``text`` are stop words."""
        return sum(1 for token in self._tokenize(text) if token.lower() in self._stop_words)


class NumWebsiteLinksFeature(_HandCraftedFeature):
    """Feature: number of link-like substrings in the text."""
    name: str = 'num_website_links'
    # A link is a non-space run starting with http://, https:// or www.
    _RE_WEBSITE_LINK = re.compile(r'https?://\S+|www\.\S+')

    def __call__(self, text: str) -> float:
        """Return the number of non-overlapping link matches in ``text``."""
        return sum(1 for _ in self._RE_WEBSITE_LINK.finditer(text))


class NumEmojiFeature(_HandCraftedFeature):
    """Feature: number of runs of characters from the emoji-oriented ranges below.

    The pattern ends in ``+``, so a contiguous run of matching characters
    counts once rather than once per character.
    """
    name: str = 'num_emoji'
    # NOTE(review): the last range runs from U+24C2 up to U+1F251, so it also
    # covers every code point in between, not only emoji - confirm that this
    # breadth is intended.
    _RE_EMOJI = re.compile('['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+', flags=re.UNICODE)

    def __call__(self, text: str) -> float:
        """Return the number of non-overlapping pattern matches in ``text``."""
        return len(self._RE_EMOJI.findall(text))


class NumSpecialCharsFeature(_HandCraftedFeature):
    """Feature: number of characters that are neither ASCII letters nor digits."""
    name: str = 'num_special_chars'
    # Whitespace is not a letter or digit, so it is counted as well.
    _RE_SPECIAL_CHARS = re.compile(r'[^a-zA-Z\d]')

    def __call__(self, text: str) -> float:
        """Return the number of matching characters in ``text``."""
        return sum(1 for _ in self._RE_SPECIAL_CHARS.finditer(text))


class NumExtraSpacesFeature(_HandCraftedFeature):
    """Feature: number of runs of one or more space characters in the text."""
    name: str = 'num_extra_spaces'
    # NOTE(review): ' +' also matches a single space, so every gap between
    # words is counted; if only surplus spaces are meant, ' {2,}' would be the
    # pattern - confirm the intent before changing the feature.
    _RE_EXTRA_SPACES = re.compile(r' +')

    def __call__(self, text: str) -> float:
        """Return the number of space runs in ``text``."""
        return sum(1 for _ in self._RE_EXTRA_SPACES.finditer(text))


class UpperCaseCharRatioFeature(_HandCraftedFeature):
    """Feature: share of characters that are upper case."""
    name: str = 'upper_case_char_ratio'

    def __call__(self, text: str) -> float:
        """Return ``upper-case chars / all chars``, or 0.0 for empty text."""
        # Coerce once so numerator and denominator refer to the same string.
        text = str(text)
        if not text:
            # Avoid ZeroDivisionError on an empty text.
            return 0.0
        return sum(1 for c in text if c.isupper()) / len(text)


class LowerCaseCharRatioFeature(_HandCraftedFeature):
    """Feature: share of characters that are lower case."""
    name: str = 'lower_case_char_ratio'

    def __call__(self, text: str) -> float:
        """Return ``lower-case chars / all chars``, or 0.0 for empty text."""
        # Coerce once so numerator and denominator refer to the same string.
        text = str(text)
        if not text:
            # Avoid ZeroDivisionError on an empty text.
            return 0.0
        return sum(1 for c in text if c.islower()) / len(text)


class UpperCaseTokenRatioFeature(_TokenBasedHandCraftedFeature):
    """Feature: share of tokens for which ``str.isupper()`` is true."""
    name: str = 'upper_case_token_ratio'

    def __call__(self, text: str) -> float:
        """Return ``upper-case tokens / all tokens``, or 0.0 without tokens."""
        token_list = self._tokenize(text)
        if not token_list:
            # A text with no tokens would otherwise raise ZeroDivisionError.
            return 0.0
        return sum(1 for token in token_list if token.isupper()) / len(token_list)


class LowerCaseTokenRatioFeature(_TokenBasedHandCraftedFeature):
    """Feature: share of tokens for which ``str.islower()`` is true."""
    name: str = 'lower_case_token_ratio'

    def __call__(self, text: str) -> float:
        """Return ``lower-case tokens / all tokens``, or 0.0 without tokens."""
        token_list = self._tokenize(text)
        if not token_list:
            # A text with no tokens would otherwise raise ZeroDivisionError.
            return 0.0
        return sum(1 for token in token_list if token.islower()) / len(token_list)


class HandCraftedFeatureList:
    """Evaluates a fixed list of features on a text and returns them as a vector."""

    def __init__(self, feature_list: t.List[_HandCraftedFeature]):
        """Store the features; their order defines the order of the output."""
        self._feature_list = feature_list

    def __call__(self, text: str) -> np.ndarray:
        """Return one value per stored feature, computed on ``text``."""
        value_list = []
        for feature in self._feature_list:
            value_list.append(feature(text))
        return np.array(value_list)

In [None]:
def join_features_to_sparse(array_list: t.List[t.Union[sparse.spmatrix, np.ndarray]]) -> sparse.spmatrix:
    """Concatenate feature blocks column-wise into one sparse matrix.

    Dense ``np.ndarray`` blocks are converted to CSR first. A single block is
    returned as-is (after conversion) without stacking.

    Raises:
        AssertionError: If ``array_list`` is empty.
    """
    assert len(array_list) > 0
    sparse_blocks = [
        sparse.csr_matrix(block) if isinstance(block, np.ndarray) else block
        for block in array_list
    ]
    if len(sparse_blocks) == 1:
        return sparse_blocks[0]
    return sparse.hstack(sparse_blocks)

In [None]:
# Type of the feature container a generator returns; the subclasses below bind
# it to sparse.spmatrix and np.ndarray.
_F = t.TypeVar('_F')

class _FeatureGenerator(t.Generic[_F]):
    """Base class for callables that turn a list of texts into features."""

    def __call__(self, text_list: t.List[str]) -> _F:
        """Return the features for ``text_list``; must be overridden."""
        raise NotImplementedError()


class TfidfFeatureGenerator(_FeatureGenerator[sparse.spmatrix]):
    """Feature generator backed by an already fitted ``TfidfVectorizer``."""

    def __init__(self, vectorizer: TfidfVectorizer) -> None:
        """Store ``vectorizer`` after verifying with sklearn that it is fitted."""
        # check_is_fitted raises here, at construction time, if the vectorizer
        # has not been fitted yet.
        check_is_fitted(vectorizer)
        self._vectorizer = vectorizer

    def __call__(self, text_list: t.List[str]) -> sparse.spmatrix:
        """Return the vectorizer's ``transform`` output for ``text_list``."""
        return self._vectorizer.transform(text_list)


class HandCraftedFeatureGenerator(_FeatureGenerator[np.ndarray]):
    """Feature generator that evaluates a HandCraftedFeatureList on each text."""

    def __init__(self, feature_list: HandCraftedFeatureList, show_progress: bool = False):
        """
        Args:
            feature_list: Callable mapping one text to a feature vector.
            show_progress: When true, iterate the texts through ``tqdm``.
        """
        self._feature_list = feature_list
        self._show_progress = show_progress

    def __call__(self, text_list: t.List[str]) -> np.ndarray:
        """Return the per-text feature vectors stacked along axis 0."""
        text_iter = tqdm(text_list) if self._show_progress else text_list
        row_list = [self._feature_list(text) for text in text_iter]
        return np.stack(row_list, axis=0)


In [None]:
# Cleaning pipeline applied before TF-IDF; the steps run in the listed order.
# Lower-casing comes first, so the later regex cleaners see lower-case text.
# Stop-word removal (RemoveStopWordsCleaner) is currently not part of it.
_PUNCTUATION_TO_BLANK = set('"%&\'()+,-./:;<=>@[\\]^_`{|}~')

text_cleaner = TextCleanerList([
    str.lower,
    clean_url,
    clean_unicode,
    clean_numbers,
    clean_abbrev,
    clean_multi_toxic_words,
    clean_multi_punc,
    clean_repeat_pattern,
    ReplaceTokenCleaner(token_set=_PUNCTUATION_TO_BLANK, replace_with=' '),
    LemmatizeCleaner(tokenizer=tokenizer, lemmatizer=WordNetLemmatizer()),
])

In [None]:
# Load the training set, derive the regression target and rebalance it.
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df = build_y(train_df, weight_dict=WEIGHT_DICT)
train_df = subsample(train_df, positive_frac=POSITIVE_FRAC, negative_frac_scaler=NEGATIVE_FRAC_SCALER)

# Read the text column directly: iterrows() builds a Series per row only to
# pull a single field out of it. str() keeps the original coercion of
# non-string cells (e.g. missing values).
train_text_list = [str(text) for text in train_df['comment_text']]
# Raw texts feed the hand-crafted features, cleaned texts feed TF-IDF.
train_cleaned_text_list = [text_cleaner(text) for text in tqdm(train_text_list)]

In [None]:
# TF-IDF over character n-grams (3 to 5 characters, within word boundaries),
# fitted on the cleaned training texts.
tfidf_feature_generator = TfidfFeatureGenerator(
    vectorizer=TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5)).fit(train_cleaned_text_list))
# Hand-crafted features; the list order defines the column order of the
# resulting array. These are later evaluated on the raw (uncleaned) texts.
hand_crafted_feature_generator = HandCraftedFeatureGenerator(
    feature_list=HandCraftedFeatureList([
        CharLenFeature(),
        TokenLenFeature(tokenizer=tokenizer),
        AvgTokenLenFeature(tokenizer=tokenizer),
        NumStopWordsFeature(tokenizer=tokenizer, stop_words=stop_words),
        NumWebsiteLinksFeature(),
        NumEmojiFeature(),
        NumSpecialCharsFeature(),
        NumExtraSpacesFeature(),
        UpperCaseCharRatioFeature(),
        LowerCaseCharRatioFeature(),
        UpperCaseTokenRatioFeature(tokenizer=tokenizer),
        LowerCaseTokenRatioFeature(tokenizer=tokenizer),
    ]),
    show_progress=True)

In [None]:
# Training matrix: TF-IDF columns (from cleaned text) followed by the
# hand-crafted columns (from raw text).
# TF-IDF-only alternative: x = tfidf_feature_generator(train_cleaned_text_list)
x = join_features_to_sparse([
    tfidf_feature_generator(train_cleaned_text_list),
    hand_crafted_feature_generator(train_text_list),
])
# Regression target built by build_y().
y = train_df['y'].to_numpy()

In [None]:
# Ridge regression on the combined feature matrix.
# NOTE(review): alpha=0.5 is hard-coded here rather than set in the config cell.
model = Ridge(alpha=0.5)
model.fit(x, y)

In [None]:
if RUN_VALIDATION:
    # Each validation row holds a pair of texts in 'more_toxic' / 'less_toxic'.
    valid_df = pd.read_csv(VALID_CSV_PATH)

    # Read the columns directly instead of walking the frame twice with
    # iterrows(); str() keeps the original coercion of non-string cells.
    valid_more_text_list = [str(text) for text in valid_df['more_toxic']]
    valid_less_text_list = [str(text) for text in valid_df['less_toxic']]
    valid_cleaned_more_text_list = [text_cleaner(text) for text in valid_more_text_list]
    valid_cleaned_less_text_list = [text_cleaner(text) for text in valid_less_text_list]

    # Same feature layout as in training: TF-IDF on cleaned text, hand-crafted
    # features on raw text.
    less_toxic_score_array = model.predict(join_features_to_sparse([
        tfidf_feature_generator(valid_cleaned_less_text_list),
        hand_crafted_feature_generator(valid_less_text_list),
    ]))
    more_toxic_score_array = model.predict(join_features_to_sparse([
        tfidf_feature_generator(valid_cleaned_more_text_list),
        hand_crafted_feature_generator(valid_more_text_list),
    ]))
    # Share of pairs ranked correctly; ties count as wrong (strict comparison).
    print(f'Validation accuracy: {np.mean(less_toxic_score_array < more_toxic_score_array)}')

In [None]:
if RUN_SUBMISSION:
    infer_df = pd.read_csv(INFER_CSV_PATH)

    # Read the columns directly instead of looping over iterrows(); str() keeps
    # the original coercion of ids and texts to strings.
    comment_id_list = [str(comment_id) for comment_id in infer_df['comment_id']]
    infer_text_list = [str(text) for text in infer_df['text']]
    infer_cleaned_text_list = [text_cleaner(text) for text in tqdm(infer_text_list)]

    # Same feature layout as in training: TF-IDF on cleaned text, hand-crafted
    # features on raw text.
    score_array = model.predict(join_features_to_sparse([
        tfidf_feature_generator(infer_cleaned_text_list),
        hand_crafted_feature_generator(infer_text_list),
    ]))

    # Column-wise construction; this also keeps the header row when the input
    # file has no rows.
    pd.DataFrame({
        'comment_id': comment_id_list,
        'score': score_array.tolist(),
    }).to_csv(SUBMISSION_CSV_PATH, index=False)

In [None]:
# Show the first 11 lines of the submission file (needs a shell with `head`).
# NOTE(review): the file is only written when RUN_SUBMISSION is true.
!head -n 11 $SUBMISSION_CSV_PATH