<a href="https://www.kaggle.com/code/akscent/feature-extraction-train?scriptVersionId=150693529" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install pymorphy2 cleantext -U pip setuptools wheel nlp_profiler textblob pymystem3 > installer_log.txt
!pip install spacy > installer_log.txt
import os
import sys
import torch
import json
import spacy
import io
import ru_core_news_md
import shap
shap.initjs()
import pandas as pd
import numpy as np

from numpy import asarray
from collections import Counter
from typing import Dict
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModel, MBartTokenizer, MBartForConditionalGeneration, BertTokenizer, BertForSequenceClassification
from textblob import TextBlob
from nlp_profiler.core import apply_text_profiling
from pymystem3 import Mystem
from nltk.corpus import stopwords
from catboost import CatBoostClassifier


# Data load

In [None]:
# Directory that holds the competition CSV files.
PATH = "/kaggle/input/ods-huawei/"
train_data, test_data = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)
# Encode the `rate` target as integer class ids.
le = LabelEncoder()
train_data["rate"] = le.fit_transform(train_data["rate"])
train_data.head()

# Clean text

In [None]:
# `re` and `pymorphy2` are used in this section but are missing from the
# notebook's import cell; import them here so the cell runs without NameError.
import re

import pymorphy2

# Russian stop words from the NLTK corpus.
ru_stopwords = stopwords.words('russian')
digits = [str(i) for i in range(10)]

# Runs of lowercase Cyrillic letters (including `ё`) and basic punctuation.
TOKEN_RE = re.compile(r'[а-яё!.,?%]+')
# Morphological analyser used to obtain normal forms of words.
lemmatizer = pymorphy2.MorphAnalyzer()

def is_valid_word(word):
    """Return the normal form of ``word`` if it should be kept, else ``False``.

    A word is rejected when it is empty, starts with a digit, or is a Russian
    stop word. The stop-word test is case-insensitive so that capitalised stop
    words (e.g. at the start of a sentence) are rejected as well.

    :param word: a single token
    :return: the first normal form reported by ``lemmatizer``, or ``False``
    """
    if not word or word[0].isdigit() or word.lower() in ru_stopwords:
        return False
    return lemmatizer.normal_forms(word)[0]

def text_cleaning(text):
    """Strip unwanted characters and words from ``text``.

    Keeps Latin/Cyrillic letters, digits, whitespace and ``.,!?``; collapses
    whitespace; then keeps, out of the first 512 words, those shorter than 15
    characters that ``is_valid_word`` accepts.

    :param text: raw text; non-string values (e.g. NaN) yield ``''``
    :return: the cleaned text with words joined by single spaces
    """
    if not isinstance(text, str):
        # Missing values have nothing to clean.
        return ''
    # `ё` / `Ё` lie outside the `а-я` / `А-Я` code-point ranges, so they must be
    # listed explicitly or they are silently removed from the words.
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # NOTE(review): is_valid_word returns the lemma, but only its truthiness is
    # used here, so words stay in their surface form - confirm this is intended.
    # The cheap length test runs first to skip the lemmatiser for long tokens.
    cleaned_words = [
        word for word in text.split()[:512]
        if len(word) < 15 and is_valid_word(word)
    ]
    return ' '.join(cleaned_words)

# Register `progress_apply` on pandas objects.
tqdm.pandas()
for frame in (train_data, test_data):
    # Clean the text column in place, then record how many words are left.
    frame['text'] = frame['text'].progress_apply(text_cleaning)
    frame["num_words"] = frame["text"].apply(lambda x: len(str(x).split()))

In [None]:
# Drop rows whose text has no words left after cleaning. `.copy()` makes each
# result an independent frame, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning.
train_data = train_data[train_data['num_words'] != 0].copy()
test_data = test_data[test_data['num_words'] != 0].copy()

In [None]:
# remove infrequent words

def remove_infrequent_words(dataset, min_count=3):
    """Drop words that occur fewer than ``min_count`` times across ``dataset``.

    :param dataset: sequence of strings; it is iterated twice (count, filter)
    :param min_count: minimum corpus-wide count a word needs to be kept
    :return: list of filtered strings in the same order as ``dataset``
    """
    word_counter = Counter()
    for text in dataset:
        word_counter.update(text.split())
    # A set gives O(1) membership tests; the previous list made the filtering
    # pass scale with (number of words) x (number of infrequent words).
    infrequent_words = {
        word for word, count in word_counter.items() if count < min_count
    }

    def remove_infrequent(text):
        return ' '.join(word for word in text.split() if word not in infrequent_words)

    return [remove_infrequent(text) for text in tqdm(dataset, desc="Cleaning text")]

# Store the filtered texts in a `cleaned_text` column. Rebinding `train_data` /
# `test_data` to the returned lists would throw away the DataFrames, while the
# following cells call DataFrame methods on them and read `cleaned_text`.
train_data['cleaned_text'] = remove_infrequent_words(train_data['text'].tolist())
test_data['cleaned_text'] = remove_infrequent_words(test_data['text'].tolist())


In [None]:
# replace nan

def replace_nan_with_text(row):
    """Return ``row['cleaned_text']``, falling back to ``row['text']`` if it is missing.

    A value counts as missing when it is NaN/None or a blank string. The blank
    case matters because a text whose words were all filtered out becomes
    ``''`` rather than NaN, so a NaN-only check would never fall back.

    :param row: mapping or Series with ``cleaned_text`` and ``text`` entries
    :return: the cleaned text, or the original text when the cleaned one is missing
    """
    cleaned = row['cleaned_text']
    if isinstance(cleaned, str):
        return cleaned if cleaned.strip() else row['text']
    if pd.isna(cleaned):
        return row['text']
    return cleaned

# Row-wise fill of `cleaned_text` for both data sets (uses the `progress_apply`
# registered by the earlier `tqdm.pandas()` call).
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame.progress_apply(replace_nan_with_text, axis=1)

In [None]:
def truncate_text(text, max_words=512):
    """Limit ``text`` to its first ``max_words`` whitespace-separated words.

    Texts that are short enough are returned unchanged; longer ones are rebuilt
    from the kept words joined by single spaces.
    """
    tokens = text.split()
    if len(tokens) <= max_words:
        return text
    return ' '.join(tokens[:max_words])

tqdm.pandas()
# Cap `cleaned_text` at the default word limit of truncate_text in both frames.
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['cleaned_text'].progress_apply(truncate_text)

# Text summarizer

In [None]:
# Idea: condense each text into a shorter summary.

# Checkpoint name passed to both `from_pretrained` calls below.
model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

def summary_rows(article_text):
    """Generate a summary of ``article_text`` with the module-level model.

    The text is tokenised as a batch of one, padded/truncated to 512 tokens,
    and the first generated sequence is decoded without special tokens.
    """
    encoded = tokenizer(
        [article_text],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        no_repeat_ngram_size=4,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def text_summary(text):
    """Summarise ``text`` when it is a string of more than 150 words.

    Anything else (non-strings, blank or shorter strings) is returned as is.
    """
    if not isinstance(text, str):
        return text
    if len(text.split()) <= 150:
        return text
    return summary_rows(text)
    

# Add a `summary` column derived from `cleaned_text` to both frames.
for frame in (train_data, test_data):
    frame['summary'] = frame['cleaned_text'].progress_apply(text_summary)

# Feature generation

## What industry?

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("apanc/russian-sensitive-topics")
tokenizer = AutoTokenizer.from_pretrained("apanc/russian-sensitive-topics")
# Truncation must be requested through the pipeline's tokenizer arguments.
# Setting `padding` / `truncation` / `max_length` attributes on the tokenizer
# object is not consulted when the pipeline tokenises, so over-long inputs
# were never truncated.
# Fall back to the CPU when no CUDA device is available instead of failing.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    max_length=512,
)

def make_pipe(text):
    """Run the current module-level ``pipe`` on ``text``, requesting scores for all labels."""
    all_scores = pipe(text, return_all_scores=True)
    return all_scores

tqdm.pandas()
# Raw pipeline output for every summary, kept in a temporary column.
theme_scores = train_data['summary'].progress_apply(make_pipe)
train_data['theme_labels'] = theme_scores

def extract_label_probs(row):
    """Collect the ``score`` of every label dict in the first element of ``row``."""
    scores = []
    for entry in row[0]:
        scores.append(entry['score'])
    return scores

# Expand the per-text score lists into one column per label (LABEL_0, LABEL_1, ...).
# Building the frame from a list of rows avoids `Series.apply(pd.Series)`,
# which constructs one Series per row and is very slow on large frames.
label_probs = train_data['theme_labels'].apply(extract_label_probs)
label_columns = pd.DataFrame(label_probs.tolist(), index=train_data.index).add_prefix('LABEL_')
train_data = pd.concat([train_data, label_columns], axis=1)

# The raw pipeline output is no longer needed.
del train_data['theme_labels']

## Text tone

In [None]:
# Sentiment (tone) of the text.
# Request truncation through the pipeline so over-long inputs do not fail, and
# fall back to the CPU when no CUDA device is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pipe = pipeline(
    model="seara/rubert-tiny2-russian-sentiment",
    device=device,
    truncation=True,
    max_length=512,
)

tqdm.pandas()
train_data['mood'] = train_data['summary'].progress_apply(make_pipe)

# One column per sentiment score (MOOD_0, MOOD_1, ...). Built from a list of
# rows instead of the slow per-row `apply(pd.Series)`.
mood_probs = train_data['mood'].apply(extract_label_probs)
mood_columns = pd.DataFrame(mood_probs.tolist(), index=train_data.index).add_prefix('MOOD_')
train_data = pd.concat([train_data, mood_columns], axis=1)

del train_data['mood']

## Text toxicity

In [None]:
# Toxicity of the text.

# Request truncation through the pipeline so over-long inputs do not fail, and
# fall back to the CPU when no CUDA device is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pipe = pipeline(
    model="SkolkovoInstitute/russian_toxicity_classifier",
    device=device,
    truncation=True,
    max_length=512,
)

tqdm.pandas()
train_data['toxic'] = train_data['summary'].progress_apply(make_pipe)

# One column per toxicity score (TOXIC_0, TOXIC_1, ...). Built from a list of
# rows instead of the slow per-row `apply(pd.Series)`.
toxic_probs = train_data['toxic'].apply(extract_label_probs)
toxic_columns = pd.DataFrame(toxic_probs.tolist(), index=train_data.index).add_prefix('TOXIC_')
train_data = pd.concat([train_data, toxic_columns], axis=1)

del train_data['toxic']

## Emotions

In [None]:
# Emotions

# Emotion names; predict_emotion / predict_emotions index this list by the
# position of the model output.
LABELS = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

@torch.no_grad()
def predict_emotion(text: str) -> str:
    """
        Return the single most probable emotion label for the given text.
        :param text: The text to be classified
        :type text: str
        :return: The entry of ``LABELS`` with the highest softmax probability
    """
    encoded = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    best_index = torch.argmax(probabilities, dim=1).numpy()[0]
    return LABELS[best_index]

@torch.no_grad()
def predict_emotions(text: str) -> Dict[str, float]:
    """
        Tokenize the text, run the model, and map every emotion in ``LABELS``
        to its softmax probability.
        :param text: The text you want to classify
        :type text: str
        :return: A dictionary of emotions and their probabilities.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    # Convert the first row of the batch to plain floats once, instead of
    # re-converting the whole tensor on every loop iteration.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[0].tolist()
    return dict(zip(LABELS, probabilities))

# NOTE: the emotion scores are parked in a temporary column that reuses the
# name `toxic`; the column is expanded and deleted again further down.
train_data['toxic'] = train_data['summary'].progress_apply(predict_emotions)

def extract_label_probs(row):
    """Return the values of ``row`` ordered like ``LABELS``; absent labels give 0.0."""
    ordered = []
    for label in LABELS:
        ordered.append(row.get(label, 0.0))
    return ordered

# Expand the per-text probability lists into EMOTION_0, EMOTION_1, ... columns.
# Building the frame from a list of rows avoids `Series.apply(pd.Series)`,
# which constructs one Series per row and is very slow on large frames.
emotion_probs = train_data['toxic'].apply(extract_label_probs)
emotion_columns = pd.DataFrame(emotion_probs.tolist(), index=train_data.index).add_prefix('EMOTION_')
train_data = pd.concat([train_data, emotion_columns], axis=1)

# Remove the temporary column that held the raw emotion dictionaries.
del train_data['toxic']

In [None]:
# Number of whitespace-separated words in each summary.
train_data["num_words_sum"] = train_data["summary"].map(lambda x: len(str(x).split()))

## Other nlp imports

In [None]:
# ! textblob - text processing and feature generation https://textblob.readthedocs.io/en/dev/quickstart.html - nothing interesting
# ! another text-classification library https://small-text.readthedocs.io/en/latest/ - not suitable? for small texts
# ! word polarity https://polyglot.readthedocs.io/en/latest/ - the same? as what was already obtained from the pretrained models
# ! feature processing https://github.com/jbesomi/texthero - poorly maintained
# feature generation https://github.com/neomatrix369/nlp_profiler#Notebooks
# classification with other pretrained models listed by Aleron https://github.com/a-milenkin/Competitive_Data_Science/blob/main/notebooks/9.2.1%20-%20Text_Embeddings.ipynb
# use these notebooks for classification https://github.com/e0xextazy/vkcup2022-first-stage/blob/main/inference.ipynb

# NLP profiler

In [None]:
# Profile the `text` column with nlp_profiler.
# NOTE(review): the result is kept in `profiled_text_dataframe` and is not
# merged into `train_data` anywhere in this file, although the feature-selection
# cell drops columns such as `sentiment_polarity` that presumably come from the
# profiler - confirm how the two frames are meant to be joined.
profiled_text_dataframe = apply_text_profiling(train_data, 'text')

# SpaCy

In [None]:
tqdm.pandas()
train_data["num_unique_words"] = train_data["text"].progress_apply(lambda x: len(set(str(x).split())))
# `stopwords` is the NLTK corpus reader, not a word list, so membership has to
# be tested against the Russian list loaded into `ru_stopwords`. A set keeps
# every lookup O(1).
ru_stopword_set = set(ru_stopwords)
train_data["num_stopwords"] = train_data["text"].progress_apply(
    lambda x: sum(w in ru_stopword_set for w in str(x).lower().split())
)

In [None]:
# Part-of-speech (POS) features

class TextPOSAnalysis:
    """Part-of-speech helpers built on the spaCy Russian model and Mystem."""

    def __init__(self):
        self.nlp_ru = ru_core_news_md.load()
        self.df_pos = self.load_pos_table()
        self.m = Mystem()

    def load_pos_table(self):
        """Return the Mystem-tag to Universal-POS mapping as a DataFrame.

        :return: frame with columns ``token`` (Mystem tag) and ``universal_pos``
        """
        table = """
        A       ADJ
        ADV     ADV
        ADVPRO  ADV
        ANUM    ADJ
        APRO    DET
        COM     ADJ
        CONJ    SCONJ
        INTJ    INTJ
        NONLEX  X
        NUM     NUM
        PART    PART
        PR      ADP
        S       NOUN
        SPRO    PRON
        UNKN    X
        V       VERB
        """
        # Plain whitespace splitting replaces read_csv with the non-raw
        # separator "\s+", which is an invalid escape sequence in a normal
        # string literal.
        rows = [line.split() for line in table.strip().splitlines()]
        return pd.DataFrame(rows, columns=["token", "universal_pos"])

    def get_universal_tag(self, word):
        """Return ``"<lemma>_<Mystem POS>"`` for the first token Mystem reports.

        When Mystem returns no token or no analysis for it (e.g. punctuation),
        the lower-cased, stripped input is tagged ``UNKN`` instead of raising.
        """
        tokens = self.m.analyze(word)
        analyses = tokens[0].get("analysis") if tokens else None
        if not analyses:
            return word.lower().strip() + '_UNKN'
        best = analyses[0]
        lemma = best["lex"].lower().strip()
        # The grammar string starts with the POS, followed by ',' or '='.
        pos = best["gr"].split(',')[0].split('=')[0].strip()
        return lemma + '_' + pos

    def add_tag(self, word):
        """Return ``"<lemma>_<tag>"`` with the Mystem tag mapped via ``df_pos``.

        Tags that are missing from the table are kept unchanged.
        """
        tagged = self.get_universal_tag(word)
        # Split on the last underscore so a lemma containing '_' stays intact.
        lemma, _, tag = tagged.rpartition('_')
        mapping = dict(zip(self.df_pos['token'], self.df_pos['universal_pos']))
        return lemma + '_' + mapping.get(tag, tag)

    def analyze_text(self, text):
        """Count adjectives, nouns, verbs and adverbs in ``text``.

        :return: tuple ``(num_adj, num_noun, num_verb, num_adv)``
        """
        # One pass over the tokens instead of one pass per POS.
        pos_counts = Counter(tok.pos_ for tok in self.nlp_ru(text))
        return pos_counts['ADJ'], pos_counts['NOUN'], pos_counts['VERB'], pos_counts['ADV']

    def analyze_texts(self, texts):
        """Run ``analyze_text`` on every text and return the counts as a DataFrame."""
        results = [self.analyze_text(text) for text in texts]
        # The column order must follow the tuple returned by analyze_text;
        # the previous labels put the noun/verb/adverb counts under wrong names.
        return pd.DataFrame(results, columns=["Num_ADJ", "Num_NOUN", "Num_VERB", "Num_ADV"])


text_POS = TextPOSAnalysis()
POS_results = text_POS.analyze_texts(train_data['text'])
# analyze_texts returns a fresh 0..n-1 index, while train_data keeps the index
# labels that survived the earlier row filtering. Align the two first:
# pd.concat(axis=1) matches rows by label and would otherwise misalign the
# counts and introduce NaN rows.
POS_results.index = train_data.index
train_data = pd.concat([train_data, POS_results], axis=1)


# Feature Selection

## Shap

In [None]:
# Shuffled 85/15 split, stratified on the `rate` target.
train_split, val_split = train_test_split(
    train_data,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=train_data['rate'],
)

# Columns removed from the feature matrices.
del_columns = [
    'text',
    'sentiment_polarity',
    'sentiment_polarity_summarised',
    'sentiment_subjectivity',
    'sentiment_subjectivity_summarised',
    'spelling_quality',
    'spelling_quality_summarised',
]

# Features: positional columns 5..440 without `del_columns`; target: `rate`.
train_X = train_split.iloc[:, 5:441].drop(columns=del_columns)
train_Y = train_split['rate']

val_X = val_split.iloc[:, 5:441].drop(columns=del_columns)
val_Y = val_split['rate']

In [None]:
# Classifier settings, collected in one place.
catboost_params = dict(
    random_seed=9,
    thread_count=-1,
    use_best_model=True,
    bootstrap_type='Bernoulli',
)
clf = CatBoostClassifier(**catboost_params)

# Fit on the training split and evaluate on the validation split.
clf.fit(
    train_X,
    train_Y,
    eval_set=(val_X, val_Y),
    verbose=100,
    plot=True,
    early_stopping_rounds=1000,
)

print(clf.get_best_score())

In [None]:
# First 100 rows of the prettified feature-importance table.
fi = clf.get_feature_importance(prettified=True).head(100)

In [None]:
# Restrict both feature matrices to the features listed in `fi`.
top_features = fi['Feature Id'].to_list()
train_X, val_X = train_X[top_features], val_X[top_features]

## Recursive feature selection with CatBoost

In [None]:
# Options for CatBoost's feature selection: choose 50 of the features at
# indices 0-99 in a single step, without training a final model.
selection_options = dict(
    eval_set=(val_X, val_Y),
    features_for_select='0-99',
    num_features_to_select=50,
    steps=1,
    train_final_model=False,
    logging_level='Silent',
)
summary = clf.select_features(train_X, train_Y, **selection_options)

# Save new_train

In [None]:
# Keep the two text columns next to the selected features.
summary['selected_features_names'] += ['text', 'summary']

In [None]:
# Write the selected columns of train_data to disk without the index.
export_columns = summary['selected_features_names']
new_train = train_data[export_columns]
new_train.to_csv("new_train.csv", index=False)