In [43]:
import pandas as pd
import string
import pickle
import numpy as np
import spacy
import warnings

from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm

tqdm.pandas()
warnings.filterwarnings("ignore")

In [44]:
# Load the paragraph table; later cells drop its `index` column and read its `text` column.
data = pd.read_csv("../data/paragraphs.csv")

In [45]:
# The `index` column stored in the CSV is not needed.
data = data.drop(columns="index")

In [46]:
# Keep an untouched copy of the text; `text` itself is overwritten with its processed form later.
data["original_text"] = data["text"]

In [47]:
# Restore the already-fitted TF-IDF vectorizer and the CatBoost classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts that come from a trusted source.
with open("../data/vectorizer096.pkl", "rb") as f:
    vectorizer: TfidfVectorizer = pickle.load(f)

with open("../data/model096.pkl", "rb") as f:
    model: CatBoostClassifier = pickle.load(f)

In [48]:
def process_text(text):
    """Lowercase ``text`` and keep only alphanumeric and whitespace characters.

    Leading and trailing whitespace is stripped from the result.
    """
    kept = filter(lambda ch: ch.isalnum() or ch.isspace(), text.lower())
    return "".join(kept).strip()

In [49]:
# Surface statistics measured on the raw text.
data["len"] = data["text"].apply(len)
data["punct_count"] = data["text"].apply(
    lambda raw: sum(1 for ch in raw if ch in string.punctuation)
)
data["numbers_count"] = data["text"].apply(
    lambda raw: sum(1 for ch in raw if ch.isdigit())
)
data["word_numbers_count"] = data["text"].apply(
    lambda raw: sum(1 for token in raw.split() if token.isdigit())
)

# Replace the text with its processed form, then record the new length.
data["text"] = data["text"].apply(process_text)
data["len_after_prc"] = data["text"].apply(len)

In [50]:
# Vectorise the processed text with the pre-fitted TF-IDF vocabulary.
data_processed_tfidf = vectorizer.transform(data["text"])

In [51]:
# Convert the TF-IDF matrix to a dense NumPy array so it can be concatenated
# with the hand-crafted feature columns further down.
data_processed_tfidf = data_processed_tfidf.toarray()

In [52]:
# Hand-crafted numeric features appended after the TF-IDF columns.
# NOTE(review): the order presumably has to match the feature order the model
# was trained with — confirm before reordering.
columns = [
    "len",
    "punct_count",
    "numbers_count",
    "word_numbers_count",
    # measured after process_text has been applied
    "len_after_prc",
]

In [53]:
# Model input: TF-IDF columns first, hand-crafted features after them.
test = np.hstack((data_processed_tfidf, data[columns].to_numpy()))

In [54]:
# Classify every row of the feature matrix with the loaded model.
labels = model.predict(test)

In [55]:
# Attach the predicted class to its paragraph.
data["label"] = labels

In [56]:
# Keep only the paragraphs the classifier labelled "other".
data = data[data["label"] == "other"]

In [57]:
# Only the untouched text is needed from here on.
data = data.loc[:, ["original_text"]]

In [58]:
# Flatten line breaks into spaces and lowercase the text.
data["original_text"] = data["original_text"].map(
    lambda paragraph: paragraph.replace("\n", " ").lower()
)

In [59]:
# Preview the paragraphs that survived the filter.
data

Unnamed: 0,original_text
0,every effort has been made to secure necessary...
1,the system of transliteration from cyrillic us...
3,the history of russia in the twentieth century...
4,great divide between soviet east and capitalis...
5,workers and a cultural straitjacket on the int...
...,...
1692,war and women’s position in soviet society wa...
1694,while the heroism of women such as the partisa...
1695,her slender neck is cut by the merciless noose...
1696,"as has been noted, it was not accidental that ..."


In [60]:

# English spaCy pipeline; used below for sentence segmentation (doc.sents).
nlp = spacy.load("en_core_web_sm")


def split_into_windows(text, window_size=3, step=1):
    """Split ``text`` into overlapping windows of consecutive sentences.

    Sentences are taken from spaCy's ``doc.sents``; sentences that are blank
    after stripping are skipped.

    Args:
        text: Raw text to segment.
        window_size: Number of sentences joined (with a single space) per window.
        step: Number of sentences to advance between consecutive windows.

    Returns:
        A list of window strings. A text with fewer than ``window_size``
        sentences yields a single window holding all of them; a text with no
        sentences at all yields an empty list.
    """
    stripped = (sent.text.strip() for sent in nlp(text).sents)
    sentences = [sentence for sentence in stripped if sentence]

    # An empty window would be written to CSV as an empty field, which
    # pandas.read_csv loads back as NaN instead of a string by default.
    if not sentences:
        return []

    if len(sentences) < window_size:
        return [" ".join(sentences)]

    last_start = len(sentences) - window_size
    return [
        " ".join(sentences[start:start + window_size])
        for start in range(0, last_start + 1, step)
    ]


In [63]:
# Window sizes (in sentences) for which a chunk file is produced.
chunk_size = [1, 2, 3, 4, 5, 6]

with tqdm(chunk_size, desc="Chunking") as pbar:
    # Iterating the bar itself advances it once per size.
    for size in pbar:
        pbar.set_postfix({"size": size})

        all_chunks = []
        for text in tqdm(data["original_text"]):
            all_chunks.extend(split_into_windows(text, window_size=size))

        # The former `df_chunks.original_text = ...` line was removed: assigning
        # a non-existent column through attribute syntax only sets a Python
        # attribute on the frame, so nothing was ever added to the saved file.
        # The CSV still holds the default index plus a single raw `text` column.
        df_chunks = pd.DataFrame({"text": all_chunks})
        df_chunks.to_csv(f"../data/chunk_{size}.csv")
        print(f"size: {size}, shape: {df_chunks.shape}")


Chunking:   0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

size: 1, shape: (10029, 1)


  0%|          | 0/927 [00:00<?, ?it/s]

size: 2, shape: (9135, 1)


  0%|          | 0/927 [00:00<?, ?it/s]

size: 3, shape: (8283, 1)


  0%|          | 0/927 [00:00<?, ?it/s]

size: 4, shape: (7469, 1)


  0%|          | 0/927 [00:00<?, ?it/s]

size: 5, shape: (6698, 1)


  0%|          | 0/927 [00:00<?, ?it/s]

size: 6, shape: (5970, 1)


In [67]:
with tqdm(chunk_size, desc="Processing") as pbar:
    # Iterating the bar itself advances it once per size.
    for size in pbar:
        # Read `text` strictly as strings: with the default NA handling an empty
        # field or a chunk such as "NA" / "null" becomes NaN and breaks the
        # string operations below.
        df_chunks = pd.read_csv(
            f"../data/chunk_{size}.csv",
            dtype={"text": str},
            keep_default_na=False,
        )
        # Drop the unnamed index column stored in the chunk file.
        df_chunks = df_chunks.drop("Unnamed: 0", axis=1)

        # Surface statistics measured on the raw chunk text.
        df_chunks["len"] = df_chunks["text"].apply(len)
        df_chunks["punct_count"] = df_chunks["text"].apply(
            lambda raw: sum(1 for ch in raw if ch in string.punctuation)
        )
        df_chunks["numbers_count"] = df_chunks["text"].apply(
            lambda raw: sum(1 for ch in raw if ch.isdigit())
        )
        df_chunks["word_numbers_count"] = df_chunks["text"].apply(
            lambda raw: sum(1 for token in raw.split() if token.isdigit())
        )

        # From here on `text` holds the processed form of the chunk.
        df_chunks["text"] = df_chunks["text"].apply(process_text)
        df_chunks["len_after_prc"] = df_chunks["text"].apply(len)

        data_processed_tfidf = vectorizer.transform(df_chunks["text"]).toarray()

        # Model input: TF-IDF columns first, hand-crafted features after them.
        test = np.concatenate(
            [data_processed_tfidf, df_chunks[columns].to_numpy()],
            axis=1,
        )

        labels = model.predict(test)
        df_chunks["label"] = labels
        print(f"size: {size}, shape: {df_chunks.shape}")

        df_chunks = df_chunks.loc[df_chunks.label == "other"]
        print(f"size: {size}, shape: {df_chunks.shape} (after filtering)")

        # Explicit copy so the assignment below does not write into a slice of
        # the filtered frame.
        # NOTE(review): the text saved here is the processed text (output of
        # process_text), not the raw chunk — confirm that this is intended.
        df_chunks = df_chunks[["text"]].copy()
        df_chunks["text"] = df_chunks["text"].apply(lambda x: x.replace("\n", " ").lower())
        df_chunks.to_csv(f"../data/chunk_{size}_processed.csv", index=False)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

size: 1, shape: (10029, 7)
size: 1, shape: (8182, 7) (after filtering)
size: 2, shape: (9135, 7)
size: 2, shape: (8603, 7) (after filtering)
size: 3, shape: (8283, 7)
size: 3, shape: (7891, 7) (after filtering)
size: 4, shape: (7469, 7)
size: 4, shape: (7183, 7) (after filtering)
size: 5, shape: (6698, 7)
size: 5, shape: (6487, 7) (after filtering)
size: 6, shape: (5970, 7)
size: 6, shape: (5803, 7) (after filtering)
