In [None]:
!pip install catboost
!spacy download en_core_web_trf

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_trans

In [None]:
# Mount Google Drive at /content/drive; the ./drive/MyDrive/... paths used by the
# later cells resolve against this mount point when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import string
import pickle
import numpy as np
import spacy
import warnings

from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm

# Let spaCy use a GPU when one is available; it falls back to CPU otherwise.
spacy.prefer_gpu()
# Register tqdm's progress-aware helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()
# NOTE(review): this silences *every* warning, including pandas warnings about
# chained assignment and attribute-style column creation — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Dataset version: selects the input file paragraphsv{V}.csv and the output
# directory sae_data/chunks{V}.
V = 4

In [None]:
# Ensure the per-version output directory exists. os.makedirs creates the missing
# parent (sae_data) as well, and exist_ok=True makes the cell safe to re-run
# without a separate exists() check.
os.makedirs(f"./drive/MyDrive/sae_data/chunks{V}", exist_ok=True)

In [None]:
# Load the paragraph table for dataset version V from Drive. The following cells
# rely on it having an "index" column and a "text" column.
data = pd.read_csv(f"./drive/MyDrive/paragraphsv{V}.csv")

In [None]:
# Discard the "index" column stored in the CSV; the DataFrame's own index is used instead.
data = data.drop(columns="index")

In [None]:
# Keep an untouched copy of the paragraph: `text` is overwritten with its
# normalised form further down, while `original_text` is what gets chunked later.
data["original_text"] = data["text"]

In [None]:
# Load the pickled TF-IDF vectorizer and CatBoost classifier used to label text
# (presumably already fitted, since only transform/predict are called on them).
# NOTE(review): both paths are relative to the working directory, not to the
# mounted Drive folder used elsewhere — the files must be present before running.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("./vectorizer096.pkl", "rb") as f:
    vectorizer: TfidfVectorizer = pickle.load(f)

with open("./model096.pkl", "rb") as f:
    model: CatBoostClassifier = pickle.load(f)

In [None]:
def process_text(text):
    """Normalise ``text`` for the TF-IDF vectorizer.

    The text is lower-cased, every character that is neither alphanumeric nor
    whitespace is dropped (nothing is inserted in its place, so "245–246"
    becomes "245246"), and leading/trailing whitespace is stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    kept = []
    for ch in text.lower():
        if ch.isalnum() or ch.isspace():
            kept.append(ch)
    return "".join(kept).strip()

In [None]:
# Surface statistics are computed on the raw paragraph, *before* normalisation.
data["len"] = data["text"].apply(len)
data["punct_count"] = data["text"].apply(lambda s: sum(1 for ch in s if ch in string.punctuation))
data["numbers_count"] = data["text"].apply(lambda s: sum(1 for ch in s if ch.isdigit()))
data["word_numbers_count"] = data["text"].apply(lambda s: sum(1 for tok in s.split() if tok.isdigit()))
# From here on `text` holds the normalised paragraph; its length is a feature too.
data["text"] = data["text"].apply(process_text)
data["len_after_prc"] = data["text"].apply(len)

In [None]:
# Vectorise the normalised paragraphs with the pre-loaded TF-IDF vectorizer.
data_processed_tfidf = vectorizer.transform(data["text"])

In [None]:
# Convert the TF-IDF matrix to a dense ndarray so it can be concatenated with the
# numeric feature columns. NOTE: memory use grows with rows x vocabulary size.
data_processed_tfidf = data_processed_tfidf.toarray()

In [None]:
# Hand-crafted numeric features appended after the TF-IDF columns.
# NOTE(review): presumably this order must match the one used when the model was
# trained — confirm before reordering.
columns = [
    # computed on the raw text
    "len",
    "punct_count",
    "numbers_count",
    "word_numbers_count",
    # computed after process_text normalisation
    "len_after_prc",
]

In [None]:
# Model input: TF-IDF columns first, then the hand-crafted numeric columns.
test = np.concatenate((data_processed_tfidf, data[columns].to_numpy()), axis=1)

In [None]:
# Predict one label per paragraph; rows labelled "other" are the ones kept downstream.
labels = model.predict(test)

In [None]:
# Store the predicted label next to each paragraph.
data["label"] = labels

In [None]:
# Size of the subset the classifier labelled "other" (the rows that will be kept).
data[data["label"] == "other"].shape

(3996, 8)

In [None]:
# Eyeball the paragraphs that are about to be discarded (any label except "other").
data[data["label"] != "other"]

Unnamed: 0,text,original_text,len,punct_count,numbers_count,word_numbers_count,len_after_prc,label
2,introduction\n rona l d g r i g o r s u n y,Introduction\n rona l d g r i g o r s u n y,42,0,0,0,42,author
13,1 christopher lasch the american liberals and ...,"1 Christopher Lasch, The American Liberals and...",258,13,10,1,245,author
14,reading russia and the soviet union in the twe...,Reading Russia and the Soviet Union in the twe...,60,0,0,0,60,author
16,2 marquis de custine journey for our time the ...,"2 Marquis de Custine, Journey for Our Time: Th...",1024,80,58,7,941,author
18,4 ibid p 37\n 5 on american views of russia an...,"4 Ibid., p. 37.\n 5 On American views of Russi...",1460,103,89,21,1351,author
...,...,...,...,...,...,...,...,...
5116,372 this was the basis for maiskys notion of t...,372 This was the basis for Maisky’s notion of ...,183,3,21,1,173,author
5117,373 moskva i vostochnaya evropastanovlenie pol...,373 Moskva i vostochnaya Evropa:Stanovlenie po...,195,10,19,1,174,author
5145,379 see vostochnaya evropa v dokumentakh rossi...,379 See Vostochnaya Evropa v dokumentakh rossi...,150,9,21,1,136,author
5146,380 rossiya i sssr v voynah xx veka pp 245246 ...,"380 Rossiya i SSSR v voynah XX veka, pp� 245–2...",108,9,12,1,95,author


In [None]:
# Keep only the paragraphs labelled "other". NOTE: this overwrites `data`, so the
# labelled frame is no longer available after this cell.
data = data[data["label"] == "other"]

In [None]:
# Only the untouched paragraph text is needed from here on. Take an explicit copy
# so the column assignment that follows modifies an independent frame rather than
# a slice of the filtered one (avoids pandas' chained-assignment pitfalls).
data = data[["original_text"]].copy()

In [None]:
# Collapse each paragraph onto a single line and lower-case it before sentence splitting.
data["original_text"] = data["original_text"].apply(
    lambda paragraph: paragraph.replace("\n", " ").lower()
)

In [None]:
# Inspect the retained paragraphs (single `original_text` column, lower-cased, one line each).
data

Unnamed: 0,original_text
0,every effort has been made to secure necessary...
1,the system of transliteration from cyrillic us...
3,the history of russia in the twentieth century...
4,great divide between soviet east and capitalis...
5,workers and a cultural straitjacket on the int...
...,...
5160,"nonetheless, there are still marked difference..."
5161,something that also needs to be said is that t...
5162,very significant was also the nationwide effor...
5163,similarly to the weight of the early “socialis...


In [None]:
# Load the transformer-based English pipeline once; split_into_windows calls it
# on every text and reads `doc.sents` for sentence boundaries.
nlp = spacy.load("en_core_web_trf")

def split_into_windows(text, window_size=3, step=1):
    """Split ``text`` into sliding windows of consecutive sentences.

    Sentences come from the module-level spaCy pipeline ``nlp``; each sentence is
    stripped and blank ones are ignored.

    Args:
        text: The text to segment.
        window_size: Number of sentences per window (must be >= 1).
        step: Number of sentences the window advances each time (must be >= 1).

    Returns:
        A list of strings, each the space-joined sentences of one window.
        If the text has fewer sentences than ``window_size``, a single window
        with all of them is returned. If the text has no sentences at all, the
        list is empty (previously it was ``[""]``, which produced an empty chunk
        that turns into NaN after a CSV round-trip).

    Raises:
        ValueError: If ``window_size`` or ``step`` is smaller than 1.
    """
    if window_size < 1 or step < 1:
        raise ValueError("window_size and step must be positive integers")

    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped if sentence]

    # Nothing to window: do not emit an empty chunk.
    if not sentences:
        return []

    # Too short for a full window: return everything as one chunk.
    if len(sentences) < window_size:
        return [" ".join(sentences)]

    last_start = len(sentences) - window_size
    return [
        " ".join(sentences[start:start + window_size])
        for start in range(0, last_start + 1, step)
    ]


In [None]:
# Window sizes (in sentences) for which a chunk file is written.
chunk_size = [1, 2, 3, 4]

with tqdm(chunk_size, desc="Chunking") as pbar:
    # Iterating the bar itself advances it once per window size.
    for size in pbar:
        pbar.set_postfix({"size": size})
        all_chunks = []

        # NOTE: every paragraph is re-parsed by the spaCy pipeline for each window size.
        for text in tqdm(data["original_text"]):
            all_chunks.extend(split_into_windows(text, window_size=size))

        df_chunks = pd.DataFrame({"text": all_chunks})
        # The former `df_chunks.original_text = ...` line was removed: assigning to a
        # non-existent column through attribute access does not create a column, so
        # the normalised text was computed and then thrown away. Normalisation is
        # done when the chunk files are read back for classification.
        # The index is written deliberately: the processing cell drops "Unnamed: 0".
        df_chunks.to_csv(f"./drive/MyDrive/sae_data/chunks{V}/chunk_{size}.csv")
        print(f"size: {size}, shape: {df_chunks.shape}")


Chunking:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3996 [00:00<?, ?it/s]

size: 1, shape: (23583, 1)


  0%|          | 0/3996 [00:00<?, ?it/s]

size: 2, shape: (20363, 1)


  0%|          | 0/3996 [00:00<?, ?it/s]

size: 3, shape: (17716, 1)


  0%|          | 0/3996 [00:00<?, ?it/s]

size: 4, shape: (15539, 1)


In [None]:
# Second pass: classify every sentence window with the same features as the
# paragraph-level pass and keep only the windows labelled "other".
for size in tqdm(chunk_size, desc="Processing"):
    # keep_default_na=False + dtype=str: chunks such as "", "nan", "null" or "n/a"
    # must stay strings. With default NA parsing they are read as float NaN and the
    # len()/iteration lambdas below raise TypeError.
    df_chunks = pd.read_csv(
        f"./drive/MyDrive/sae_data/chunks{V}/chunk_{size}.csv",
        dtype={"text": str},
        keep_default_na=False,
    )
    # Drop the index column written by the chunking cell.
    df_chunks = df_chunks.drop("Unnamed: 0", axis=1)

    # Surface statistics on the chunk as stored, then on its normalised form.
    df_chunks["len"] = df_chunks["text"].apply(len)
    df_chunks["punct_count"] = df_chunks["text"].apply(lambda x: len([c for c in x if c in string.punctuation]))
    df_chunks["numbers_count"] = df_chunks["text"].apply(lambda x: len([c for c in x if c.isdigit()]))
    df_chunks["word_numbers_count"] = df_chunks["text"].apply(lambda x: len([w for w in x.split() if w.isdigit()]))
    df_chunks["text"] = df_chunks["text"].apply(process_text)
    df_chunks["len_after_prc"] = df_chunks["text"].apply(len)

    # Model input: dense TF-IDF columns followed by the numeric feature columns.
    data_processed_tfidf = vectorizer.transform(df_chunks["text"])
    data_processed_tfidf = data_processed_tfidf.toarray()
    test = np.concatenate([data_processed_tfidf, df_chunks[columns].to_numpy()], axis=1)

    labels = model.predict(test)
    df_chunks["label"] = labels
    print(f"size: {size}, shape: {df_chunks.shape}")
    df_chunks = df_chunks.loc[df_chunks["label"] == "other"]
    print(f"size: {size}, shape: {df_chunks.shape} (after filtering)")

    # Copy before assigning so the write goes to an independent frame, not a slice.
    df_chunks = df_chunks[["text"]].copy()
    # NOTE(review): `text` already holds the process_text output here (lower-cased,
    # punctuation removed), unlike the paragraph-level pass which exported
    # `original_text`. Confirm that the stripped text is the intended export.
    df_chunks["text"] = df_chunks["text"].apply(lambda x: x.replace("\n", " ").lower())
    df_chunks.to_csv(f"./drive/MyDrive/sae_data/chunks{V}/chunk_{size}_processed.csv", index=False)

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

size: 1, shape: (23583, 7)
size: 1, shape: (18423, 7) (after filtering)
size: 2, shape: (20363, 7)
size: 2, shape: (19415, 7) (after filtering)
size: 3, shape: (17716, 7)
size: 3, shape: (17206, 7) (after filtering)
size: 4, shape: (15539, 7)
size: 4, shape: (15183, 7) (after filtering)
