In [None]:
!pip install spacy
!python -m spacy download pl_core_news_sm
!pip install nltk

Collecting pl-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.8.0/pl_core_news_sm-3.8.0-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pl-core-news-sm
Successfully installed pl-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [25]:
import re
from functools import lru_cache

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [24]:
# Register tqdm with pandas so that `progress_apply` (used by the preprocessing
# cells below) is available and shows a progress bar.
tqdm.pandas()

In [None]:
# Fetch the NLTK data packages this notebook relies on (tokenizer models and
# stop-word lists), in the same order as before.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Polish spaCy pipeline; used for lemmatization in full_preprocessing.
nlp = spacy.load("pl_core_news_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Input files are expected in the notebook's working directory.
TRAIN_CSV = "v1_training.csv"
TEST_CSV = "v1_test.csv"

# Load the training and test splits as DataFrames.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Keep only the text column and its general tag, renamed to the shorter
# 'text' / 'label' names that the rest of the notebook uses.
column_names = {'TEXT': 'text', 'GENERAL TAG': 'label'}
source_columns = list(column_names)  # ['TEXT', 'GENERAL TAG'], in that order

train = train[source_columns].rename(columns=column_names)
test = test[source_columns].rename(columns=column_names)

In [None]:
# For each split: discard rows with any missing value, then discard rows that
# are exact duplicates of an earlier row.
train = train.dropna().drop_duplicates()
test = test.dropna().drop_duplicates()

In [None]:
# Variant 1 (raw): the cleaned splits saved without any text preprocessing.
train_raw, test_raw = train.copy(), test.copy()

train_raw.to_csv('v1_training_variant1_raw.csv', index=False)
test_raw.to_csv('v1_test_variant1_raw.csv', index=False)

In [None]:
def light_preprocessing(text):
    """Remove links, mentions, hashtags and most symbols from ``text``.

    The patterns are applied in order; every match is deleted (replaced with
    the empty string), so whitespace surrounding a removed item is left as is.
    Letter case is not changed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
        Only word characters, whitespace, ``.`` and ``?`` survive.
    """
    removal_patterns = (
        r'http\S+|www\S+',  # links
        r'@[\w_]+',         # @mentions
        r'#[\w_]+',         # #hashtags
        r'[^\w\s\.\?]',     # anything that is not a word char, whitespace, '.' or '?'
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()

In [None]:
# Variant 2 (light): run light_preprocessing over the 'text' column of each
# split and save the result. `assign` returns a new frame, so `train` and
# `test` themselves are left untouched.
train_light = train.assign(text=train['text'].progress_apply(light_preprocessing))
train_light.to_csv('v1_training_variant2_light.csv', index=False)

test_light = test.assign(text=test['text'].progress_apply(light_preprocessing))
test_light.to_csv('v1_test_variant2_light.csv', index=False)

100%|██████████| 10041/10041 [00:00<00:00, 118716.42it/s]
100%|██████████| 1000/1000 [00:00<00:00, 94822.96it/s]


In [None]:
@lru_cache(maxsize=None)
def _polish_stop_words():
    """Return NLTK's Polish stop-word list as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words('polish'))


def full_preprocessing(text):
    """Apply the full cleaning pipeline to ``text``.

    Steps, in order:
      1. ``light_preprocessing`` (links, mentions, hashtags, most symbols).
      2. Lower-casing.
      3. Removal of all remaining punctuation, including the ``.`` and ``?``
         that the light variant keeps.
      4. Tokenization with NLTK's ``word_tokenize``.
      5. Removal of tokens found in NLTK's Polish stop-word list.
      6. Lemmatization with the spaCy pipeline ``nlp``.

    Args:
        text: The string to preprocess.

    Returns:
        The lemmas of the remaining tokens joined by single spaces.
    """
    text = light_preprocessing(text).lower()
    text = re.sub(r'[^\w\s]', '', text)

    # The stop-word set is the same for every call, so it is fetched from a
    # cache instead of being re-read from the corpus for each row.
    stop_words = _polish_stop_words()

    # The text was lower-cased above, so tokens can be compared directly.
    words = [word for word in word_tokenize(text) if word not in stop_words]

    doc = nlp(' '.join(words))
    return ' '.join(token.lemma_ for token in doc)

In [34]:
# Variant 3 (full): run full_preprocessing over the 'text' column of each
# split and save the result. `assign` returns a new frame, so `train` and
# `test` themselves are left untouched.
train_full = train.assign(text=train['text'].progress_apply(full_preprocessing))
train_full.to_csv('v1_training_variant3_full.csv', index=False)

test_full = test.assign(text=test['text'].progress_apply(full_preprocessing))
test_full.to_csv('v1_test_variant3_full.csv', index=False)


100%|██████████| 10041/10041 [02:07<00:00, 78.58it/s]
100%|██████████| 1000/1000 [00:12<00:00, 80.66it/s]
