In [1]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
# NOTE(review): this rebinds `stopwords` from the imported corpus reader to the
# Russian stop-word collection it returns; later cells use the name as that collection.
stopwords = stopwords.words("russian") 
# Extra lemmas that `preprocessing` filters out after lemmatization.
specialwords = ['риа', 'новости', 'наш', 'материал', 'который', 'это', 'также', 'год', 'несколько'] 

from nltk import tokenize
# Tokenizer instance used by `preprocessing`.
tokenizer = tokenize.TweetTokenizer()

from string import punctuation

from pymorphy3 import MorphAnalyzer
# Morphological analyzer instance; `preprocessing` takes normal forms from it.
ma = MorphAnalyzer()

import warnings
# NOTE(review): suppresses every warning for the rest of the session, including
# deprecation and syntax warnings raised by code in later cells.
warnings.filterwarnings('ignore')

In [3]:
from datetime import datetime, timezone, timedelta, tzinfo
from typing import Optional
def date_from_unix (date: int, tz: Optional[tzinfo] = None) -> str:
    '''Convert a Unix timestamp (seconds since the epoch) to a 'YYYY-MM-DD' string.

    Parameters:
        date: Unix timestamp in seconds.
        tz: time zone in which the calendar day is taken. With the default
            (None) the timestamp is interpreted in the local time zone of the
            machine running the code, so the same timestamp can map to
            different days on different machines. Pass an explicit zone, e.g.
            timezone.utc or timezone(timedelta(hours=3)), for a reproducible
            result.

    Returns:
        The date formatted as 'YYYY-MM-DD'.
    '''
    return datetime.fromtimestamp(date, tz).strftime('%Y-%m-%d')

In [5]:
# Base names (without extension) of the monthly workbooks, in calendar order.
files = [
    '1january',
    '2february',
    '3march',
    '4april',
    '5may',
    '6june',
    '7july',
    '8august',
    '9september',
    '10october',
    '11november',
    '12december',
]

In [7]:
# Read each monthly workbook once and concatenate them in a single call instead
# of re-concatenating (and re-copying) the growing frame on every iteration.
# The previous loop put each newly read month in front of the accumulated rows,
# so the last file ended up first; reversed() reproduces that exact row order,
# which downstream cells may rely on.
frames = [pd.read_excel(f'{name}.xlsx') for name in reversed(files)]
data = pd.concat(frames)

In [9]:
# Keep the three columns of interest and replace the old index with 0..n-1.
data = data.loc[:, ['id', 'Posts', 'Date']].reset_index(drop=True)

In [11]:
# Replace each Unix timestamp in 'Date' with its formatted date string.
data['Date'] = data['Date'].map(date_from_unix)

In [15]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

id       False
Posts    False
Date     False
dtype: bool

In [19]:
# Save the combined posts to a workbook, without writing the row index.
data.to_excel('all_posts_raw.xlsx', index = False)

# Подготовка данных

In [21]:
def is_cyrillic(text):
    '''Return True when `text` is non-empty and made up only of Russian letters.'''
    matched = re.fullmatch(r'^[а-яА-ЯёЁ]+$', text)
    return matched is not None

In [25]:
def preprocessing (x: str) -> str:
    '''Turn a raw post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text; strip "[...," spans, digits and
    punctuation; turn tabs and newlines into spaces; fold "ё" into "е";
    tokenize; drop stop words and tokens of two characters or fewer; keep only
    purely Cyrillic tokens and lemmatize them (a few terms are kept verbatim);
    drop the lemmas listed in `specialwords`.

    Parameters:
        x: raw post text.

    Returns:
        The surviving lemmas joined by single spaces, or the sentinel string
        'svin' when nothing survives, so the result is never empty.
    '''
    x = x.lower()
    # Removals:
    # User mentions: everything from "[" up to the next comma on the same line.
    # NOTE(review): presumably aimed at "[id|name]," reply prefixes; confirm it
    # does not swallow ordinary bracketed text that is followed by a comma.
    x = re.sub(r'\[.*?\,', '', x)
    # Tabs and line breaks become a space (not an empty string), otherwise the
    # last word of one line is glued to the first word of the next.
    x = re.sub(r'[\t\n]', ' ', x)
    # Digits. Raw string: '\d' in a plain literal is an invalid escape sequence.
    x = re.sub(r'\d+', '', x)
    x = re.sub(r'[^\w\s]', '', x)  # punctuation
    x = x.replace('ё', 'е')

    tokens = tokenizer.tokenize(x)  # tokenization
    tokens = [word for word in tokens if word not in stopwords]  # stop-word removal
    tokens = [word for word in tokens if len(word) > 2]  # drop very short leftovers

    # Terms kept as-is instead of being passed to the morphological analyzer.
    exclude = ['бпла', 'небензя', 'цахал']
    # Lemmatization; tokens that are not purely Cyrillic are discarded here.
    lemmas = [
        word if word in exclude else ma.parse(word)[0].normal_form
        for word in tokens
        if is_cyrillic(word)
    ]
    lemmas = [word for word in lemmas if word not in specialwords]
    result = ' '.join(lemmas)

    if len(result) == 0:
        # Sentinel marking a post with no usable tokens left.
        result = 'svin'
    return result

In [27]:
%%time
texts_prepr = [preprocessing(text) for text in data['Posts'].tolist()]

CPU times: total: 1min 53s
Wall time: 1min 57s


In [29]:
# Number of preprocessed texts (one entry per post).
len(texts_prepr)

15535

In [35]:
# Collapse the lemmatized three-word phrase into its abbreviation in every text.
replacing = [text.replace('специальный военный операция', 'сво') for text in texts_prepr]

In [37]:
# Attach the cleaned texts as a new column; the list is assigned by position,
# so it must be in the same row order as `data`.
data['Tokens'] = replacing

In [43]:
# Show the posts whose preprocessing result is the empty-text sentinel.
data.query('Tokens == "svin"') # will be dropped: losing the post about Cavalli's death does not harm the study

Unnamed: 0,id,Posts,Date,Tokens
11522,45293045,Ему было 83 года.,2024-04-12,svin


In [45]:
# Remove every post whose preprocessing result is the empty-text sentinel.
# Filtering by value instead of a hard-coded row label keeps the cell correct
# if the row order changes and makes it safe to re-run.
data = data[data['Tokens'] != 'svin']

In [51]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

id        False
Posts     False
Date      False
Tokens    False
dtype: bool

In [53]:
# Save the preprocessed posts to a workbook, without writing the row index.
data.to_excel('all_posts_preprocessed.xlsx', index = False)