In [None]:
%%capture
!pip install gdown
!pip install --user matplotlib
!pip show wordcloud
!pip install requests
!pip install pymystem3
!pip install nltk
!pip install pandas
!pip install Counter
!pip install sklearn
!pip install pyLDAvis
!pip install --upgrade numpy scipy scikit-learn
!pip install scipy 
!pip install spacy
!pip install pyLDAvis
!pip install pandas

In [None]:
%%capture
import math
import re
import statistics
import sys
from collections import Counter

import gdown
import nltk
import pandas as pd
import pyLDAvis
import pymystem3
import requests
import scipy
import sklearn
import spacy
from nltk import bigrams, trigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from pymystem3 import Mystem
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud

In [None]:
import gdown
import os
import pandas as pd

# Google Drive id of the source CSV and the local path it is stored under.
file_id = "1AdFQ-uPMIAF8RK1-HuRJnO511P-Ac_6N"
url = f"https://drive.google.com/uc?id={file_id}"
output = "data/all_cases_unmarked.csv"

os.makedirs("data", exist_ok=True)

# Download only when there is no local copy yet, so that re-running the
# notebook does not fetch the same file again.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

# Fail with an explicit message instead of a confusing read_csv error when the
# download did not produce the file.
if not os.path.exists(output):
    raise FileNotFoundError(f"Download failed, {output} was not created (source: {url})")

df = pd.read_csv(output)
df.head(5)

Очищаем и лемматизируем текст

In [None]:
# Fetch every NLTK resource this notebook relies on: the stop-word lists and
# the Punkt models required by word_tokenize() (shipped as "punkt_tab" in
# newer NLTK releases, as "punkt" in older ones).
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

russian_stopwords = set(stopwords.words("russian"))
# Domain-specific lemmas that are too frequent in the corpus to be informative.
custom_stopwords = [
    'фио', 'гггг', 'подсудимый', 'суд',
    'изымать', 'согласно', 'наказание',
    'потерпевший', 'показание', 'судебный',
    'преступление', 'адрес', 'свидетель',
    'свой', 'находиться', 'час', 'ход',
    'дело', 'российский федерация'
             ]
# clean_text() compares one lemma at a time, so a multi-word entry such as
# 'российский федерация' could never match as a whole; split every entry into
# its individual lower-cased words.
all_stopwords = {
    token
    for entry in russian_stopwords.union(custom_stopwords)
    for token in entry.lower().split()
}

# Single shared Mystem instance used for lemmatization.
mystem = Mystem()

def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    The text is lower-cased, every run of punctuation, digits, underscores and
    whitespace is collapsed into a single space, the result is lemmatized with
    the module-level ``mystem`` instance, and lemmas contained in
    ``all_stopwords`` are dropped.

    Args:
        text: Raw document text. Missing values (``None``/``NaN``) are accepted.

    Returns:
        The surviving lemmas joined by single spaces, or ``""`` when the input
        is missing or nothing is left after filtering.
    """
    if pd.isna(text):
        return ""

    # Replace separators with a space instead of deleting them, so tokens that
    # are joined only by punctuation ("суд,приговорил") are not glued together.
    text = re.sub(r"[\W\d_]+", " ", str(text).lower()).strip()
    if not text:
        return ""

    # Lemmatize the whole document in one Mystem call rather than one call per
    # word. The returned list also contains separators (spaces, trailing
    # newline), which become empty after strip() and are skipped below.
    lemmas = (lemma.strip() for lemma in mystem.lemmatize(text))

    # Drop stop words and empty pieces.
    return " ".join(
        lemma for lemma in lemmas if lemma and lemma not in all_stopwords
    )

# Run the cleaning pipeline on every raw document and keep the result next to
# the source text.
df["cleaned_text"] = df["text"].map(clean_text)

# Work on an independent single-column frame from here on.
cleaned_df = df.loc[:, ["cleaned_text"]].copy()
print(cleaned_df.head(5))


In [None]:
# Concatenate all cleaned documents into one space-separated string.
full_text = " ".join(cleaned_df["cleaned_text"].tolist())

Считаем слова, символы и токены в целом и в среднем на один приговор

In [None]:
from nltk.tokenize import word_tokenize

# Number of tokens in each cleaned document.
word_counts = [len(word_tokenize(text)) for text in cleaned_df["cleaned_text"]]

avg_words = sum(word_counts) / len(word_counts) if word_counts else 0
# statistics.median averages the two middle values when the number of documents
# is even; indexing sorted(...)[n // 2] would return only the upper one.
median_words = statistics.median(word_counts) if word_counts else 0

print("Среднее количество слов:", avg_words)
print("Медианное количество слов:", median_words)

In [None]:
from nltk.tokenize import word_tokenize

# Number of distinct tokens in each cleaned document.
unique_token_counts = [len(set(word_tokenize(text))) for text in cleaned_df["cleaned_text"]]

avg_unique_tokens = sum(unique_token_counts) / len(unique_token_counts) if unique_token_counts else 0

sorted_tokens = sorted(unique_token_counts)
# statistics.median averages the two middle values when the number of documents
# is even; indexing sorted_tokens[n // 2] would return only the upper one.
median_unique_tokens = statistics.median(sorted_tokens) if sorted_tokens else 0

print("Среднее количество уникальных токенов:", avg_unique_tokens)
print("Медианное количество уникальных токенов:", median_unique_tokens)

In [None]:
# Length in characters of each cleaned document.
char_counts = [len(text) for text in cleaned_df["cleaned_text"]]

avg_chars = sum(char_counts) / len(char_counts) if char_counts else 0

sorted_chars = sorted(char_counts)
# statistics.median averages the two middle values when the number of documents
# is even; indexing sorted_chars[n // 2] would return only the upper one.
median_chars = statistics.median(sorted_chars) if sorted_chars else 0

print("Среднее количество символов:", avg_chars)
print("Медианное количество символов:", median_chars)

In [None]:
def lexical_measures(text):
    """Compute lexical-diversity measures for one document.

    With N the number of tokens and V the number of distinct tokens of the
    lower-cased text:

    * ``TTR``      = V / N
    * ``RTTR``     = V / sqrt(N)
    * ``CTTR``     = V / sqrt(2 * N)
    * ``Herdan_C`` = log(V) / log(N)

    Args:
        text: Document text; it is tokenized with ``word_tokenize``.

    Returns:
        A dict with the keys ``TTR``, ``RTTR``, ``CTTR`` and ``Herdan_C``.
        All four are 0 for a document without tokens, and ``Herdan_C`` is 0
        whenever the document has a single distinct token.
    """
    tokens = word_tokenize(text.lower())
    n_tokens = len(tokens)
    n_types = len(set(tokens))

    if n_tokens == 0:
        return {'TTR': 0, 'RTTR': 0, 'CTTR': 0, 'Herdan_C': 0}

    # log(N) is zero only for N == 1, which the guard excludes, so the division
    # cannot fail and needs no exception handler.
    if n_types > 1 and n_tokens > 1:
        herdan_c = math.log(n_types) / math.log(n_tokens)
    else:
        herdan_c = 0

    return {
        'TTR': n_types / n_tokens,
        'RTTR': n_types / math.sqrt(n_tokens),
        'CTTR': n_types / math.sqrt(2 * n_tokens),
        'Herdan_C': herdan_c
    }
# Drop documents whose cleaned text is empty or whitespace-only.
cleaned_df = cleaned_df[cleaned_df["cleaned_text"].str.strip().astype(bool)]

metrics = cleaned_df['cleaned_text'].apply(lexical_measures)
# Build the metrics frame on the filtered frame's index. With a default
# RangeIndex, concat(axis=1) aligns by label and would pair metrics with the
# wrong documents (and add NaN rows) as soon as any document was dropped above.
metrics_df = pd.DataFrame(metrics.tolist(), index=cleaned_df.index)
cleaned_df_with_metrics = pd.concat([cleaned_df, metrics_df], axis=1)

In [None]:
# Average each lexical-diversity metric over all rows of the data frame.
avg_metrics = cleaned_df_with_metrics.loc[:, ['TTR', 'RTTR', 'CTTR', 'Herdan_C']].mean()

print("Средние значения по корпусу:", avg_metrics, sep="\n")

Сохраняем в csv

In [None]:
# Write the cleaned documents to a CSV file, omitting the index column.
cleaned_df.to_csv(path_or_buf='cleaned_df.csv', index=False)