# Imports

In [25]:
import pandas as pd

In [44]:
import re
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score

In [1]:
%%capture
!pip install gdown
!pip install pymystem3
!pip install stop-words

In [29]:
import pymystem3
import stop_words

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
from sklearn.tree import DecisionTreeClassifier

In [51]:
from sklearn.utils import resample

# Global variables

In [30]:
mstem = pymystem3.Mystem()

In [None]:
import gdown
import os
import pandas as pd
os.makedirs("data", exist_ok=True)
file_id = "1_4jFNzQYvnMvcQEt0mepxK_muMiKGv3p"  
url = f"https://drive.google.com/uc?id={file_id}"
output = "data/marked_sentences_472.csv"
gdown.download(url, output, quiet=False)
slavery = pd.read_csv(output)
slavery.head(5)

In [34]:
slavery['text'] = slavery['text'].astype(str)
slavery['Slavery_1_step'] = pd.to_numeric(slavery['Slavery_1_step'], errors='coerce').fillna(0).astype(int)
slavery['Slavery_2_expert'] = pd.to_numeric(slavery['Slavery_2_expert'], errors='coerce').fillna(0).astype(int)

In [35]:
#55/87
patterns_set_1 = [
    r'\bкабальные\b',
    r'\bотрабатывать\b|\bотработать\b|\bотработав\b',
    r'не оставля(ла|ли|ть)? без (присмотра|наблюдения)',
    r'\bрабств(о|а|е)?\b'
]

patterns_first_set = [
    r'\bпродолж(ение|ению|ать|ить)? заняти(я|ю|е)? проституцией\b',
    r'\bпродолжить\b оказывать сексуальные услуги',
    r'\bза отказ\b продолжать заниматься проституцией',
    r'\bотрабатывать\b|\bотработать\b|\bотработав\b'
]

patterns_second_set = [
    r'\bнасили(е|я|ю)?\b',
    r'\bштраф(ы|ов|овать)?\b',
    r'\bугро(зы|жать|жал|жала|жали)?\b',
    r'\bизби(л|вать|ение)?\b'
]

compiled_patterns_set_1 = [re.compile(pattern, re.IGNORECASE) for pattern in patterns_set_1]
compiled_patterns_first_set = [re.compile(pattern, re.IGNORECASE) for pattern in patterns_first_set]
compiled_patterns_second_set = [re.compile(pattern, re.IGNORECASE) for pattern in patterns_second_set]

In [36]:
stopwords = stop_words.get_stop_words('russian')
stopwords.extend(stop_words.get_stop_words('english'))
stopwords = list(set(stopwords))
stopwords += [
    'фио', 'гггг', 'подсудимый', 'суд',
    'изымать', 'согласно', 'наказание',
    'потерпевший', 'показание', 'судебный',
    'преступление', 'адрес', 'свидетель',
    'свой', 'находиться', 'час', 'ход',
    'дело', 'российский федерация'
             ]

# Classifier

In [14]:
# Classifier on patterns
def classify_text(text, patterns_set_1, patterns_first_set, patterns_second_set):

    found_set_1 = any(pattern.search(text) for pattern in patterns_set_1)
    found_first_set = any(pattern.search(text) for pattern in patterns_first_set)
    found_second_set = any(pattern.search(text) for pattern in patterns_second_set)

    if found_set_1 or (found_first_set and found_second_set):
        return 1
    return 0

In [23]:
# Применение классификатора к DataFrame
slavery['Predicted_Slavery'] = slavery['text'].apply(lambda x: classify_text(x, compiled_patterns_set_1, compiled_patterns_first_set, compiled_patterns_second_set))

In [24]:
# Оценка качества классификации
print("ROC AUC:", roc_auc_score(slavery['Slavery_1_step'], slavery['Predicted_Slavery']))
print("Classification Report - slav_1:\n",
      classification_report(slavery['Slavery_1_step'], slavery['Predicted_Slavery']),
      confusion_matrix(slavery['Slavery_1_step'], slavery['Predicted_Slavery']))
print("Classification Report - slav_2:\n",
      classification_report(slavery['Slavery_2_expert'], slavery['Predicted_Slavery']),
      confusion_matrix(slavery['Slavery_2_expert'], slavery['Predicted_Slavery']))

ROC AUC: 0.9326605602300564
Classification Report - slav_1:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98       458
           1       0.69      0.90      0.78        41

    accuracy                           0.96       499
   macro avg       0.84      0.93      0.88       499
weighted avg       0.97      0.96      0.96       499
 [[441  17]
 [  4  37]]
Classification Report - slav_2:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95       489
           1       0.15      0.80      0.25        10

    accuracy                           0.90       499
   macro avg       0.57      0.85      0.60       499
weighted avg       0.98      0.90      0.93       499
 [[443  46]
 [  2   8]]


In [37]:
# Classifier funs
def keep_only_rus(text):
    new_text = ''
    for symbol in text:
        if re.match(r'[А-я]', symbol) or symbol == ' ':
            new_text += symbol
        else:
            new_text += ' '
    return new_text

def del_double_spaces(text_with_double_spaces):
    while '  ' in text_with_double_spaces:
        text_with_double_spaces = text_with_double_spaces.replace('  ',' ')
    return text_with_double_spaces

def lemmatize(raw_text):
    return ''.join(mstem.lemmatize(raw_text)).strip()

def del_stopwords(text):
    global stopwords
    new_text = []
    for word in text.split():
        if word not in stopwords and len(word) > 2:
            new_text.append(word)
    return ' '.join(new_text)

In [38]:
slavery['cleaned_text'] = slavery['text'].str.lower()
slavery['cleaned_text'] = slavery['cleaned_text'].apply(keep_only_rus)
slavery['cleaned_text'] = slavery['cleaned_text'].apply(del_double_spaces)
# Эта колонка долго генерится, ее лучше сохранить как отдельный цсв файл и добавлять как колонку к slavery
slavery['cleaned_text'] = slavery['cleaned_text'].apply(lemmatize)
slavery['cleaned_text'] = slavery['cleaned_text'].apply(del_stopwords)
slavery['cleaned_text'] = slavery['cleaned_text'].apply(del_double_spaces)
slavery = slavery[slavery['cleaned_text'].notna()]

In [39]:
slavery['cleaned_text']

Unnamed: 0,cleaned_text
0,ифио август судья советский районный рсо алани...
1,приговор российский федерация дубна сентябрь д...
2,российский федерация автозаводский районный пр...
3,российский федерация саранск республика мордов...
4,приговор российский федерация июль тверь тверс...
...,...
494,текст приговор российский федерация петербург ...
495,текст приговор приговор российский федерация а...
496,текст приговор российский федерация сентябрь г...
497,текст приговор приговор российский федерация н...


In [48]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_1_step']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))
# Плохой результат.

Accuracy: 0.95
ROC AUC: 0.9219858156028369
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        94
           1       1.00      0.17      0.29         6

    accuracy                           0.95       100
   macro avg       0.97      0.58      0.63       100
weighted avg       0.95      0.95      0.93       100
 [[94  0]
 [ 5  1]]


In [50]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_2_expert']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))
# Пишет, что выборка маленькая. Результат плохой.

Accuracy: 0.98
ROC AUC: 0.7193877551020408
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        98
           1       0.00      0.00      0.00         2

    accuracy                           0.98       100
   macro avg       0.49      0.50      0.49       100
weighted avg       0.96      0.98      0.97       100
 [[98  0]
 [ 2  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
slavery_majority = slavery[slavery['Slavery_1_step'] == 0]
slavery_minority = slavery[slavery['Slavery_1_step'] == 1]

slavery_minority_upsampled = resample(slavery_minority,
                                      replace=True,
                                      n_samples=len(slavery_majority),
                                      random_state=42)

slavery_upsampled = pd.concat([slavery_majority, slavery_minority_upsampled])

vectorizer = TfidfVectorizer()
X_upsampled = vectorizer.fit_transform(slavery_upsampled['cleaned_text'])
y_upsampled = slavery_upsampled['Slavery_1_step']

X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 0.9945652173913043
ROC AUC: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        96
           1       0.99      1.00      0.99        88

    accuracy                           0.99       184
   macro avg       0.99      0.99      0.99       184
weighted avg       0.99      0.99      0.99       184
 [[95  1]
 [ 0 88]]


In [53]:
slavery_majority = slavery[slavery['Slavery_2_expert'] == 0]
slavery_minority = slavery[slavery['Slavery_2_expert'] == 1]

slavery_minority_upsampled = resample(slavery_minority,
                                      replace=True,
                                      n_samples=len(slavery_majority),
                                      random_state=42)

slavery_upsampled = pd.concat([slavery_majority, slavery_minority_upsampled])

vectorizer = TfidfVectorizer()
X_upsampled = vectorizer.fit_transform(slavery_upsampled['cleaned_text'])
y_upsampled = slavery_upsampled['Slavery_2_expert']

X_train, X_test, y_train, y_test = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 1.0
ROC AUC: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        95
           1       1.00      1.00      1.00       101

    accuracy                           1.00       196
   macro avg       1.00      1.00      1.00       196
weighted avg       1.00      1.00      1.00       196
 [[ 95   0]
 [  0 101]]


In [57]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_1_step']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)

y_pred = gb_classifier.predict(X_test)
y_prob = gb_classifier.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 0.99
ROC AUC: 0.8874113475177305
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99        94
           1       1.00      0.83      0.91         6

    accuracy                           0.99       100
   macro avg       0.99      0.92      0.95       100
weighted avg       0.99      0.99      0.99       100
 [[94  0]
 [ 1  5]]


In [58]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_2_expert']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)

y_pred = gb_classifier.predict(X_test)
y_prob = gb_classifier.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 0.98
ROC AUC: 0.7040816326530612
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        98
           1       0.00      0.00      0.00         2

    accuracy                           0.98       100
   macro avg       0.49      0.50      0.49       100
weighted avg       0.96      0.98      0.97       100
 [[98  0]
 [ 2  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [60]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_1_step']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_classifier = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
dt_classifier.fit(X_train, y_train)


y_pred = dt_classifier.predict(X_test)
y_prob = dt_classifier.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 0.92
ROC AUC: 0.8971631205673759
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.93      0.96        94
           1       0.42      0.83      0.56         6

    accuracy                           0.92       100
   macro avg       0.70      0.88      0.76       100
weighted avg       0.95      0.92      0.93       100
 [[87  7]
 [ 1  5]]


In [61]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(slavery['cleaned_text'])
y = slavery['Slavery_2_expert']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_classifier = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
dt_classifier.fit(X_train, y_train)


y_pred = dt_classifier.predict(X_test)
y_prob = dt_classifier.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Classification Report:\n",
      classification_report(y_test, y_pred),
      confusion_matrix(y_test, y_pred))

Accuracy: 0.97
ROC AUC: 0.49489795918367346
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98        98
           1       0.00      0.00      0.00         2

    accuracy                           0.97       100
   macro avg       0.49      0.49      0.49       100
weighted avg       0.96      0.97      0.97       100
 [[97  1]
 [ 2  0]]
