In [1]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# --- Pretrained models used by the analysis functions below ---

# Language identification for incoming comments.
language_detector = pipeline(
    task='text-classification',
    model='papluca/xlm-roberta-base-language-detection',
)

# Translation model: tokenizer and weights are loaded from the same checkpoint.
NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"
translator_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_CHECKPOINT)

# Russian-language toxicity and sentiment classifiers.
rus_toxicity_classifier = pipeline(
    task='text-classification',
    model='cointegrated/rubert-tiny-toxicity',
)
rus_sentiment_classifier = pipeline(
    task='sentiment-analysis',
    model='r1char9/rubert-base-cased-russian-sentiment',
)

# English-language toxicity and sentiment classifiers.
en_toxicity_classifier = pipeline(
    task='text-classification',
    model='martin-ha/toxic-comment-model',
)
en_sentiment_classifier = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

# Zero-shot classifier used to assign a comment type.
zero_shot_classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
)

print("All models are succesfully loaded!")

# Lower-case Russian word stems; a comment containing any of them as a
# substring is treated as spam by detect_spam_by_rules.
SPAM_MARKERS = {
    "подпис",
    "канал",
    "заход",
    "переход",
    "профил",
    "ссылк",
    "заработ",
    "крипт",
}

# Matches http:// or https:// links as well as bare "www." addresses.
URL_PATTERN = re.compile(
    r'https?://\S+|www\.\S+'
)


def clean_text(text: str) -> str:
    """Remove punctuation from ``text`` and normalise its whitespace.

    Every character that is neither a word character nor whitespace is
    dropped. The remaining tokens are joined with single spaces, so leading,
    trailing and repeated whitespace disappears.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned text; an empty string when nothing but punctuation and
        whitespace was present.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    tokens = without_punctuation.split()
    return " ".join(tokens)

def is_kazakh(text: str) -> bool:
    """Tell whether ``text`` contains at least one letter from the Kazakh set.

    The check is a plain character lookup: the function returns ``True`` as
    soon as any character of ``text`` belongs to the letter set below, and
    ``False`` otherwise (including for an empty string).

    NOTE(review): the letters "І"/"і" also occur in other Cyrillic alphabets,
    so such text may be reported as Kazakh too; confirm this is acceptable.
    """
    kazakh_letters = frozenset("ӘәҒғҚқҢңӨөҰұҮүҺһІі")
    for symbol in text:
        if symbol in kazakh_letters:
            return True
    return False

def translate_to_russian(text: str, src_lang_code: str = "kaz_Cyrl") -> str:
    """Translate ``text`` into Russian with the module-level translation model.

    Args:
        text: Text to translate.
        src_lang_code: Language code of ``text`` as understood by the
            translation tokenizer (defaults to Kazakh in Cyrillic script).

    Returns:
        The Russian translation. Translation is best-effort: if any step
        raises, the error is printed and the original ``text`` is returned
        unchanged.
    """
    try:
        translator_tokenizer.src_lang = src_lang_code
        encoded_text = translator_tokenizer(text, return_tensors="pt")
        # NLLB tokenizers store language codes as regular vocabulary tokens.
        # `get_lang_id` belongs to the M2M100 tokenizer and is missing here,
        # so calling it raised AttributeError, which the handler below
        # swallowed - the function silently returned untranslated text.
        target_lang_id = translator_tokenizer.convert_tokens_to_ids("rus_Cyrl")
        generated_tokens = translator_model.generate(**encoded_text, forced_bos_token_id=target_lang_id)
        return translator_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        # Deliberate fallback: downstream analysis continues on the source text.
        print(f"Translation error!: {e}")
        return text

def detect_spam_by_rules(text: str) -> bool:
    """Apply the rule-based spam heuristics to ``text``.

    The text is lower-cased first. It is considered spam when it contains a
    match for ``URL_PATTERN`` or any entry of ``SPAM_MARKERS`` as a substring.

    Returns:
        ``True`` if either rule fires, ``False`` otherwise.
    """
    lowered = text.lower()
    has_url = URL_PATTERN.search(lowered) is not None
    return has_url or any(marker in lowered for marker in SPAM_MARKERS)

def get_moderation_verdict(text: str, language: str) -> str:
    """Return the moderation verdict for ``text``.

    Args:
        text: Text to moderate. For Kazakh comments the caller passes the
            Russian translation.
        language: Detected language code. ``'ru'`` and ``'kk'`` use the
            Russian toxicity classifier, ``'en'`` the English one.

    Returns:
        ``'spam'`` when the rule-based spam check fires, ``'insult'`` when the
        toxicity classifier reports a toxic label, otherwise ``'ok'``. Text in
        any other language skips the toxicity check and is ``'ok'`` unless it
        is spam.
    """
    if detect_spam_by_rules(text):
        return 'spam'

    if language in ('ru', 'kk'):
        classifier = rus_toxicity_classifier
    elif language == 'en':
        classifier = en_toxicity_classifier
    else:
        # No toxicity model for this language.
        return 'ok'

    top_label = classifier(text)[0]['label']
    # The two models do not share a "toxic" label: the Russian model is
    # documented as emitting category labels (e.g. 'insult', 'obscenity')
    # instead, so comparing against 'toxic' never flagged Russian text.
    # Both models use 'non-toxic' for clean text - TODO confirm against the
    # model cards if either model is swapped.
    return 'ok' if str(top_label).lower() == 'non-toxic' else 'insult'



def analyze_comment(comment_text: str) -> dict:
    """Run language detection, moderation, sentiment and type analysis on a comment.

    Steps:
        1. Clean the text; a comment with nothing left gets a neutral result.
        2. Detect the language (the Kazakh letter check takes precedence over
           the language-detection model).
        3. Translate Kazakh comments to Russian for the downstream models.
        4. Compute the moderation verdict.
        5. Unless the comment is spam, compute sentiment (Russian/Kazakh and
           English only) and the zero-shot comment type.

    Args:
        comment_text: Raw comment text.

    Returns:
        A dict with the keys ``'text'`` (the original text), ``'language'``,
        ``'moderation_verdict'``, ``'sentiment'`` and ``'comment_type'``.
        ``'sentiment'`` and ``'comment_type'`` stay ``'N/A'`` when they were
        not computed.
    """
    cleaned_text = clean_text(comment_text)
    if not cleaned_text:
        return {
            'text': comment_text, 'language': 'unknown', 'moderation_verdict': 'ok',
            'sentiment': 'N/A', 'comment_type': 'N/A'
        }

    if is_kazakh(cleaned_text):
        detected_language = 'kk'
    else:
        lang_results = language_detector(cleaned_text, top_k=1)
        detected_language = lang_results[0]['label']

    sentiment = "N/A"
    comment_type = "N/A"
    text_to_analyze = cleaned_text

    if detected_language == 'kk':
        text_to_analyze = translate_to_russian(cleaned_text, src_lang_code="kaz_Cyrl")

    # clean_text strips ".", ":" and "/", so a URL can no longer be recognised
    # in the cleaned text. Check the raw comment first, then fall back to the
    # regular verdict on the cleaned (and possibly translated) text.
    if detect_spam_by_rules(comment_text):
        moderation_verdict = 'spam'
    else:
        moderation_verdict = get_moderation_verdict(text_to_analyze, detected_language)

    if moderation_verdict != 'spam':
        if detected_language in ['ru', 'kk']:
            sentiment = rus_sentiment_classifier(text_to_analyze)[0]['label']
        elif detected_language == 'en':
            sentiment = en_sentiment_classifier(text_to_analyze)[0]['label']

        label_map = {
            "The user asks a question for telecommunication company": "question",
            "The user complains about the service for telecommunication company": "complaint",
            "The user expresses gratitude for telecommunication company": "gratitude",
            "The user shares their opinion for telecommunication company": "feedback"
        }
        # Derive the candidate labels from the map so both always agree.
        # The previous hand-written list had its commas inside the string
        # literals, which concatenated all four labels into a single one and
        # made every comment fall back to "feedback".
        descriptive_labels = list(label_map)

        type_result = zero_shot_classifier(text_to_analyze, descriptive_labels)
        top_label = type_result['labels'][0]
        comment_type = label_map.get(top_label, "feedback")

    final_analysis = {
        'text': comment_text,
        'language': detected_language,
        'moderation_verdict': moderation_verdict,
        'sentiment': sentiment,
        'comment_type': comment_type
    }
    return final_analysis

# Printed once every analysis function in this cell has been defined.
print("\n✅ Analytic module is ready!")

ModuleNotFoundError: No module named 'transformers'

In [None]:
import json
from tqdm import tqdm

# Demo comments as (id, author, text): Russian praise, a Russian complaint,
# a spam message with a link, a Kazakh comment and an English question.
_SAMPLE_ROWS = [
    (1, "Анна", "Спасибо вам большое, все отлично работает!"),
    (2, "Иван", "Ужасный интернет, постоянно пропадает!"),
    (3, "Spammer", "Заходи на мой канал про крипту www.example.com"),
    (4, "Марат", "Керемет! Маған бәрі ұнады, рахмет сіздерге."),
    (5, "John", "Could you please tell me about your new tariffs?"),
]

sample_data = {
    "comments": [
        {"id": comment_id, "author": author, "text": text}
        for comment_id, author, text in _SAMPLE_ROWS
    ]
}

# Persist the demo comments so the processing step has an input file to read.
INPUT_JSON_PATH = 'comments_input.json'
with open(INPUT_JSON_PATH, mode='w', encoding='utf-8') as f:
    json.dump(sample_data, f, ensure_ascii=False, indent=4)

print(f"Создан тестовый файл '{INPUT_JSON_PATH}'")



# File that process_comments_from_json writes the analysed comments to.
OUTPUT_JSON_PATH = 'comments_output.json'

def process_comments_from_json(input_path: str, output_path: str):
    """Analyse every comment in a JSON file and save the enriched records.

    The input file must contain a JSON object with a ``'comments'`` list.
    Each entry that is an object with a non-empty string ``'text'`` is passed
    to ``analyze_comment`` and merged with the analysis result; any other
    entry is copied to the output unchanged. The output file holds
    ``{'results': [...]}``.

    Problems with the input (missing file, invalid JSON, unexpected
    structure, no comments) are reported with a printed message and the
    function returns without writing the output file.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path of the JSON file to write.

    Returns:
        None.
    """
    print(f"\nLoading comments: {input_path}")
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR: Input file not found at path {input_path}")
        return
    except json.JSONDecodeError:
        print(f"ERROR: Invalid JSON format in file {input_path}")
        return

    # Valid JSON can still be a list, string or number; `.get` would raise.
    if not isinstance(data, dict):
        print(f"ERROR: Expected a JSON object with a 'comments' list in file {input_path}")
        return

    comments_to_process = data.get('comments', [])
    if not comments_to_process:
        print("Couldn't find comments for analysis.")
        return
    if not isinstance(comments_to_process, list):
        print(f"ERROR: 'comments' must be a list in file {input_path}")
        return

    print(f"Found {len(comments_to_process)} comments. Starting analysis...")

    results = []
    for comment_data in tqdm(comments_to_process, desc="Comment analysis"):
        # Entries that are not objects have no text to analyse.
        comment_text = comment_data.get('text', '') if isinstance(comment_data, dict) else ''

        if isinstance(comment_text, str) and comment_text:
            analysis = analyze_comment(comment_text)
            # Analysis keys are added on top of the original fields (id, author, ...).
            results.append({**comment_data, **analysis})
        else:
            # Nothing to analyse: keep the entry as it was.
            results.append(comment_data)

    output_data = {'results': results}

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=4)
    print(f"\n✅ Analysis is complete. Results saved in file: {output_path}")

# Run the analysis on the sample input file and write the results file.
process_comments_from_json(INPUT_JSON_PATH, OUTPUT_JSON_PATH)