# Import required libraries

In [None]:
!pip install transformers torch accelerate tensorflow-hub bert-tensorflow tensorflow tqdm

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, MarianMTModel, MarianTokenizer, BertConfig
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification
from transformers import LlamaTokenizer, LlamaForCausalLM
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from datetime import datetime
from torch.utils.data import DataLoader
import re
import nltk
from nltk.corpus import wordnet
import random
from tqdm import tqdm
import concurrent.futures
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import json
import numpy as np
from google.colab import drive
drive.mount("/content/drive")


In [None]:
from transformers import pipeline

# Re-read all sentences with labels

In [None]:
# Path of the combined, labelled sentence CSV under the mounted Drive folder.
# NOTE(review): the variable name is misspelt ("filenmame"); it is left as-is
# here because other cells may reference it - rename everywhere at once.
csv_filenmame = "/content/drive/My Drive/Diss_Dataset/all_labels_combined.csv"
# Load the whole CSV into a DataFrame. Later cells read the 'userid',
# 'sentence', 'rating' and 'predicted_supportiveness_label' columns from it.
sentences2_df = pd.read_csv(csv_filenmame)

# Import RoBERTa sentiment classifier

In [None]:
# Hugging Face checkpoint used to score the sentiment of each sentence.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Use the first GPU when one is available; otherwise fall back to the CPU
# (device=-1) so the cell does not fail on CPU-only runtimes.
sentiment_analysis = pipeline(
    'sentiment-analysis',
    model=model_name,
    device=0 if torch.cuda.is_available() else -1,
)
# Every distinct user id present in the labelled sentences.
unique_user_ids = sentences2_df['userid'].unique()
# Accumulates one DataFrame of selected sentences per user.
all_top_sentences = []

# Produce list of 5 most suicidal/negative sentences for each user

In [None]:
def _negative_score(sentence):
    """Return the classifier's score if it labels *sentence* negative, else 0."""
    # Run the model exactly once per sentence and reuse the prediction for
    # both the label check and the score.
    prediction = sentiment_analysis(sentence)[0]
    return prediction['score'] if prediction['label'].lower() == 'negative' else 0


# For each user, keep at most five non-supportive sentences, preferring the
# highest ratings and breaking ties inside the last admitted rating by the
# sentiment model's negative score.
for user_id in unique_user_ids:
    # Candidate pool: this user's sentences predicted as non-supportive
    # (label 0) with a rating above 1.
    user_sentences = sentences2_df[(sentences2_df['userid'] == user_id) &
                                   (sentences2_df['predicted_supportiveness_label'] == 0) &
                                   (sentences2_df['rating'] > 1)]

    # Walk the ratings from highest to lowest, admitting whole rating
    # categories until more than five sentences have been covered.
    category_counts = user_sentences['rating'].value_counts().sort_index(ascending=False)
    total_sentences = 0
    selected_categories = []

    for rating, count in category_counts.items():
        total_sentences += count
        selected_categories.append(rating)
        if total_sentences > 5:
            break
    if not selected_categories:
        print(f"No sentences found for user {user_id} after filtering. Skipping this user.")
        continue

    # Only the lowest admitted rating may need trimming, so only its
    # sentences are scored. Copy the slice so that adding a column does not
    # write into a view of `user_sentences`.
    lowest_category = min(selected_categories)
    lowest_category_sentences = user_sentences[user_sentences['rating'] == lowest_category].copy()
    lowest_category_sentences['negative_score'] = lowest_category_sentences['sentence'].apply(_negative_score)
    lowest_category_sentences = lowest_category_sentences.sort_values(by='negative_score', ascending=False)

    # Every sentence from the higher admitted ratings is kept; the remaining
    # slots are filled with the most negative sentences of the lowest rating.
    top_sentences = user_sentences[user_sentences['rating'].isin(selected_categories[:-1])]
    remaining_slots = 5 - len(top_sentences)
    top_sentences = pd.concat([top_sentences, lowest_category_sentences.head(remaining_slots)])
    top_sentences = top_sentences.head(5)
    all_top_sentences.append(top_sentences)

# Save results to a CSV file

In [None]:
# Stack the per-user selections into a single table and persist it to Drive
# without the DataFrame index.
final_top_sentences = pd.concat(all_top_sentences)
final_top_sentences.to_csv(
    path_or_buf='/content/drive/My Drive/Diss_Dataset/top_sentences_for_all_users.csv',
    index=False,
)

# Show the combined selection in the cell output.
print(final_top_sentences)

# Function to generate a risk rating for each user

In [None]:
def calculate_user_risk_rating(user_sentences, supportiveness_ratio):
    """Derive a single integer risk rating for one user.

    Args:
        user_sentences: DataFrame of the user's selected sentences; only its
            'rating' column is read.
        supportiveness_ratio: Fraction of the user's sentences that were
            predicted supportive.

    Returns:
        An int: 0 when there are no sentences, otherwise a value decided by
        the rules below, evaluated in order:

        1. ratio >= 0.9 and no rating of 4 or above -> 1.
        2. ratio >= 0.87 and exactly one rating of 4/5 while every other
           rating is 2 -> 1.
        3. ratio >= 0.87 and only one or two sentences -> 1.
        4. at least one rating of 4/5 -> 5 if the mean rating exceeds 4.5,
           4 if it exceeds 3.5, otherwise at least 3.
        5. more than 30% of the sentences rated 3 -> 3.
        6. otherwise the mean rating truncated to an int.
    """
    if user_sentences.empty:
        return 0

    ratings = user_sentences['rating']
    num_sentences = len(user_sentences)
    mean_rating = ratings.mean()
    num_high = int(ratings.isin([4, 5]).sum())

    # Highly supportive users with nothing rated 4 or above are low risk.
    if supportiveness_ratio >= 0.9 and ratings.max() < 4:
        return 1

    if supportiveness_ratio >= 0.87:
        # "All other ratings are 2": the sentences not rated 2 are exactly
        # the ones rated 4 or 5.
        all_other_ratings_are_2 = int((ratings != 2).sum()) == num_high
        if num_high == 1 and all_other_ratings_are_2:
            return 1
        if num_sentences in {1, 2}:
            return 1

    if num_high >= 1:
        if mean_rating > 4.5:
            return 5
        if mean_rating > 3.5:
            return 4
        # A user with any 4/5-rated sentence never scores below 3.
        return max(3, int(mean_rating))

    if (ratings == 3).sum() > num_sentences * 0.3:
        return 3
    return int(mean_rating)

# Saving user risk levels to CSV file

In [None]:
# Per-user results, keyed by user id.
user_risk_ratings = {}
user_supportiveness_ratios = {}

for user_id in sentences2_df['userid'].unique():
    # Sentences previously selected for this user (may be empty if the user
    # was skipped during selection).
    user_sentences = final_top_sentences[final_top_sentences['userid'] == user_id]
    supportive_sentences = sentences2_df[(sentences2_df['userid'] == user_id) & (sentences2_df['predicted_supportiveness_label'] == 1)]
    all_sentences = sentences2_df[sentences2_df['userid'] == user_id]

    # Share of the user's sentences predicted supportive; 0 when the user
    # has no rows, to avoid dividing by zero.
    supportive_ratio = len(supportive_sentences) / len(all_sentences) if len(all_sentences) > 0 else 0
    user_risk_ratings[user_id] = calculate_user_risk_rating(user_sentences, supportive_ratio)
    user_supportiveness_ratios[user_id] = supportive_ratio

# Both dicts were filled in the same iteration order, so their keys and
# values line up row by row.
user_risk_df = pd.DataFrame({
    'userid': list(user_risk_ratings.keys()),
    'risk_rating': list(user_risk_ratings.values()),
    'supportiveness_ratio': list(user_supportiveness_ratios.values())
})

user_risk_df.to_csv('/content/drive/My Drive/Diss_Dataset/user_risk_with_supportiveness.csv', index=False)

# Order the selected sentences by user, highest rating first, and write that
# ordered table (previously the sorted frame was built but the unsorted one
# was saved).
final_top_sentences_sorted = final_top_sentences.sort_values(by=['userid', 'rating'], ascending=[True, False])
final_top_sentences_sorted.to_csv('/content/drive/My Drive/Diss_Dataset/top_5_sentences_per_user.csv', index=False)


print(user_risk_df)