# Import required libraries

In [None]:
!pip install transformers torch accelerate tensorflow-hub bert-tensorflow tensorflow tqdm bert-score

In [None]:
!pip install datasets

In [None]:
!pip install rouge-score

In [None]:
!pip install sacrebleu

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, MarianMTModel, MarianTokenizer, BertConfig
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import pipeline
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from datetime import datetime
from torch.utils.data import DataLoader
import re
import nltk
from nltk.corpus import wordnet
import random
from tqdm import tqdm
import concurrent.futures
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import json
import numpy as np
from bert_score import score
from google.colab import drive
drive.mount("/content/drive")


# Importing full posts

In [None]:
# Fetch the NLTK 'punkt' resource, then read the per-user dataset from Drive.
nltk.download('punkt')
dataset_path = "/content/drive/My Drive/Diss_Dataset/dataset500_cleaned.csv"
data = pd.read_csv(dataset_path)

# Expose the first three CSV columns, selected by position, under fixed names.
for column_position, column_name in enumerate(("userid", "posts", "label")):
    data[column_name] = data.iloc[:, column_position]

# Each 'posts' cell is assumed to be text shaped like "['first', 'second']"
# (TODO confirm against the CSV): drop the surrounding brackets, split on the
# quote-comma-quote separator, then trim leftover quotes/spaces from each piece.
bracketless_posts = data['posts'].str.strip('[]')
split_posts = bracketless_posts.str.split("', '")
data['posts'] = split_posts.apply(lambda pieces: [piece.strip("' ") for piece in pieces])

def clean_post(post):
    """Return a lower-cased, stripped-down version of a single post.

    Removes, in order: runs of asterisks, the ``&gt;`` HTML entity, non-ASCII
    characters, double-quoted passages, parenthesised passages (plus stray
    parentheses) and square-bracketed passages (plus stray brackets).

    Args:
        post: The raw post text.

    Returns:
        The cleaned text, lower-cased, with single spaces between words and
        no leading or trailing whitespace.
    """
    post = re.sub(r'\*+', '', post)
    # Normalise whitespace early so that non-ASCII whitespace (e.g. a
    # non-breaking space) becomes a plain space instead of being deleted by
    # the non-ASCII filter below, which would glue two words together.
    post = re.sub(r'\s+', ' ', post)
    post = re.sub(r'&gt;', '', post)
    post = re.sub(r'[^\x00-\x7F]+', '', post)
    post = re.sub(r'"[^"]*"', '', post)
    post = re.sub(r'\([^)]*\)', '', post)
    post = re.sub(r'[()]', '', post)
    post = re.sub(r'\[[^\]]*\]', '', post)
    post = re.sub(r'[\[\]]', '', post)
    # Deleting a passage from the middle of a sentence leaves the spaces on
    # both sides behind; collapse them again so the result is single-spaced.
    post = re.sub(r'\s+', ' ', post)
    return post.strip().lower()

# Clean every post of every user, then merge each user's cleaned posts into a
# single space-separated string.
data["clean_posts"] = data["posts"].apply(lambda posts: list(map(clean_post, posts)))
data['combined_posts'] = data['clean_posts'].apply(' '.join)


# Spot-check: show the combined text of the row labelled 1.
print(data['combined_posts'][1])

# Importing top 5 sentences, risk rating and number of supportive sentences

In [None]:
# Drive locations of the top-sentences table and the risk/supportiveness table.
top_5_path = "/content/drive/My Drive/Diss_Dataset/top_sentences_for_all_users.csv"
user_risk_supportiveness_path = "/content/drive/My Drive/Diss_Dataset/user_risk_with_supportiveness.csv"

# Load both tables into DataFrames.
top_5_df = pd.read_csv(top_5_path)
risk_supportiveness_df = pd.read_csv(user_risk_supportiveness_path)

# Creating meta-information

In [None]:
def create_meta_info(user_id, risk_df, top_5_df):
    """Build the one-sentence meta-information string for a user.

    Args:
        user_id: Value matched against the ``userid`` column of ``risk_df``.
        risk_df: DataFrame with ``userid``, ``risk_rating`` and
            ``supportiveness_ratio`` columns; the first matching row is used.
        top_5_df: Kept in the signature for backward compatibility. The
            sentence built here does not include any top-sentence text, so
            this argument is not read.

    Returns:
        A sentence stating the risk label and the percentage of supportive
        sentences. Ratings 5..1 map to "Very High".."Very Low"; any other
        rating maps to "No". When ``user_id`` is not present in ``risk_df``
        both the label and the percentage are reported as "Unknown".
    """
    risk_labels = {5: "Very High", 4: "High", 3: "Medium", 2: "Low", 1: "Very Low"}
    risk_info = risk_df[risk_df['userid'] == user_id]

    if risk_info.empty:
        # No numeric ratio is available, so do not attempt any arithmetic on
        # it; report both fields as "Unknown".
        risk = "Unknown"
        supportive_percentage = "Unknown"
    else:
        risk_val = risk_info['risk_rating'].values[0]
        risk = risk_labels.get(risk_val, "No")
        supportiveness_ratio = risk_info['supportiveness_ratio'].values[0]
        # Built-in round() works for both NumPy scalars and plain floats.
        supportive_percentage = round(float(supportiveness_ratio) * 100, 2)

    meta_info = f"The author indicates {risk} suicidal risk, with {supportive_percentage}% of their sentences supporting other users."
    return meta_info

# Loading the chosen Mistral model, and prompting method

In [None]:
# Load the tokenizer and causal language model from the checkpoint directory
# on Drive; weights are loaded as bfloat16 with automatic device placement.
model_name = "/content/drive/My Drive/Diss_Dataset/Mistral7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
# Instruction template wrapped in [INST] ... [/INST] tags. {posts} and
# {meta_info} are placeholders filled in with str.format before tokenisation.
# The string is not raw, so each "\n" is a real newline escape and adds an
# extra blank line after the line it ends.
prompt = """
[INST]
In one paragraph, summarise the provided text from the author on Reddit:\n
#Full input text:
{posts}\n
#Meta-Information:
{meta_info}\n
Summary: [/INST]

"""

# Generating a response from the model

In [None]:
def truncate_text(text, max_input_tokens=1800):
    """Cap ``text`` at ``max_input_tokens`` tokens of the module-level tokenizer.

    Args:
        text: The text to shorten.
        max_input_tokens: Largest number of tokens to keep.

    Returns:
        ``text`` unchanged when it is already within the limit; otherwise the
        string rebuilt from its first ``max_input_tokens`` tokens.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_input_tokens:
        return text
    kept_tokens = tokens[:max_input_tokens]
    return tokenizer.convert_tokens_to_string(kept_tokens)

# Number of tokens subtracted from the 2048-token budget when the posts are
# truncated in the generation loop. Presumably this leaves room for the prompt
# scaffolding, meta-information and generated summary (TODO confirm).
reserved_tokens = 512

def extract_assistant_response(text):
    """Return the text that follows the first "[/INST]" marker.

    Args:
        text: Decoded model output.

    Returns:
        The stripped segment between the first marker and the next marker (or
        the end of the text). When no marker is present, the whole text
        stripped of surrounding whitespace.
    """
    assistant_marker = "[/INST]"
    segments = text.split(assistant_marker)
    # More than one segment means the marker occurred at least once.
    chosen = segments[1] if len(segments) > 1 else text
    return chosen.strip()

# Generate one summary per user listed in the risk/supportiveness table and
# save the collected summaries to Drive.
results = []

for user_id in risk_supportiveness_df['userid'].unique():

    user_posts = data.loc[data['userid'] == user_id, 'combined_posts']
    if user_posts.empty:
        # A user with no posts in `data` would otherwise raise IndexError and
        # abort the run before any summary has been written to disk.
        print(f"Skipping user {user_id}: no posts found in the dataset.")
        continue
    full_text = user_posts.values[0]

    meta_info = create_meta_info(user_id, risk_supportiveness_df, top_5_df)

    # Keep the posts within the token budget left after the reserved tokens.
    truncated_text = truncate_text(full_text, max_input_tokens=2048 - reserved_tokens)

    input_text = prompt.format(posts=truncated_text, meta_info=meta_info)
    # The model is loaded with device_map="auto", so follow the device it was
    # actually placed on instead of assuming CUDA is available.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    # The decoded sequence still contains the prompt; keep only the reply.
    summary = extract_assistant_response(tokenizer.decode(outputs[0], skip_special_tokens=True))

    results.append({
        "userid": user_id,
        "summary": summary,
    })

results_df = pd.DataFrame(results)
results_df.to_csv("/content/drive/My Drive/Diss_Dataset/all_summaries.csv", index=False)

In [None]:
def extract_assistant_response(text):
    """Return the first line of the text that follows the first "Summary:" marker.

    This definition replaces the earlier function of the same name.

    Args:
        text: A previously extracted summary string.

    Returns:
        The stripped first line of the segment after the first marker (up to
        the next marker, if any). When no marker is present, the stripped
        first line of the stripped text.
    """
    assistant_marker = "Summary:"
    segments = text.split(assistant_marker)
    candidate = (segments[1] if len(segments) > 1 else text).strip()
    # Keep only what precedes the first line break.
    first_line, _, _ = candidate.partition('\n')
    return first_line.strip()

# Re-run the extraction on every stored summary, updating each record in place.
for record in results:
    record['summary'] = extract_assistant_response(record['summary'])

# Rebuild the DataFrame from the updated records and preview the first rows.
results_df = pd.DataFrame(results)
print(results_df.head())

In [None]:
# Write the post-processed summaries to a separate CSV on Drive, without the
# DataFrame index column.
results_df.to_csv("/content/drive/My Drive/Diss_Dataset/summary_by_user_test.csv", index=False)