In [1]:
import logging
import pandas as pd
import numpy as np
import click
import torch
import transformers
import os
import re
import shutil
import subprocess
import requests
import pathlib as Path
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline 
import llama_cpp as LlamaCpp
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate 
from langchain.chains import LLMChain 
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM 
from transformers import GenerationConfig 
from transformers import LlamaForCausalLM 
from transformers import LlamaTokenizer 
from transformers import LongformerTokenizer 
from transformers import pipeline 
import rouge
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tqdm as tqdm
from termcolor import colored 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face access token, read from the HF_TOKEN environment variable.
# A token must never be written into the notebook itself: anyone who can read
# the file can use it. The token that used to be hardcoded here is exposed and
# should be revoked. `token` is None when the variable is unset.
token = os.environ.get("HF_TOKEN")

In [None]:
# Fetch the Llama-2 7B chat tokenizer, then its weights, straight from the Hub,
# authenticating with the notebook-level `token`.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer, model = (
    loader.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=token)
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

: 

In [None]:
def load_model(device_type, model_id, model_basename=None):
    """Load a local LLM and wrap it for use with LangChain.

    The loading path is chosen from the arguments:

    * ``model_basename`` contains ``".ggml"``: the file is fetched with
      ``hf_hub_download`` and a ``LlamaCpp`` instance is returned directly.
    * any other ``model_basename``: quantized weights are loaded with
      ``AutoGPTQForCausalLM.from_quantized`` (always placed on ``cuda:0``).
    * no ``model_basename``: full weights are loaded with
      ``AutoModelForCausalLM`` when ``device_type`` is ``"cuda"``, otherwise
      with ``LlamaForCausalLM`` / ``LlamaTokenizer``.

    Every Hub call made through transformers / huggingface_hub passes the
    notebook-level ``token`` so gated or private repositories work on all
    paths, not only on some of them.

    Args:
        device_type: Device name; compared case-insensitively with "cuda"
            and "mps".
        model_id: Hugging Face Hub repository id.
        model_basename: Optional weight file name inside the repository.

    Returns:
        A ``LlamaCpp`` instance for GGML files, otherwise a
        ``HuggingFacePipeline`` wrapping a text-generation pipeline.
    """
    logging.info(f"Loading Model: {model_id}, on: {device_type}")
    logging.info("This action can take a few minutes!")

    if model_basename is not None:
        if ".ggml" in model_basename:
            logging.info("Using Llamacpp for GGML quantized models")
            model_path = hf_hub_download(repo_id=model_id, filename=model_basename, token=token)
            max_ctx_size = 4096
            kwargs = {
                "model_path": model_path,
                "n_ctx": max_ctx_size,
                "max_tokens": max_ctx_size,
            }
            device = device_type.lower()
            if device in ("mps", "cuda"):
                # NOTE(review): 1000 presumably means "offload every layer" -- confirm.
                kwargs["n_gpu_layers"] = 1000
            if device == "cuda":
                kwargs["n_batch"] = max_ctx_size
            return LlamaCpp(**kwargs)

        else:
            logging.info("Using AutoGPTQForCausalLM for quantized models")

            if ".safetensors" in model_basename:
                # Remove the ".safetensors" ending if present
                model_basename = model_basename.replace(".safetensors", "")

            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
            logging.info("Tokenizer loaded")

            # The device is fixed to cuda:0 here regardless of device_type.
            model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                model_basename=model_basename,
                use_safetensors=True,
                trust_remote_code=True,
                device="cuda:0",
                use_triton=False,
                quantize_config=None,
            )
    elif device_type.lower() == "cuda":
        logging.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
        logging.info("Tokenizer loaded")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            token=token,
            # max_memory={0: "15GB"} # Uncomment this line if you encounter CUDA out of memory errors
        )
        model.tie_weights()
    else:
        logging.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(model_id, token=token)
        model = LlamaForCausalLM.from_pretrained(model_id, token=token)

    generation_config = GenerationConfig.from_pretrained(model_id, token=token)

    # Create a pipeline for text generation
    # NOTE(review): temperature=0 is combined with top_p; confirm the installed
    # transformers version accepts a zero temperature with this generation config.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
        token=token,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    logging.info("Local LLM Loaded")

    return local_llm

In [None]:
# Runtime configuration: use the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE_TYPE = "cuda"
else:
    DEVICE_TYPE = "cpu"
SHOW_SOURCES = True
logging.info(f"Running on: {DEVICE_TYPE}")
logging.info(f"Display Source Documents set to: {SHOW_SOURCES}")

# Hub repository and weight file handed to load_model().
model_id = "TheBloke/Llama-2-7B-Chat-GGML"
model_basename = "llama-2-7b-chat.ggmlv3.q4_0.bin"
LLM = load_model(DEVICE_TYPE, model_id, model_basename)

llama.cpp: loading model from /Users/minijain/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGML/snapshots/76cd63c351ae389e1d4b91cab2cf470aab11864b/llama-2-7b-chat.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.07 MB
llama_model_load_internal: mem required  = 5407.71 MB (+ 1026.00 MB per state)
llama_new_context_with_model: kv self size  = 2048.00 MB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI =

In [None]:
# Tokenizer from the `allenai/longformer-base-4096` checkpoint; count_tokens()
# reads this notebook-level name. Note that this rebinds `tokenizer`, which an
# earlier cell also assigns.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096", token = token)

def clean_text(text):
    """Normalise raw article text.

    Removes every character that is not an ASCII letter, a digit, whitespace,
    a period or one of the brackets ``()[]{}``; lower-cases what remains; and
    collapses each whitespace run into a single space, trimming both ends.
    """
    kept = re.sub(r'[^A-Za-z0-9\s.\(\)\[\]\{\}]+', '', text)
    words = kept.lower().split()
    return ' '.join(words)

def count_tokens(text):
    """Return how many token ids the notebook-level ``tokenizer`` yields for ``text``.

    The tokenizer is asked to add its special tokens, so they are counted too.
    """
    return len(tokenizer.encode(text, add_special_tokens=True))

In [41]:
import pandas as pd
import numpy as np

In [42]:
# Below is the dataset annotated by Jay.
# The path is relative to the notebook's working directory.
data = pd.read_csv('final_dataset_06102022 2.csv')

In [43]:
# Only the last expression of a cell is rendered, so the head() preview is
# discarded and just the shape is shown (recorded output: (846, 96)).
data.head(5)
data.shape

(846, 96)

In [44]:
strs = ["end his life", "end her life", "end my life", "end their lives", "ended his life", "ended her life", "ended my life", "ended their lives", "ending his life", "ending her life", "ending my life", "ending their lives", "ends his life", "ends her life", "ends my life", "ends their lives", "kill oneself", "kill himself", "kill herself", "kill theirselves", "kill myself", "killed oneself", "killed himself", "killed herself", "killed theirselves", "killed myself", "killing oneself", "killing himself", "killing herself", "killing theirselves", "killing myself", "kills oneself", "kills himself", "kills herself", "kills theirselves", "kills myself", "murder oneself", "murder himself", "murder herself", "murder theirselves", "murder myself", "murdered oneself", "murdered himself", "murdered herself", "murdered theirselves", "murdered myself", "murdering oneself", "murdering himself", "murdering herself", "murdering theirselves", "murdering myself", "murders oneself", "murders himself", "murders herself", "murders theirselves", "murders myself", "poison oneself", "poison himself", "poison herself", "poison theirselves", "poison myself", "poisoned oneself", "poisoned himself", "poisoned herself", "poisoned theirselves", "poisoned myself", "poisoning oneself", "poisoning himself", "poisoning herself", "poisoning theirselves", "poisoning myself", "poisons oneself", "poisons himself", "poisons herself", "poisons theirselves", "poisons myself", "drown oneself", "drown himself", "drown herself", "drown theirselves", "drown myself", "drowned oneself", "drowned himself", "drowned herself", "drowned theirselves", "drowned myself", "drowning oneself", "drowning himself", "drowning herself", "drowning theirselves", "drowning myself", "drowns oneself", "drowns himself", "drowns herself", "drowns theirselves", "drowns myself", "strangle oneself", "strangle himself", "strangle herself", "strangle theirselves", "strangle myself", "strangled oneself",  "strangled himself", 
"strangled herself", "strangled theirselves", "strangled myself", "strangling oneself", "strangling himself", "strangling herself", "strangling theirselves", "strangling myself", "strangles oneself", "strangles himself", "strangles herself", "strangles theirselves", "strangles myself", "suffocate oneself", "suffocate himself", "suffocate herself", "suffocate theirselves", "suffocate myself", "suffocated oneself", "suffocated himself", "suffocated herself", "suffocated theirselves", "suffocated myself", "suffocating oneself", "suffocating himself", "suffocating herself", "suffocating theirselves", "suffocating myself", "suffocates oneself", "suffocates himself", "suffocates herself", "suffocates theirselves", "suffocates myself", "jump to his death", "jump to her death", "jump to their death", "jump to my death", "jumped to his death", "jumped to her death", "jumped to their death", "jumped to my death", "jumping to his death", "jumping to her death", "jumping to their death", "jumping to my death", "jumps to his death", "jumps to her death", "jumps to their death", "jumps to my death", "hang himself", "hang oneself", "hang herself", "hang themselves", "hang myself", "hung himself", "hung oneself", "hung herself", "hung themselves", "hung myself", "hanging himself", "hanging oneself", "hanging herself", "hanging themselves", "hanging myself", "hangs himself", "hangs oneself", "hangs herself", "hangs themselves", "hangs myself", "hungs himself", "hungs oneself", "hungs herself", "hungs themselves", "hungs myself", "shoot oneself", "shoot himself", "shoot herself", "shoot myself", "shoot theirselves", "shot oneself", "shot himself", "shot herself", "shot myself", "shot theirselves", "shooting oneself", "shooting himself", "shooting herself", "shooting myself", "shooting theirselves", "shoots oneself", "shoots himself", "shoots herself", "shoots myself", "shoots theirselves", "shots oneself", "shots himself", "shots herself", "shots myself", "shots theirselves", "die 
by his hand", "die by his own hand", "die by her hand", "die by her own hand", "die by their hands", "die by their own hands", "die by my hand", "die by my own hand", "take his life", "take his own life", "take her life", "take her own life", "take their lives", "take their own lives", "take my life", "take my own life", "took his life", "took his own life", "took her life", "took her own life", "took their lives", "took their own lives", "took my life", "took my own life", "takes his life", "takes his own life", "takes her life", "takes her own life", "takes their lives", "takes their own lives", "takes my life", "takes my own life", "taking his life", "taking his own life", "taking her life", "taking her own life", "taking their lives", "taking their own lives", "taking my life", "taking my own life","cut oneself", "cut himself", "cut herself", "cut theirselves", "cut myself", "cuts oneself", "cuts himself", "cuts herself", "cuts theirselves", "cuts myself", "cutting oneself", "cutting himself", "cutting herself", "cutting theirselves", "cutting myself", "stab oneself", "stab himself", "stab herself", "stab theirselves", "stab myself", "stabs oneself", "stabs himself", "stabs herself", "stabs theirselves", "stabs myself", "stabbing oneself", "stabbing himself", "stabbing herself", "stabbing theirselves", "stabbing myself", "stabbed oneself", "stabbed himself", "stabbed herself", "stabbed theirselves", "stabbed myself", "cut his throat", "cut her throat", "cut my throat", "cuts his throat", "cuts her throat", "cuts my throat", "cutting his throat", "cutting her throat", "cutting my throat", "cut his wrist", "cut her wrist", "cut my wrist", "cuts his wrist", "cuts her wrist", "cuts my wrist", "cutting his wrist", "cutting her wrist", "cutting my wrist", "slit his throat", "slit her throat", "slit my throat", "slits his throat", "slits her throat", "slits my throat", "slitting his throat", "slitting her throat", "slitting my throat", "slit his wrist", "slit her 
wrist", "slit my wrist", "slits his wrist", "slits her wrist", "slits my wrist", "slitting his wrist", "slitting her wrist", "slitting my wrist"]

In [45]:
# Keep only rows whose 'Yes' column equals 1. This rebinds `data`, so the
# unfiltered frame is no longer available after this cell runs.
data = data[data['Yes'] == 1]

In [46]:
# Build a copy of `data` whose document names have a leading "news<0-6>_"
# prefix removed; `data` itself is left untouched.
# NOTE(review): the later cells visible here read names from `data`, not from
# `datacopy` -- confirm which one is intended.
pattern = r'^news[0-6]_'
stripped_names = data['Document name'].str.replace(pattern, '', regex=True)
datacopy = data.copy()
datacopy['Document name'] = stripped_names

In [47]:
# head(3) is not the last expression and is not printed, so only the shape
# appears in the output (recorded output: (200, 96)).
data.head(3)
print(data.shape)

(200, 96)


In [48]:
import torch  
from transformers import BertTokenizer,BertModel

In [49]:
from sentence_transformers import SentenceTransformer,util
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

# Define your sentence transformer model using CLS pooling
#model_name = 'sentence-transformers/all-MiniLM-L6-v2'
#word_embedding_model = models.Transformer(model_name)
#print(word_embedding_model.get_word_embedding_dimension())
#pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
# Sentence-embedding model; later cells call `model.encode(...)` on article
# text. This rebinds the notebook-level name `model`, which an earlier cell
# also assigns.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [50]:
# Show the encoder's maximum sequence length (recorded output: 384).
# NOTE(review): texts longer than this are presumably truncated by encode() --
# confirm, since the articles average over a thousand words.
print(model.max_seq_length)
#model.max_seq_length = 500
# Printed again so the effect of the override above is visible when it is enabled.
print(model.max_seq_length)

384
384


In [51]:
# Names of the annotation columns in `data` for each structural factor. Later
# cells use them as `data[<label>] > 0` filters and as class labels.
FinancialLabel = "Financial/Job Problem"
Legallabel = "Legal Problem"
SchoolLabel = "School or Academic Related Problem"
HealthLabel = "Lack of Access to Health/Mental Health Care"

In [52]:
# `data` was already restricted to 'Yes' == 1 in an earlier cell, so this
# filter selects the same rows (both shapes print as (200, 96)).
relevant = data[data['Yes']==1]
print(relevant.shape)
print(data.shape)

# One subset per factor: rows whose annotation column for that factor is positive.
financial_problem = data[data[FinancialLabel] > 0]
school_problem = data[data[SchoolLabel] > 0]
legal_problem = data[data[Legallabel] > 0]
health_problem = data[data[HealthLabel] > 0]
unspecified_circumstance = data[data['Unspecified Circumstance'] > 0]

(200, 96)
(200, 96)


In [53]:
# gpt-3.5-turbo-16k
# School-problem articles: read the text, embed it, and average the embeddings.
school_problem_df = pd.DataFrame(school_problem, columns=['Document group', 'Document name'])

arrayoftexts = []
for documentname in school_problem_df['Document name']:
    with open("articles/"+ documentname + ".txt") as f:
        # Only the first line of each file is kept as the article text.
        arrayoftexts.append(f.readlines()[0])

# Whitespace-separated word counts, kept as a float total like the average.
wordcount = float(sum(len(article.split()) for article in arrayoftexts))
avgwordcount = wordcount/len(arrayoftexts)

school_problem_df['Text'] = arrayoftexts

# One embedding per article, then the element-wise mean across articles.
school_problem_embeddings = [model.encode(article) for article in school_problem_df['Text']]
school_problem_averaged_embedding = np.mean(school_problem_embeddings, axis = 0)
print("Average word count: ", avgwordcount)


Average word count:  1285.6


In [54]:
# Normalise each article currently in `arrayoftexts`, then count the tokens of
# each cleaned text.
cleanedtext = list(map(clean_text, arrayoftexts))
tokencount = list(map(count_tokens, cleanedtext))

In [55]:
# Token count per cleaned article, in the same order as `cleanedtext`.
print(tokencount)

[1079, 717, 754, 3795, 1040]


In [56]:


def generate_summary(text_chunk):
    """Ask the notebook-level ``LLM`` to extract suicide-related sentences from ``text_chunk``.

    A new ``PromptTemplate`` / ``LLMChain`` pair is built on every call, and a
    progress marker is printed just before and just after the chain runs.

    Returns whatever ``LLMChain.run`` returns for the chunk.
    """
    # Prompt text sent to the model; ``{text}`` is replaced with the chunk.
    extraction_template = """
    Extract the sentences from the text that are talking about suicide or cause of suicide.
    ```{text}```
    """
    chain = LLMChain(
        prompt=PromptTemplate(template=extraction_template, input_variables=["text"]),
        llm=LLM,
    )
    print("chain called")
    result = chain.run(text_chunk)
    print("chain returned")
    return result

# chunk_size is measured in characters here, because length_function is len.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=50, length_function=len)

# One combined summary per cleaned article: each chunk is summarised in order
# and the chunk summaries are joined with newlines.
summaries = []

for text in cleanedtext:
    print("hello")
    chunks = text_splitter.split_text(text)
    print(len(chunks))
    summaries.append("\n".join(generate_summary(chunk) for chunk in chunks))
    

hello
2
chain called


KeyboardInterrupt: 

In [None]:
# Experimental exllama load. The recorded output of this cell is
# "ModuleNotFoundError: No module named 'exllama'", so it did not run in the
# environment that produced this notebook.
# NOTE(review): if it did run, it would rebind `tokenizer` and `model`, which
# other cells rely on (count_tokens() and the `model.encode` calls).
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from exllama import ExModel, ExTokenizer

tokenizer = AutoTokenizer.from_pretrained("turboderp/llama2")
model = ExModel.from_pretrained("turboderp/llama2", tokenizer=tokenizer)

ModuleNotFoundError: No module named 'exllama'

In [None]:
# Dump the raw article texts currently in `arrayoftexts` and the generated
# summaries. Both are printed in full, which can be very long.
print(arrayoftexts)
print(summaries)



In [None]:
# Legal-problem articles: read the text, embed it, and average the embeddings.
legal_problem_df = pd.DataFrame(legal_problem, columns=['Document group', 'Document name'])

arrayoftexts = []
for documentname in legal_problem_df['Document name']:
    with open("articles/"+ documentname + ".txt") as f:
        # Only the first line of each file is kept as the article text.
        arrayoftexts.append(f.readlines()[0])

# Whitespace-separated word counts, kept as a float total like the average.
wordcount = float(sum(len(article.split()) for article in arrayoftexts))
avgwordcount = wordcount/len(arrayoftexts)

legal_problem_df['Text'] = arrayoftexts

# One embedding per article, then the element-wise mean across articles.
legal_problem_embeddings = [model.encode(article) for article in legal_problem_df['Text']]
legal_problem_averaged_embedding = np.mean(legal_problem_embeddings, axis = 0)
print("Average word count: ", avgwordcount)


Average word count:  1058.4615384615386


In [None]:
# Financial-problem articles: read the text, embed it, and average the embeddings.
financial_problem_df = pd.DataFrame(financial_problem, columns=['Document group', 'Document name'])

arrayoftexts = []
for documentname in financial_problem_df['Document name']:
    with open("articles/"+ documentname + ".txt") as f:
        # Only the first line of each file is kept as the article text.
        arrayoftexts.append(f.readlines()[0])

# Whitespace-separated word counts, kept as a float total like the average.
wordcount = float(sum(len(article.split()) for article in arrayoftexts))
avgwordcount = wordcount/len(arrayoftexts)

financial_problem_df['Text'] = arrayoftexts

# One embedding per article, then the element-wise mean across articles.
financial_problem_embeddings = [model.encode(article) for article in financial_problem_df['Text']]
financial_problem_averaged_embedding = np.mean(financial_problem_embeddings, axis = 0)
print("Average word count: ", avgwordcount)


Average word count:  1497.6153846153845


In [None]:
# Health-access articles: read the text, embed it, and average the embeddings.
health_problem_df = pd.DataFrame(health_problem, columns=['Document group', 'Document name'])

arrayoftexts = []
for documentname in health_problem_df['Document name']:
    with open("articles/"+ documentname + ".txt") as f:
        # Only the first line of each file is kept as the article text.
        arrayoftexts.append(f.readlines()[0])

# Whitespace-separated word counts, kept as a float total like the average.
wordcount = float(sum(len(article.split()) for article in arrayoftexts))
avgwordcount = wordcount/len(arrayoftexts)

health_problem_df['Text'] = arrayoftexts

# One embedding per article, then the element-wise mean across articles.
health_problem_embeddings = [model.encode(article) for article in health_problem_df['Text']]
health_problem_averaged_embedding = np.mean(health_problem_embeddings, axis = 0)
print("Average word count: ", avgwordcount)


Average word count:  949.5714285714286


In [None]:
# Number of financial-problem embeddings, then the length of one embedding
# vector (recorded output: 13 and 768).
print(len(financial_problem_embeddings))
len(financial_problem_embeddings[0])

13


768

In [None]:
# Labelled reference embeddings, concatenated in the order
# financial, legal, school, health.
all_embeddings = financial_problem_embeddings + legal_problem_embeddings + school_problem_embeddings + health_problem_embeddings

# `labels[k]` is the class name of `all_embeddings[k]`: each label is repeated
# once per embedding of its class, in the same class order as above.
labels = (
    [FinancialLabel] * len(financial_problem_embeddings)
    + [Legallabel] * len(legal_problem_embeddings)
    + [SchoolLabel] * len(school_problem_embeddings)
    + [HealthLabel] * len(health_problem_embeddings)
)

In [None]:
import csv

# Row names for the averaged-embedding table, one per structural factor.
texts = ["Financial/Job Problem", "Legal Problem", "School or Academic Related Problem", "Lack of Access to Health/Mental Health Care"]
embeddingsize = len(financial_problem_averaged_embedding)
# Header row: the name column followed by one integer column per embedding dimension.
header = ["Structural Violence Factor", *range(embeddingsize)]



In [None]:
# One CSV row per factor: the factor name followed by its averaged embedding.
# The vectors are listed in the same order as `texts`.
averaged_vectors = (
    financial_problem_averaged_embedding,
    legal_problem_averaged_embedding,
    school_problem_averaged_embedding,
    health_problem_averaged_embedding,
)
embeddings_final = [
    [factor_name, *vector.tolist()]
    for factor_name, vector in zip(texts, averaged_vectors)
]

with open('embeddings.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(embeddings_final)

In [None]:
# Dataset from Ben Horne
bendata = pd.read_csv('Transfer/article_matches.csv')
# head(3) is not the last expression and is not printed, so only the shape
# appears in the output (recorded output: (19386, 3)).
bendata.head(3)
print(bendata.shape)

(19386, 3)


In [None]:
# Drop rows with no article text, then compute the average number of
# whitespace-separated words per article.
bendata = bendata[~bendata['content'].isna()]
suicide_Articles_from_ben_data = bendata['content'].tolist()

wordcount = 0.0
for art in suicide_Articles_from_ben_data:
    try:
        wordcount += len(art.split())
    except AttributeError:
        # Content without a .split() method (i.e. not a string) is shown and
        # contributes zero words. Only this error is caught, so interrupts and
        # unrelated failures are no longer silently swallowed.
        print(art)

# Guard against an empty list so the cell reports 0.0 instead of raising
# ZeroDivisionError.
article_count = len(suicide_Articles_from_ben_data)
print("Average word count: ", wordcount/article_count if article_count else 0.0)

Average word count:  1126.8050869318474


In [None]:
# Shape after rows with missing content were removed (recorded output: (19383, 3)).
print(bendata.shape)

(19383, 3)


In [None]:
import pickle

# Load sentences & embeddings from disc.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a pickle that this project produced itself.
with open('ben_embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)

# The pickle holds a mapping with the article texts and their embeddings.
stored_bendata_sentences, stored_bendata_embeddings = (
    stored_data[key] for key in ('sentences', 'embeddings')
)

In [None]:
# Check that sentences and embeddings line up one-to-one (recorded output:
# 19383 for both).
print(len(stored_bendata_sentences))
print(len(stored_bendata_embeddings))

19383
19383


In [None]:
# Cosine similarity of every stored article embedding to each factor's averaged
# embedding, computed in the order financial, legal, school, health.
(
    cos_sim_bendata_financial,
    cos_sim_bendata_legal,
    cos_sim_bendata_school,
    cos_sim_bendata_health,
) = (
    util.cos_sim(stored_bendata_embeddings, averaged_embedding)
    for averaged_embedding in (
        financial_problem_averaged_embedding,
        legal_problem_averaged_embedding,
        school_problem_averaged_embedding,
        health_problem_averaged_embedding,
    )
)

In [None]:
# Highest similarity any stored article reaches against each factor's averaged
# embedding; torch.max returns a 0-d tensor, which is printed as-is.
print("Max similarity for financial problem: ", torch.max(cos_sim_bendata_financial))
print("Max similarity for legal problem: ", torch.max(cos_sim_bendata_legal))
print("Max similarity for school problem: ", torch.max(cos_sim_bendata_school))
print("Max similarity for health problem: ", torch.max(cos_sim_bendata_health))

Max similarity for financial problem:  tensor(0.7427)
Max similarity for legal problem:  tensor(0.7710)
Max similarity for school problem:  tensor(0.8391)
Max similarity for health problem:  tensor(0.8231)


In [None]:
# print(suicide_Articles_from_ben_data[torch.argmax(cos_sim_bendata_financial).item()])
# print("next")
# print(suicide_Articles_from_ben_data[torch.argmax(cos_sim_bendata_legal).item()])
# print("next")
# print(suicide_Articles_from_ben_data[torch.argmax(cos_sim_bendata_school).item()])
# print("next")
# print(suicide_Articles_from_ben_data[torch.argmax(cos_sim_bendata_health).item()])

In [None]:
#MAJORITY VOTING BASED CLASSIFICATION

# Similarity of every stored article (rows) to every labelled reference
# embedding (columns).
cosine_scores = util.cos_sim(stored_bendata_embeddings, all_embeddings)
print(len(cosine_scores))

# One record per (article, reference) pair, built in row-major order.
pairs = [
    {'index': [i, j], 'score': score}
    for i, row in enumerate(cosine_scores)
    for j, score in enumerate(row)
]

# Highest score first; the sort is stable, so ties keep row-major order.
pairs.sort(key=lambda pair: pair['score'], reverse=True)

19383


  b = torch.tensor(b)


In [None]:
# Export the ten highest-scoring (article, labelled reference) pairs.
# Each row is written as score, label, article, so the header names the columns
# in that same order. It previously read Label, Score, Article, which swapped
# the names of the first two columns.
headerrow = ["Score", "Label", "Article"]
rows = []
for pair in pairs[0:10]:
    # i indexes the stored article, j the labelled reference embedding.
    i, j = pair['index']
    rows.append([pair['score'].item(), labels[j], stored_bendata_sentences[i]])

with open('bens_dataset_analysis_top10_highest_matches.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headerrow)
    writer.writerows(rows)


In [None]:
# print(financial_articles[np.argmax(np.array(financial_scores))])
# print("Next")
# print(legal_articles[np.argmax(np.array(legal_scores))])
# print("Next")
# print(school_articles[np.argmax(np.array(school_scores))])
# print("Next")
# print(health_articles[np.argmax(np.array(health_scores))])

In [None]:
from collections import Counter
# NOTE(review): Counter, topk and threshhold are not used in this cell; they
# are kept in case other cells rely on them.
topk = 5
threshhold = 0.5

# Column positions in `cosine_scores` that belong to each label. `labels` does
# not change inside the loop, so these are computed once instead of four times
# per article.
label_array = np.array(labels)
label_columns = [
    np.argwhere(label_array == label).flatten()
    for label in (FinancialLabel, Legallabel, SchoolLabel, HealthLabel)
]

with open('bens dataset - all class scores.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Id", "Article", FinancialLabel, Legallabel, SchoolLabel, HealthLabel])
    for i in range(len(cosine_scores)):
        articlestats = cosine_scores[i]
        # Per class, the best similarity between this article and any reference
        # embedding of that class, in header order.
        class_scores = [torch.max(articlestats[columns]).item() for columns in label_columns]
        writer.writerow([i+1, stored_bendata_sentences[i], *class_scores])
