In [None]:
!pip install transformers==4.41.1

In [None]:
from kaggle_secrets import UserSecretsClient


In [None]:
import more_itertools
import torch
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from huggingface_hub import login
from transformers import pipeline
from sklearn.metrics import roc_auc_score


In [None]:
class reddit_evaluator:
    """Score Reddit comments for rule violations with a causal language model.

    The evaluator builds a few-shot moderation prompt per row, runs the model
    once per batch, and turns the next-token logits of the ``True`` / ``False``
    tokens into a violation probability.

    Attributes:
        model_name: Hugging Face model identifier passed to the constructor.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Causal LM loaded for ``model_name``.
        rawdata: The frame currently being scored (set by
            ``huggingFace_llmclassfier``); ``None`` until then.
        final_data: The last submission frame (``row_id``, ``rule_violation``).
    """

    def __init__(self,model):
        """Store the model name and immediately load tokenizer and model.

        Args:
            model: Hugging Face model identifier, e.g.
                ``"mistralai/Mistral-7B-Instruct-v0.2"``.
        """
        self.model_name = model
        # Instantiate an empty frame; the bare class `pd.DataFrame` is not a frame.
        self.final_data = pd.DataFrame()
        self.rawdata = None
        self.load_model()

    def HFtokenloader(self):
        """Log in to the Hugging Face Hub using the ``HF_TOKEN`` Kaggle secret."""
        user_secrets = UserSecretsClient()
        hf_token = user_secrets.get_secret("HF_TOKEN")  # label of the Kaggle secret
        login(hf_token)

    def load_model(self):
        """Authenticate, then load the tokenizer and the model for ``model_name``."""
        self.HFtokenloader()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto"      # automatically uses GPU/CPU
        )

    def get_emotion_sentiment(self):
        """Return the ``bhadresh-savani/bert-base-uncased-emotion`` pipeline."""
        return pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", return_all_scores=True)

    def get_domain_sentiment(self):
        """Return the ``unitary/toxic-bert`` text-classification pipeline."""
        return pipeline("text-classification", model="unitary/toxic-bert", return_all_scores=True)

    def get_sentiment(self,sent_model, input: pd.Series):
        """Summarise the classifier scores for one row as text.

        Args:
            sent_model: Callable returning, for one string, a list whose first
                item is a list of ``{'label': ..., 'score': ...}`` dicts.
            input: Row providing the ``subreddit`` field that is classified.

        Returns:
            One ``"<label> : <score>"`` entry per line.
        """
        # NOTE(review): the classifier is run on the subreddit name, not on the
        # comment body -- confirm this is the intended input.
        scores = sent_model(input['subreddit'])[0]
        # One entry per line; joining with "" ran label/score pairs together.
        return "\n".join(f"{entry['label']} : {entry['score']}" for entry in scores)

    def get_sentimentdata(self):
        """Add ``emotion_pipe`` and ``domain_pipe`` summary columns to ``rawdata``."""
        emotion_pipe = self.get_emotion_sentiment()
        domain_pipe = self.get_domain_sentiment()
        emotion_pipedata = []
        domain_pipedata = []
        # Rows are classified one at a time, so no batching is needed here.
        for _, row in self.rawdata.iterrows():
            emotion_pipedata.append(self.get_sentiment(emotion_pipe, row))
            domain_pipedata.append(self.get_sentiment(domain_pipe, row))
        self.rawdata['emotion_pipe'] = emotion_pipedata
        self.rawdata['domain_pipe'] = domain_pipedata

    @staticmethod
    def _quote(text):
        """Prefix every line of ``text`` with ``"| "``."""
        return "\n".join("| " + line for line in text.split('\n'))

    @staticmethod
    def _as_text(value):
        """Return a string unchanged; newline-join the items of any other iterable."""
        if isinstance(value, str):
            return value
        return "\n".join(str(item) for item in value)

    def prompt(self,input: pd.Series):
        """Build the few-shot moderation prompt for one row.

        Args:
            input: Row with ``subreddit``, ``rule``, the four example comments,
                ``body``, ``emotion_pipe`` and ``domain_pipe``.

        Returns:
            The prompt text, ending right where the model's decision starts.
        """
        # NOTE(review): the turn markers below look like the Gemma chat format,
        # while the notebook instantiates a Mistral model -- confirm the template
        # matches the model actually used.
        template = """<start_of_turn>user
              You are a really experienced moderator for the subreddit /r/%s. Your job
              is to determine if the following reported comments violates the rule:
              %s

              %s
              Decision:
              True

              %s
              Decision:
              False

              %s
              Decision:
              False

              %s
              Decision:
              True

              %s

              Emotion- based Sentiments":
              %s

              Domain-based Sentiments:
              %s

              <end_of_turn>
              <start_of_turn>model\n"""
        return template % (
            input['subreddit'],
            input['rule'],
            self._quote(input['positive_example_1']),
            self._quote(input['negative_example_1']),
            self._quote(input['negative_example_2']),
            self._quote(input['positive_example_2']),
            self._quote(input['body']),
            # The summaries are strings; "\n".join() on a string would place
            # every single character on its own line.
            self._as_text(input['emotion_pipe']),
            self._as_text(input['domain_pipe']),
        )

    def get_submission(self):
        """Write ``row_id`` / ``rule_violation`` to ``submission.csv``.

        The frame is also kept on ``self.final_data``.
        """
        # rename() returns a new frame, so the column slice is never mutated in place.
        self.final_data = self.rawdata[["row_id","response"]].rename(
            columns={"response":"rule_violation"}
        )
        self.final_data.to_csv("submission.csv", index=False)

    def predvalidation(self,threshold =0.5):
        """Threshold the responses and report ROC AUC against ``rule_violation``.

        Args:
            threshold: Responses strictly above this value count as violations.

        Returns:
            The ROC AUC of the thresholded predictions.
        """
        # Honour the argument; the cut-off used to be hard-coded to 0.46.
        self.rawdata["pred"] = self.rawdata["response"] > threshold
        # NOTE(review): AUC is computed on the thresholded labels; scoring the raw
        # "response" probabilities is the usual choice -- confirm intent.
        auc_score = roc_auc_score(self.rawdata["rule_violation"], self.rawdata["pred"] )
        print(f"The ROC AUC score is: {auc_score}")
        return auc_score

    def huggingFace_llmclassfier(self, data, traindata =0):
        """Score every row of ``data`` and store the result in a ``response`` column.

        ``data`` is kept as ``self.rawdata`` and modified in place: the sentiment
        columns and ``response`` are added to it.

        Args:
            data: Frame with the columns read by ``prompt`` (plus ``row_id`` when a
                submission is written).
            traindata: ``0`` writes ``submission.csv``; any other value skips it.

        Returns:
            The submission frame when ``traindata == 0``, otherwise the scored
            input frame.

        Raises:
            ValueError: If ``True`` or ``False`` is not a single vocabulary token.
        """
        self.rawdata = data
        responses = []
        device = "cuda" if torch.cuda.is_available() else "cpu"

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # The decision is read from the last position of every sequence, so padding
        # must go on the left; otherwise shorter prompts are scored at a pad token.
        self.tokenizer.padding_side = "left"
        self.get_sentimentdata()

        vocab = self.tokenizer.get_vocab()
        token_ids = [vocab.get(word) for word in ['True', 'False']]
        unk_id = self.tokenizer.unk_token_id
        # A missing word yields None here, so the intended ValueError is raised
        # instead of a KeyError from the dictionary lookup.
        if any(token_id is None or token_id == unk_id for token_id in token_ids):
            raise ValueError('One of the target classes is not in the vocabulary.')

        for batch in more_itertools.batched(self.rawdata.iterrows(), 4):
            prompts = [self.prompt(row) for _, row in batch]
            # NOTE(review): truncation at 512 tokens may drop part of the prompt --
            # confirm which side should be truncated.
            pre = self.tokenizer(text=prompts, return_tensors="pt", padding=True,
                            truncation=True,
                            max_length=512).to(device)
            with torch.no_grad():
                outputs = self.model(**pre)
            # Next-token logits restricted to the 'True' / 'False' tokens.
            logits = outputs.logits[:, -1, token_ids]
            probabilities = torch.softmax(logits, dim=-1)
            # Column 0 is the probability mass on 'True'.
            responses.extend(probabilities[:, 0].tolist())

        self.rawdata["response"] = responses
        if traindata==0:
            self.get_submission()
            return self.final_data
        # No submission was built: hand back the scored frame, not a stale one.
        return self.rawdata



In [None]:
# Build the evaluator around Mistral-7B-Instruct; the constructor logs in to the
# Hugging Face Hub and loads the tokenizer and the model.
HF_mistrial = reddit_evaluator("mistralai/Mistral-7B-Instruct-v0.2")


In [None]:
# pandas is already imported in the imports cell; the repeat here is harmless.
import pandas as pd
# Score the competition test set. With the default traindata=0 the call also
# writes submission.csv through get_submission().
test_data = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
test_analysis = HF_mistrial.huggingFace_llmclassfier(test_data)

In [19]:
# Save the submission frame. get_submission() already wrote this file during the
# scoring call above, so this re-saves the same data.
test_analysis.to_csv("submission.csv", index=False)