In [342]:
import os
import re
import pandas as pd
import numpy as np
import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from glob import glob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline

#

class TextAnalysis:
    def __init__(self, data_path, scores_path, train_dataset_path):
        self.stop_words = set(stopwords.words('english'))
        self.data_path = data_path
        self.scores_path = scores_path
        self.train_dataset_path = train_dataset_path
        self.df, self.code2prompts = self._load_and_process_data()
        self.labeled_data_df, self.clf, self.vectorizer = self._train_naive_bayes_classifier()
        self.models, self.evaluation = self._train_models_per_question()

    def _remove_stopwords(self, tokens):
        return [token for token in tokens if token.lower() not in self.stop_words]

    def _load_and_process_data(self, data_path=None):
        code2convos = dict()

        pbar = tqdm.tqdm(sorted(list(glob(self.data_path))))
        for path in pbar:
            # print(Path.cwd() / path)
            file_code = os.path.basename(path).split(".")[0]
            with open(path, "r", encoding="latin1") as fh:
                    
                # get the file id to use it as key later on
                fid = os.path.basename(path).split(".")[0]

                # read the html file
                html_page = fh.read()

                # parse the html file with bs4 so we can extract needed stuff
                soup = BeautifulSoup(html_page, "html.parser")

                # grab the conversations with the data-testid pattern
                data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
                conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

                convo_texts = []

                for i, convo in enumerate(conversations):
                    convo = convo.find_all("div", attrs={"data-message-author-role":re.compile( r"[user|assistant]") })
                    if len(convo) > 0:
                        role = convo[0].get("data-message-author-role")
                        convo_texts.append({
                                "role" : role,
                                "text" : convo[0].text
                            }
                        )
                        
                code2convos[file_code] = convo_texts

        prompts = []
        answers = []
        code2prompts = dict()
        code2answers = defaultdict(list)
        for code , convos in code2convos.items():
            user_prompts = []
            for conv in convos:
                if conv["role"] == "user":
                    prompts.append(conv["text"].lower())
                    user_prompts.append(conv["text"].lower()) # Adding the lower case version of the prompt
                else:
                    answers.append(conv["text"].lower())
                    code2answers[code].append(conv["text"].lower()) # Adding the lower case version of the answer

            code2prompts[code] = user_prompts


        # mapping prompts to answers
        code2prompt_answer_pairs = defaultdict(list)

        for code in code2convos:
            for prompt, answer in zip(code2prompts[code], code2answers[code]):
                code2prompt_answer_pairs[code].append((prompt, answer))


        code2prompt_answer_pairs["0031c86e-81f4-4eef-9e0e-28037abf9883"][0]

        # Converting the dictionary to a DataFrame
        refactored_data = []
        for code, pairs in code2prompt_answer_pairs.items():
            vectorized_pairs = [(prompt.split(), answer.split()) for prompt, answer in pairs]
            refactored_data.append({'code': code, 'prompt_answer_pairs': vectorized_pairs})

        df = pd.DataFrame(refactored_data)

        scores = self.give_codes2scores()
        

        # join the scores with the df
        df = df.merge(scores, on="code")
        df = df.sort_values(by=["grade"], ascending=False)


        return df, code2prompts
    
    def give_codes2scores(self):
        # reading the scores
        scores = pd.read_csv(self.scores_path, sep=",")
        scores["code"] = scores["code"].apply(lambda x: x.strip())
        scores = scores[["code", "grade"]]
        #drop na
        scores.dropna(inplace=True)
        return scores

    def _train_naive_bayes_classifier(self):
        labeled_data_df = pd.read_csv(self.train_dataset_path, sep="\t")
        labeled_data_df['prompt'] = labeled_data_df['prompt'].str.lower()

        X_train, X_test, y_train, y_test = train_test_split(
            labeled_data_df['prompt'],
            labeled_data_df['related_question'],
            test_size=0.2,
            random_state=42
        )

        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        clf = MultinomialNB()
        clf.fit(X_train, y_train)

        return labeled_data_df, clf, vectorizer
    
    def _train_models_per_question(self):
        new_df = pd.DataFrame(columns=["prompt", "which_question", "grade"])

        for row in self.df.itertuples():
            for prompt, answer in row.prompt_answer_pairs:
                #removing stop words
                prompt = self._remove_stopwords(prompt)
                #convert prompt to string
                promptStr = " ".join(prompt)
                #add it to a new a new df without using append
                qNo = self._predict_question_number(promptStr)

                new_df.loc[len(new_df.index)] = [promptStr, qNo, row.grade]


        # replace Nan values with the mean of the column for column grade
        new_df["grade"].fillna((new_df["grade"].mean()), inplace=True)
        self.df = new_df.copy()


        models = {}
        evaluation = {}
        for question_number in new_df['which_question'].unique():
            question_data = new_df[new_df['which_question'] == question_number]
            X = question_data['prompt']
            y = question_data['grade']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            pipeline = make_pipeline(TfidfVectorizer(), LinearRegression())
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            models[question_number] = pipeline
            evaluation[question_number] = {'MSE': mse, 'R2': r2}

        return models, evaluation
    
    def _predict_question_number(self, prompt):
        prompt_vect = self.vectorizer.transform([prompt.lower()])
        return self.clf.predict(prompt_vect)[0]
    

    def predict_with_similarity_adjustment(self, prompt):
        prompt = self._remove_stopwords(prompt.split())
        prompt = " ".join(prompt)
        question_number = self._predict_question_number(prompt)
        pipeline = self.models.get(question_number)
        if not pipeline:
            raise ValueError(f"No model found for question number {question_number}.")
        vectorizer = pipeline.named_steps['tfidfvectorizer']
        model = pipeline.named_steps['linearregression']
        prompt_vector = vectorizer.transform([prompt])
        train_vectors = vectorizer.transform(self.df[self.df['which_question'] == question_number]['prompt'])
        similarities = cosine_similarity(prompt_vector, train_vectors)
        max_similarity = np.max(similarities)
        predicted_score = model.predict(prompt_vector)[0]
        adjusted_score = predicted_score * max_similarity
        return adjusted_score, max_similarity
    
    def predict_grades_for_multiple_prompts(self, code, prompts):
        question_weights = {
            'Q1': 0.05, 'Q2': 0.15, 'Q3': 0.05, 'Q4': 0.1,
            'Q5': 0.2, 'Q6': 0.15, 'Q7': 0.2, 'Q8': 0.1
        }

        question_scores = {
            'Q1': [], 'Q2': [], 'Q3': [], 'Q4': [], 'Q5': [], 'Q6': [], 'Q7': [], 'Q8': [],
        }

        # Predict and store scores for each prompt
        for prompt in prompts:
            score, _ = self.predict_with_similarity_adjustment(prompt)
            numeric_question_number = self._predict_question_number(prompt)
            question_number = f'Q{numeric_question_number}'
            question_scores[question_number].append(score)

        

        total_weighted_score = 0
        total_weight =0

        # Calculate weighted score for each question
        for question, scores in question_scores.items():
            if scores:
                average_score = sum(scores) / len(scores)
                question_weight = question_weights[question]
                total_weight += question_weight
                weighted_score = average_score * question_weight
                total_weighted_score += weighted_score
            #else:
                #print(f"No prompt for question {question}.")
                # Calculate average of other questions if no prompt for this question
                #average_score = sum([sum(q_scores) / len(q_scores) for q_scores in question_scores.values() if (len(q_scores) > 1 )]) / len([q_scores for q_scores in question_scores.values() if (len(q_scores) > 1 )])
        

        if total_weight == 0:
            print("HTML PAGE IS 404 FOR THE HTML CODE: ", code)
            return -1
        else:
            unweighted_score = total_weighted_score / total_weight
            total_weighted_score += (1-total_weight) * unweighted_score

            

        return total_weighted_score


In [343]:

ta = TextAnalysis("data/html/*.html", "data/scores.csv", "data/labeled_data/train_dataset.csv")

100%|██████████| 127/127 [00:06<00:00, 20.27it/s]


In [344]:
code2promptsTemp = ta.code2prompts.copy()
code2scores = ta.give_codes2scores().copy()
actual_scores = []
predicted_scores = []

counter = 0
for code, prompts in code2promptsTemp.items():
    predicted_score = ta.predict_grades_for_multiple_prompts(code, prompts)
    if (predicted_score != -1):
        matching_row = code2scores[code2scores['code'] == code]
        if not matching_row.empty:
            actual_score = matching_row['grade'].iloc[0]
            # Append the actual and predicted scores to their respective lists
            actual_scores.append(actual_score)
            predicted_scores.append(predicted_score)
            #if absolute difference is greater than 10, print it
            if abs(actual_score - predicted_score) > 10:
                counter += 1
                
        else:
            print(f"Code {code} not found in code2scores")


print("\n\n\n")
print("There are ", counter, " predictions with absolute difference greater than 10")
print("There are ", len(actual_scores), " entries that are predicted, the ones are not predicted are because of 404 html page or no labeled grades for the html code")

# Calculate the evaluation metrics
mse = mean_squared_error(actual_scores, predicted_scores)
rmse = mean_squared_error(actual_scores, predicted_scores, squared=False)
mae = mean_absolute_error(actual_scores, predicted_scores)
r2 = r2_score(actual_scores, predicted_scores)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")


HTML PAGE IS 404 FOR THE HTML CODE:  139235c7-736c-4237-92f0-92e8c116832c
Code 56c6f8dd-f37c-44d2-9820-9459aa34c8af not found in code2scores
HTML PAGE IS 404 FOR THE HTML CODE:  668ad17e-0240-49f7-b5a7-d22e502554c6
HTML PAGE IS 404 FOR THE HTML CODE:  b0640e51-6879-40cb-a4f5-329f952ef99d
HTML PAGE IS 404 FOR THE HTML CODE:  da6b70d5-29f6-491a-ad46-037c77067128




There are  4  predictions with absolute difference greater than 10
There are  122  entries that are predicted, the ones are not predicted are because of 404 html page or no labeled grades for the html code
Mean Squared Error (MSE): 46.40771950767913
Root Mean Squared Error (RMSE): 6.812321154179325
Mean Absolute Error (MAE): 2.9489140918919707
R-squared (R2): 0.696626685602723


In [337]:
#### BONUS PART:
bonus_data_path = "data/bonus/*.html"

bonus_code2convos = dict()

pbar = tqdm.tqdm(sorted(list(glob(bonus_data_path))))
for path in pbar:
    # print(Path.cwd() / path)
    file_code = os.path.basename(path).split(".")[0]
    with open(path, "r", encoding="latin1") as fh:
            
        # get the file id to use it as key later on
        fid = os.path.basename(path).split(".")[0]

        # read the html file
        html_page = fh.read()

        # parse the html file with bs4 so we can extract needed stuff
        soup = BeautifulSoup(html_page, "html.parser")

        # grab the conversations with the data-testid pattern
        data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
        conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

        convo_texts = []

        for i, convo in enumerate(conversations):
            convo = convo.find_all("div", attrs={"data-message-author-role":re.compile( r"[user|assistant]") })
            if len(convo) > 0:
                role = convo[0].get("data-message-author-role")
                convo_texts.append({
                        "role" : role,
                        "text" : convo[0].text
                    }
                )
                
        bonus_code2convos[file_code] = convo_texts

prompts = []
answers = []
bonus_code2prompts = dict()
code2answers = defaultdict(list)
for code , convos in bonus_code2convos.items():
    user_prompts = []
    for conv in convos:
        if conv["role"] == "user":
            prompts.append(conv["text"].lower())
            user_prompts.append(conv["text"].lower()) # Adding the lower case version of the prompt
        else:
            answers.append(conv["text"].lower())
            code2answers[code].append(conv["text"].lower()) # Adding the lower case version of the answer

    bonus_code2prompts[code] = user_prompts




### predicting scores from prompts:
    
for code, prompts in bonus_code2prompts.items():
    predicted_score = ta.predict_grades_for_multiple_prompts(code, prompts)
    if(predicted_score != -1):
        print(code+","+str(predicted_score))
    else:
        print(code+",")


100%|██████████| 188/188 [00:14<00:00, 12.60it/s]


00941713-c3a2-4d27-81dc-cd447ace4a47,50.21326246626034
00aea02f-a95a-4c04-8be3-777461732cdf,58.99425420019716
04fdb619-d902-4e98-a5e9-a8198bfe047c,70.82884758714319
05029661-f8d8-441b-9cab-3c79f28a8b26,64.2093810359247
059a146e-a37c-498f-8c0b-5a78204249cb,45.1784109814664
06376869-829c-45db-b362-721060d01e3f,57.265305851156576
0a8f26c5-8fa3-4cfc-aeaa-d1dab54cd0c6,57.14192271103594
0dd38dd7-3351-492b-9eb2-a3c8dc411251,71.3147282977284
0eab330b-8cf3-40d9-809b-644362365461,61.73190993753535
1423aa97-d790-4497-9c24-a35507f07cf9,65.30154823908924
1426ed54-fc38-4c44-86c6-c822c71db4cd,53.20619741850841
14b4fe36-dcb4-4e35-984f-18d8b9c75f94,87.91773620644501
15fa30b7-606f-458d-ab88-8fb49ff0067d,68.01893501552145
16055384-9b2c-414e-bcae-a625624351c2,78.92528521109233
17842a8d-6619-4dd5-8243-62024dba9107,59.74887898449987
1850920c-259d-4651-9cde-132de594c92a,68.72611628217815
1eca992a-5b86-4363-9826-c111d9959ac0,67.03260078513611
1ed39041-c2bb-4c6c-80d2-7b8db55269a6,75.64684726542603
1f8a54f4-a69