In [1]:
import os
import re
import pandas as pd
import numpy as np
import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from glob import glob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline


class TextAnalysis:
    def __init__(self, data_path, scores_path, train_dataset_path):
        self.stop_words = set(stopwords.words('english'))
        self.data_path = data_path
        self.scores_path = scores_path
        self.train_dataset_path = train_dataset_path
        self.df, self.code2convos = self._load_and_process_data()
        self.labeled_data_df, self.clf, self.vectorizer = self._train_naive_bayes_classifier()
        self.models, self.evaluation = self._train_models_per_question()

    def _remove_stopwords(self, tokens):
        return [token for token in tokens if token.lower() not in self.stop_words]

    def _load_and_process_data(self):
        code2convos = dict()

        pbar = tqdm.tqdm(sorted(list(glob(self.data_path))))
        for path in pbar:
            # print(Path.cwd() / path)
            file_code = os.path.basename(path).split(".")[0]
            with open(path, "r", encoding="latin1") as fh:
                    
                # get the file id to use it as key later on
                fid = os.path.basename(path).split(".")[0]

                # read the html file
                html_page = fh.read()

                # parse the html file with bs4 so we can extract needed stuff
                soup = BeautifulSoup(html_page, "html.parser")

                # grab the conversations with the data-testid pattern
                data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
                conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

                convo_texts = []

                for i, convo in enumerate(conversations):
                    convo = convo.find_all("div", attrs={"data-message-author-role":re.compile( r"[user|assistant]") })
                    if len(convo) > 0:
                        role = convo[0].get("data-message-author-role")
                        convo_texts.append({
                                "role" : role,
                                "text" : convo[0].text
                            }
                        )
                        
                code2convos[file_code] = convo_texts

        prompts = []
        answers = []
        code2prompts = defaultdict(list)
        code2answers = defaultdict(list)
        for code , convos in code2convos.items():
            user_prompts = []
            for conv in convos:
                if conv["role"] == "user":
                    prompts.append(conv["text"].lower())
                    user_prompts.append(conv["text"].lower()) # Adding the lower case version of the prompt
                else:
                    answers.append(conv["text"].lower())
                    code2answers[code].append(conv["text"].lower()) # Adding the lower case version of the answer

            code2prompts[code] = user_prompts


        # mapping prompts to answers
        code2prompt_answer_pairs = defaultdict(list)

        for code in code2convos:
            for prompt, answer in zip(code2prompts[code], code2answers[code]):
                code2prompt_answer_pairs[code].append((prompt, answer))


        code2prompt_answer_pairs["0031c86e-81f4-4eef-9e0e-28037abf9883"][0]

        # Converting the dictionary to a DataFrame
        refactored_data = []
        for code, pairs in code2prompt_answer_pairs.items():
            vectorized_pairs = [(prompt.split(), answer.split()) for prompt, answer in pairs]
            refactored_data.append({'code': code, 'prompt_answer_pairs': vectorized_pairs})

        df = pd.DataFrame(refactored_data)


        # reading the scores
        scores = pd.read_csv("data/scores.csv", sep=",")
        scores["code"] = scores["code"].apply(lambda x: x.strip())

        # selecting the columns we need and we care
        scores = scores[["code", "grade"]]

        # join the scores with the df
        df = df.merge(scores, on="code")
        df = df.sort_values(by=["grade"], ascending=False)


        return df, code2convos

    def _train_naive_bayes_classifier(self):
        labeled_data_df = pd.read_csv(self.train_dataset_path, sep="\t")
        labeled_data_df['prompt'] = labeled_data_df['prompt'].str.lower()

        X_train, X_test, y_train, y_test = train_test_split(
            labeled_data_df['prompt'],
            labeled_data_df['related_question'],
            test_size=0.2,
            random_state=42
        )

        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        clf = MultinomialNB()
        clf.fit(X_train, y_train)

        return labeled_data_df, clf, vectorizer
    
    def _train_models_per_question(self):
        new_df = pd.DataFrame(columns=["prompt", "which_question", "grade"])

        for row in self.df.itertuples():
            for prompt, answer in row.prompt_answer_pairs:
                #removing stop words
                prompt = self._remove_stopwords(prompt)
                #convert prompt to string
                promptStr = " ".join(prompt)
                #add it to a new a new df without using append
                qNo = self._predict_question_number(promptStr)

                new_df.loc[len(new_df.index)] = [promptStr, qNo, row.grade]


        # replace Nan values with the mean of the column for column grade
        new_df["grade"].fillna((new_df["grade"].mean()), inplace=True)
        self.df = new_df.copy()


        models = {}
        evaluation = {}
        for question_number in new_df['which_question'].unique():
            question_data = new_df[new_df['which_question'] == question_number]
            X = question_data['prompt']
            y = question_data['grade']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            pipeline = make_pipeline(TfidfVectorizer(), LinearRegression())
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            models[question_number] = pipeline
            evaluation[question_number] = {'MSE': mse, 'R2': r2}

        return models, evaluation
    
    def _predict_question_number(self, prompt):
        prompt_vect = self.vectorizer.transform([prompt.lower()])
        return self.clf.predict(prompt_vect)[0]
    

    def predict_with_similarity_adjustment(self, prompt):
        prompt = self._remove_stopwords(prompt.split())
        prompt = " ".join(prompt)
        question_number = self._predict_question_number(prompt)
        pipeline = self.models.get(question_number)
        if not pipeline:
            raise ValueError(f"No model found for question number {question_number}.")
        vectorizer = pipeline.named_steps['tfidfvectorizer']
        model = pipeline.named_steps['linearregression']
        prompt_vector = vectorizer.transform([prompt])
        train_vectors = vectorizer.transform(self.df[self.df['which_question'] == question_number]['prompt'])
        similarities = cosine_similarity(prompt_vector, train_vectors)
        max_similarity = np.max(similarities)
        predicted_score = model.predict(prompt_vector)[0]
        adjusted_score = predicted_score * max_similarity
        return adjusted_score, max_similarity


In [2]:

ta = TextAnalysis("data/html/*.html", "data/scores.csv", "data/labeled_data/train_dataset.csv")

100%|██████████| 127/127 [00:05<00:00, 21.80it/s]


In [3]:
import os
import re
from bs4 import BeautifulSoup
from collections import defaultdict

def extract_prompts_from_html(file_name, data_path="data/html"):
    file_path = os.path.join(data_path, file_name)
    
    # Check if the file exists
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []
    
    code2convos = dict()
    prompts = []

    with open(file_path, "r", encoding="latin1") as fh:
        # Extract file code
        file_code = os.path.basename(file_path).split(".")[0]

        # Read and parse the HTML file
        html_page = fh.read()
        soup = BeautifulSoup(html_page, "html.parser")

        # Pattern to identify conversation elements
        data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
        conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

        convo_texts = []
        for convo in conversations:
            # Extract messages from the conversation
            msgs = convo.find_all("div", attrs={"data-message-author-role": re.compile(r"(user|assistant)")})
            for msg in msgs:
                role = msg.get("data-message-author-role")
                convo_texts.append({"role": role, "text": msg.text})

        # Store conversations
        code2convos[file_code] = convo_texts

    # Separating prompts and answers
    for code, convos in code2convos.items():
        for conv in convos:
            if conv["role"] == "user":
                prompts.append(conv["text"].lower())

    return prompts

# Example usage
file_name = "0c95c563-f1e1-4a35-844c-2e8ccdd1b161.html"
prompts = extract_prompts_from_html(file_name)
print(prompts)  # This will print the list of prompts extracted from the specified HTML file.


['hello! i want to make your help on my homework about machine learning with python usage. we will go section by section firstly i want to read a csv file with the pandas library in the given path /content/cs412_hw1_dataset.csv ', 'i think you understood me wrong. i want you to generate a code that read a csv file with pandas library in the given path /content/cs412_hw1_dataset.csv', 'now lets understand this data set. first we need to find the shape of the dataset with shape function. then we have to display variable names(both dependent and independent). then we have to display the summary of dataset with info function.  finally, display the first 5 rows from training dataset. (hint: you can use the head function)', 'now lets go for another. i want to check if there any missing values in my dataset. if there is i want to fill them with the most common value technuqiue. after that i want to encode categorical labels with the mappings given in the cell below. (hint: you can use map fun

In [4]:
import pandas as pd
from collections import defaultdict

def print_predicted_and_real_grade(html_file_name, ta, scores_file_path="data/scores.csv"):
    # Assuming ta is your instance of TextAnalysis
    question_weights = {
        'Q1': 0.05, 'Q2': 0.15, 'Q3': 0.05, 'Q4': 0.1,
        'Q5': 0.2, 'Q6': 0.15, 'Q7': 0.2, 'Q8': 0.1
    }

    # Extract prompts from HTML file
    prompts = extract_prompts_from_html(html_file_name)

    # Calculate predicted score
    question_scores = defaultdict(list)
    for prompt in prompts:
        predicted_score, max_similarity = ta.predict_with_similarity_adjustment(prompt)
        question_number = ta._predict_question_number(prompt)
        weighted_score = predicted_score * question_weights.get(f'Q{question_number}', 1)
        question_scores[question_number].append(weighted_score)

    total_score = 0
    for question, scores in question_scores.items():
        if scores:
            average_score = sum(scores) / len(scores)
            total_score += average_score

    print("Predicted Grade:", total_score)

    # Extract the code from the HTML file name
    file_code = html_file_name.split(".")[0]

    # Read the scores.csv file
    scores_df = pd.read_csv(scores_file_path)

    # Find the grade for the extracted code
    grade_row = scores_df[scores_df['code'] == file_code]

    # Print the real grade
    if not grade_row.empty:
        grade = grade_row['grade'].iloc[0]
        print(f"Real Grade {html_file_name}: {grade}")
    else:
        print(f"No grade found for {html_file_name}")

    return total_score, grade if not grade_row.empty else None





# Example usage
html_file_name = "0c95c563-f1e1-4a35-844c-2e8ccdd1b161.html"
print_predicted_and_real_grade(html_file_name, ta)


Predicted Grade: 86.63078508164214
Real Grade 0c95c563-f1e1-4a35-844c-2e8ccdd1b161.html: 96.0


(86.63078508164214, 96.0)

In [5]:
import os
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def evaluate_model(directory, ta):
    predicted_grades = []
    real_grades = []

    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            # Pass only the filename, not the full path
            result = print_predicted_and_real_grade(filename, ta)

            if result is not None:
                predicted_grade, real_grade = result
                if real_grade is not None:
                    predicted_grades.append(predicted_grade)
                    real_grades.append(real_grade)

    # Calculate evaluation metrics
    real_grades = [rg for rg, pg in zip(real_grades, predicted_grades) if not np.isnan(rg) and not np.isnan(pg)]
    predicted_grades = [pg for rg, pg in zip(real_grades, predicted_grades) if not np.isnan(rg) and not np.isnan(pg)]
                    
    mse = mean_squared_error(real_grades, predicted_grades)
    mae = mean_absolute_error(real_grades, predicted_grades)
    r2 = r2_score(real_grades, predicted_grades)

    return mse, mae, r2

# Example usage
directory = "data/html"



mse, mae, r2 = evaluate_model(directory, ta)
print(f"MSE: {mse}, MAE: {mae}, R-squared: {r2}")


Predicted Grade: 76.4669752495959
Real Grade b73f91f8-732f-4a48-bcbd-eadbbb457a94.html: 94.0
Predicted Grade: 9.900023451604111
Real Grade 746b8f06-1e89-43b8-b73c-1121eecfc854.html: 99.0
Predicted Grade: 88.6175622210682
Real Grade 30283b91-7fc3-4125-985b-b441f0f489d6.html: 99.0
Predicted Grade: 83.26034546519537
Real Grade ef5b3fbc-f5d2-4446-bb4f-7d8b2a3026e9.html: 85.0
Predicted Grade: 84.01149667465069
Real Grade 106ffe99-c787-4d09-9076-4ba411eb68b1.html: 84.0
Predicted Grade: 33.881987932880534
Real Grade 65ea56c3-e205-4ed9-8b85-bd1876228cee.html: 84.0
Predicted Grade: 86.63078508164214
Real Grade 0c95c563-f1e1-4a35-844c-2e8ccdd1b161.html: 96.0
Predicted Grade: 89.76624111496253
Real Grade 22bb7162-3399-464a-b30b-cf1fc3210b4e.html: 96.0
Predicted Grade: 83.86547888269939
Real Grade ba18e4e8-2c26-46d4-ba31-cc21947aabd5.html: 100.0
Predicted Grade: 34.054266531352
Real Grade 2446216c-c557-4ee8-b470-7e2ae3c88968.html: 98.0
Predicted Grade: 91.19638823113763
Real Grade 8a84e6e5-d200-4c

In [6]:
new_prompt = 'Tune Hyperparameters'
predicted_score, max_similarity = ta.predict_with_similarity_adjustment(new_prompt)
print("Predicted Score:", predicted_score, "Max Similarity:", max_similarity)

Predicted Score: 91.00029353181002 Max Similarity: 1.0
