In [4]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from glob import glob
from nltk.corpus import stopwords
from pathlib import Path
from pprint import pprint
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm





# English stop-word list from NLTK, held as a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    Surviving tokens keep their original casing and order.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept


data_path = "data/html/*.html"

# Wrapper divs of the individual conversation turns.
data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
# Author roles that count as a turn. BeautifulSoup applies a compiled pattern
# with search(), so the alternation is anchored to match the whole attribute
# value; a character class such as "[user|assistant]" would accept any value
# containing one of those letters.
author_role_pattern = re.compile(r"^(?:user|assistant)$")

# file code (file name up to the first ".") -> list of {"role", "text"} dicts
code2convos = dict()

# `tqdm` is imported with `from tqdm import tqdm`, so it is called directly.
pbar = tqdm(sorted(glob(data_path)))
for path in pbar:
    # the file code is used as the key later on
    file_code = os.path.basename(path).split(".")[0]

    # NOTE(review): latin1 never fails to decode, but it will garble non-ASCII
    # text if the pages were saved as UTF-8 — confirm the real encoding.
    with open(path, "r", encoding="latin1") as fh:
        html_page = fh.read()

    # parse the html file with bs4 so we can extract needed stuff
    soup = BeautifulSoup(html_page, "html.parser")
    conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

    convo_texts = []
    for convo in conversations:
        messages = convo.find_all("div", attrs={"data-message-author-role": author_role_pattern})
        if messages:
            # only the first matching message of a turn is kept
            first_message = messages[0]
            convo_texts.append({
                "role": first_message.get("data-message-author-role"),
                "text": first_message.text,
            })

    code2convos[file_code] = convo_texts

# Flat lists of every lower-cased user prompt / non-user reply across all files.
prompts = []
answers = []
# The same texts grouped per file code.
code2prompts = defaultdict(list)
code2answers = defaultdict(list)
for code, convos in code2convos.items():
    user_prompts = []
    for conv in convos:
        text = conv["text"].lower()
        if conv["role"] == "user":
            prompts.append(text)
            user_prompts.append(text)
        else:
            # every role other than "user" is treated as an answer
            answers.append(text)
            code2answers[code].append(text)

    code2prompts[code] = user_prompts


# mapping prompts to answers
# The i-th prompt of a file is paired with its i-th answer; zip() stops at the
# shorter list, so unmatched trailing prompts or answers are dropped.
# NOTE(review): positional pairing assumes prompts and answers strictly
# alternate within a conversation — confirm against the exported pages.
code2prompt_answer_pairs = defaultdict(list)

for code in code2convos:
    for prompt, answer in zip(code2prompts[code], code2answers[code]):
        code2prompt_answer_pairs[code].append((prompt, answer))

# Build one record per file code; every prompt and answer is whitespace-split
# into a token list before the records are turned into a DataFrame.
refactored_data = [
    {
        'code': file_code,
        'prompt_answer_pairs': [
            (question.split(), reply.split()) for question, reply in turn_pairs
        ],
    }
    for file_code, turn_pairs in code2prompt_answer_pairs.items()
]

df = pd.DataFrame(refactored_data)


# reading the scores
scores = pd.read_csv("data/scores.csv", sep=",")
# Strip surrounding whitespace from the codes so they match the file codes.
# The vectorised accessor leaves a missing code as NaN, whereas a per-element
# x.strip() would raise on it.
scores["code"] = scores["code"].str.strip()

# selecting the columns we need and we care
scores = scores[["code", "grade"]]

# join the scores with the df (inner join: files without a score row are dropped)
df = df.merge(scores, on="code")

# adding a new column named chat_length and assign it to the size of the prompt_answer_pairs
df["chat_length"] = df["prompt_answer_pairs"].apply(len)
# sort by grade, highest first
df = df.sort_values(by=["grade"], ascending=False)








# Labelled prompts: a tab-separated file providing the 'prompt' and
# 'related_question' columns used below.
labeled_data_df = pd.read_csv("data/labeled_data/train_dataset.csv", sep="\t")
labeled_data_df['prompt'] = labeled_data_df['prompt'].str.lower()

print(labeled_data_df.head())

# Hold out 20% of the labelled prompts; the fixed seed keeps the split repeatable.
prompt_texts = labeled_data_df['prompt']
question_labels = labeled_data_df['related_question']
X_train, X_test, y_train, y_test = train_test_split(
    prompt_texts, question_labels, test_size=0.2, random_state=42
)

# The TF-IDF vocabulary is learned from the training prompts only and then
# reused unchanged for the held-out prompts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Multinomial naive Bayes over the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Score the classifier on the held-out prompts.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

def predict_question_number(prompt, clf, vectorizer):
    """Predict the question label for a single prompt.

    The prompt is lower-cased, transformed by ``vectorizer`` as a one-element
    batch and handed to ``clf.predict``; the first prediction is returned.
    """
    lowered = prompt.lower()
    features = vectorizer.transform([lowered])
    predictions = clf.predict(features)
    return predictions[0]

# Sanity check: classify one example prompt with the classifier and vectorizer
# fitted earlier in this cell.
new_prompt = "hi. need help machine learning class hw using palmer penguins dataset extended dataset kaggle. task build decision tree classifier."
predicted_question_number = predict_question_number(new_prompt, clf, vectorizer)
print("Predicted question number for the new prompt is:", predicted_question_number)












100%|██████████| 127/127 [00:05<00:00, 21.36it/s]


                                              prompt  related_question
0  ## 2) load training dataset (5 pts)\n\n*  read...                 1
1  ## 3) understanding the dataset & preprocessin...                 2
2  set x & y, split data (5 pts)\n\n*   shuffle t...                 3
3  features and correlations (10 pts)\n\n* correl...                 4
4  tune hyperparameters (20 pts)\n* choose 2 hype...                 5
Accuracy:  1.0
Predicted question number for the new prompt is: 6


In [5]:
# NOTE(review): `new_df` is first assigned in a later cell of this notebook, so
# this cell fails on a fresh kernel with Restart & Run All — confirm the
# intended cell order.
new_df.head()

Unnamed: 0,prompt,which_question,grade
0,hi. need help machine learning class hw using ...,6,100.0
1,"great! go step step though. first, need unders...",2,100.0
2,missing percentage 7.7,2,100.0
3,thank you. filling null values encode categori...,2,100.0
4,"yes, proceed set x & y, split data. first need...",3,100.0


In [6]:

# One row per (prompt, answer) pair: the stop-word-free prompt text, the
# question number predicted for it, and the grade of the file it came from.
# Rows are collected in a list and the frame is built once, instead of
# enlarging the frame row by row.
prompt_records = []

for row in df.itertuples():
    for prompt, answer in row.prompt_answer_pairs:
        # removing stop words, then joining the tokens back into a string
        promptStr = " ".join(remove_stopwords(prompt))
        qNo = predict_question_number(promptStr, clf, vectorizer)
        prompt_records.append({"prompt": promptStr, "which_question": qNo, "grade": row.grade})

new_df = pd.DataFrame(prompt_records, columns=["prompt", "which_question", "grade"])

# replace NaN values with the mean of the column for column grade
# (assigned back explicitly: an in-place fillna on a selected column is a
# chained update that pandas does not guarantee to write through)
new_df["grade"] = new_df["grade"].fillna(new_df["grade"].mean())


def train_models_per_question(new_df):
    """Fit one TF-IDF + linear-regression grade model per question number.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must provide the columns ``prompt`` (text feature),
        ``which_question`` (grouping label) and ``grade`` (regression target).

    Returns
    -------
    tuple of (dict, dict)
        ``models`` maps a question number to its fitted pipeline and
        ``evaluation`` maps it to ``{'MSE': ..., 'R2': ...}`` measured on a
        held-out 20% split. Questions with fewer than two prompts are absent
        from both dicts because they cannot be split into train and test.
    """
    models = {}
    evaluation = {}
    for question_number in new_df['which_question'].unique():
        # Segment the data by question number
        question_data = new_df[new_df['which_question'] == question_number]

        # A single row cannot be divided into a non-empty train part and a
        # non-empty test part, so train_test_split would raise. A NaN label
        # also ends up here, since NaN == NaN selects no rows.
        if len(question_data) < 2:
            continue

        # Split the data into features and target
        X = question_data['prompt']
        y = question_data['grade']

        # Split into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create a text processing and regression pipeline and train it
        pipeline = make_pipeline(TfidfVectorizer(), LinearRegression())
        pipeline.fit(X_train, y_train)

        # Evaluate the model on the held-out prompts
        y_pred = pipeline.predict(X_test)

        # Save the model and its evaluation
        models[question_number] = pipeline
        evaluation[question_number] = {
            'MSE': mean_squared_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred),
        }

    return models, evaluation

# Fit the per-question grade models on the prompt-level frame built earlier in
# this cell; `evaluation` holds the MSE / R2 of each model.
models, evaluation = train_models_per_question(new_df)

In [10]:
def predict_with_similarity_adjustment(prompt, question_number, models, new_df):
    """Predict a grade for ``prompt`` and scale it by its best TF-IDF similarity.

    The prompt is lower-cased and stripped of stop words, then vectorised with
    the TF-IDF step of the pipeline stored under ``question_number``. The
    regression step's prediction is multiplied by the highest cosine
    similarity between the prompt and the ``new_df`` prompts that carry the
    same question number.

    Returns ``(adjusted_score, max_similarity)``.

    Raises ``ValueError`` when ``models`` holds no pipeline for
    ``question_number``.
    """
    # Same normalisation as in the rest of the notebook: lower-case,
    # whitespace-split, drop stop words, join back into one string.
    cleaned_tokens = remove_stopwords(prompt.lower().split())
    cleaned_prompt = " ".join(cleaned_tokens)

    pipeline = models.get(question_number)
    if not pipeline:
        raise ValueError(f"No model found for question number {question_number}.")

    # Pull the two fitted steps out of the pipeline by their step names.
    steps = pipeline.named_steps
    tfidf = steps['tfidfvectorizer']
    regressor = steps['linearregression']

    prompt_vector = tfidf.transform([cleaned_prompt])

    # Reference set: every stored prompt assigned to the same question.
    same_question = new_df['which_question'] == question_number
    train_vectors = tfidf.transform(new_df[same_question]['prompt'])

    max_similarity = np.max(cosine_similarity(prompt_vector, train_vectors))
    predicted_score = regressor.predict(prompt_vector)[0]

    return predicted_score * max_similarity, max_similarity


# Example: route one assignment prompt to a question number, then score it
# with that question's regression model.
new_prompt = 'Tune Hyperparameters (20 pts)* Choose 2 hyperparameters to tune. You can use the Scikit learn decision tree documentation for the available hyperparameters *(Hyperparameters are listed under "Parameters" in the documentation)*. Use GridSearchCV for hyperparameter tuning, with a cross-validation value of 5. Use validation accuracy to pick the best hyper-parameter values. (15 pts)-Explain the hyperparameters you chose to tune. *(What are the hyperparameters you chose? Why did you choose them?)* (5 pts)'
# `clf` and `vectorizer` are the module-level classifier and TF-IDF vectorizer.
question_number = predict_question_number(new_prompt, clf, vectorizer)
print(f'The predicted question number for the prompt is: {question_number}')

# The function returns the similarity-scaled score and the similarity itself.
predicted_score, max_similarity = predict_with_similarity_adjustment(new_prompt, question_number, models, new_df)
print("predicted score is: ", predicted_score)

The predicted question number for the prompt is: 5
predicted score is:  93.41453216100302


In [15]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from collections import defaultdict
from glob import glob
from nltk.corpus import stopwords
from pathlib import Path
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

class TextAnalysisModel:
    """Question-number classifier together with the prompt table it labels.

    Intended order of use:

    1. ``train_question_number_classifier`` fits the TF-IDF vectorizer and the
       naive Bayes classifier on labelled prompts.
    2. ``read_html`` parses saved chat pages and stores one row per user
       prompt (with its predicted question number) in ``self.df``.
    3. ``insert_scores`` joins the per-file grades onto ``self.df``.
    """

    # NOTE(review): not referenced anywhere in the visible notebook; looks like
    # leftover scratch — confirm before removing.
    a = 1

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.models = {}
        self.evaluation = {}
        self.vectorizer = TfidfVectorizer()
        self.clf = MultinomialNB()
        # Prompt-level DataFrame; populated by read_html / insert_scores.
        self.df = None

    def remove_stopwords(self, tokens):
        """Return the tokens whose lower-cased form is not in ``self.stop_words``."""
        return [token for token in tokens if token.lower() not in self.stop_words]

    @staticmethod
    def predict_question_number(prompt, clf, vectorizer):
        """Predict the question label for one prompt.

        Declared static so the three-argument signature works both on the
        class and on an instance (``self.predict_question_number(p, clf, vec)``).
        The prompt is lower-cased before it is vectorised.
        """
        prompt_vect = vectorizer.transform([prompt.lower()])
        return clf.predict(prompt_vect)[0]

    def train_question_number_classifier(self, labeled_data_path):
        """Fit ``self.vectorizer`` and ``self.clf`` on labelled prompts.

        ``labeled_data_path`` is read as a tab-separated file with the columns
        ``prompt`` and ``related_question``. 20% of the rows are held out and
        the accuracy on them is printed.
        """
        labeled_data_df = pd.read_csv(labeled_data_path, sep="\t")

        labeled_data_df['prompt'] = labeled_data_df['prompt'].str.lower()

        print(labeled_data_df.head())

        # Split the data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            labeled_data_df['prompt'],
            labeled_data_df['related_question'],
            test_size=0.2,
            random_state=42
        )

        # Vectorize the prompts; a fresh vectorizer makes re-training start clean
        self.vectorizer = TfidfVectorizer()
        X_train = self.vectorizer.fit_transform(X_train)
        X_test = self.vectorizer.transform(X_test)

        self.clf.fit(X_train, y_train)

        # Predict the test data with this instance's classifier (not a global)
        y_pred = self.clf.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy: ", accuracy)

    def insert_scores(self, scores_path):
        """Join the grades from ``scores_path`` onto ``self.df`` by ``code``.

        ``scores_path`` is read as a comma-separated file with at least the
        columns ``code`` and ``grade``. The join is an inner join, so rows
        whose code has no score entry are dropped; missing grades are
        replaced by the mean grade.

        Raises ``RuntimeError`` when ``read_html`` has not populated
        ``self.df`` yet.
        """
        if self.df is None:
            raise RuntimeError("read_html must be called before insert_scores")

        scores = pd.read_csv(scores_path, sep=",")
        scores["code"] = scores["code"].str.strip()

        # selecting the columns we need and we care
        scores = scores[["code", "grade"]]

        # Drop a grade column left by an earlier call so that repeated calls
        # do not produce grade_x / grade_y columns.
        merged = self.df.drop(columns=["grade"], errors="ignore").merge(scores, on="code")

        # replace NaN values with the mean of the column for column grade
        merged["grade"] = merged["grade"].fillna(merged["grade"].mean())
        self.df = merged

    def read_html(self, html_files_path):
        """Parse chat pages and store one labelled row per prompt in ``self.df``.

        ``html_files_path`` is a glob pattern. The resulting frame has the
        columns ``code`` (file name up to the first "."), ``prompt``
        (lower-cased, stop words removed) and ``which_question`` (prediction
        of ``self.clf``). The classifier therefore has to be fitted before
        this method is called. Grades are added separately by
        ``insert_scores``.
        """
        code2convos = self._parse_conversations(html_files_path)

        records = []
        for code, convos in code2convos.items():
            user_texts = [conv["text"].lower() for conv in convos if conv["role"] == "user"]
            other_texts = [conv["text"].lower() for conv in convos if conv["role"] != "user"]

            # The i-th prompt is paired with the i-th answer; zip() drops
            # prompts that have no answer at the same position.
            for prompt, _answer in zip(user_texts, other_texts):
                prompt_str = " ".join(self.remove_stopwords(prompt.split()))
                q_no = self.predict_question_number(prompt_str, self.clf, self.vectorizer)
                records.append({"code": code, "prompt": prompt_str, "which_question": q_no})

        # Build the frame once instead of enlarging it row by row.
        self.df = pd.DataFrame(records, columns=["code", "prompt", "which_question"])

    def _parse_conversations(self, html_files_path):
        """Map each file code to its list of ``{"role", "text"}`` turns."""
        turn_pattern = re.compile(r"conversation-turn-[0-9]+")
        # Anchored alternation: BeautifulSoup applies the pattern with
        # search(), and a character class like "[user|assistant]" would match
        # any value containing one of those letters.
        role_pattern = re.compile(r"^(?:user|assistant)$")

        code2convos = {}
        # `tqdm` is imported with `from tqdm import tqdm`, so it is called directly.
        for path in tqdm(sorted(glob(html_files_path))):
            file_code = os.path.basename(path).split(".")[0]
            # NOTE(review): latin1 never fails to decode, but it will garble
            # non-ASCII text if the pages are UTF-8 — confirm the encoding.
            with open(path, "r", encoding="latin1") as fh:
                soup = BeautifulSoup(fh.read(), "html.parser")

            convo_texts = []
            for turn in soup.find_all("div", attrs={"data-testid": turn_pattern}):
                messages = turn.find_all("div", attrs={"data-message-author-role": role_pattern})
                if messages:
                    # only the first matching message of a turn is kept
                    convo_texts.append({
                        "role": messages[0].get("data-message-author-role"),
                        "text": messages[0].text,
                    })

            code2convos[file_code] = convo_texts

        return code2convos
        




In [None]:
# Example usage
model = TextAnalysisModel()
# The classifier has to be fitted first: read_html uses it to label every prompt.
model.train_question_number_classifier("data/labeled_data/train_dataset.csv")
model.read_html("data/html/*.html")
model.insert_scores("data/scores.csv")

# You can then use the fitted parts to perform specific actions, like:
# TextAnalysisModel.predict_question_number("some prompt", model.clf, model.vectorizer)
# The similarity-adjusted prediction is a module-level function in this notebook:
# predict_with_similarity_adjustment("some prompt", question_number, models, new_df)