In [None]:
import os
import re
import os
import tqdm
from glob import glob
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint
import graphviz

from collections import defaultdict
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
#dev-main

In [None]:
def get_code(black_part, code_list, convo):
    """Pull the code out of a rendered code box and record it.

    Args:
        black_part: element for the code box; must support ``len()``,
            ``.contents`` and ``.text``. Its first child (the top bar) is
            removed before the text is read.
        code_list: list that receives the cleaned code as one string.
        convo: sequence whose first item is the message element; the first
            ``<pre>`` found under that element's first child is removed so the
            code no longer appears in the message text.

    Returns:
        None. ``code_list`` and the passed elements are modified in place.
        Nothing happens when ``black_part`` is empty.
    """
    if len(black_part) == 0:
        return

    # The first child is the box's top bar, not code.
    black_part.contents[0].decompose()

    # Keep only non-empty lines that do not start with a comment marker.
    kept_lines = [
        line
        for line in black_part.text.split("\n")
        if line and not line.startswith("#")
    ]
    code_list.append("\n".join(kept_lines))

    # Remove the code from the message so text and code stay separate.
    convo[0].contents[0].find_all("pre")[0].decompose()


In [None]:
data_path = "data/html/*.html"

# Every chat turn lives in a div whose data-testid is "conversation-turn-<n>".
data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
# Accept exactly the two roles we analyse. The previous pattern,
# r"[user|assistant]", was a character class: it matched any role value that
# contained one of those letters, not just "user" or "assistant".
role_pattern = re.compile(r"^(?:user|assistant)$")

# chat id (file name without extension) -> list of {"role", "text", "code"} dicts
code2convos = dict()

pbar = tqdm.tqdm(sorted(list(glob(data_path))))
for path in pbar:
    # the file name without its extension is the key used everywhere else
    file_code = os.path.basename(path).split(".")[0]

    # NOTE(review): the files are decoded as latin1; if they were saved as UTF-8
    # non-ASCII characters will be mis-decoded here - confirm the encoding.
    with open(path, "r", encoding="latin1") as fh:
        html_page = fh.read()

    # parse the html file with bs4 so we can extract needed stuff
    soup = BeautifulSoup(html_page, "html.parser")
    conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

    convo_texts = []
    for turn in conversations:
        messages = turn.find_all("div", attrs={"data-message-author-role": role_pattern})
        if len(messages) == 0:
            continue
        message = messages[0]
        role = message.get("data-message-author-role")

        # Only the first code box of a message is extracted.
        code = ""
        black_part = message.find_all("div", class_="bg-black rounded-md")
        if len(black_part) > 0:
            # First delete the top bar of the code box
            black_part[0].contents[0].decompose()
            # Then keep the non-empty lines that do not start with "#"
            code_lines = [
                line
                for line in black_part[0].text.split("\n")
                if line and not line.startswith("#")
            ]
            code = "\n".join(code_lines)
            # Finally remove the code from the message so text and code are
            # stored separately; tolerate a message whose first child has no <pre>.
            pre_blocks = message.contents[0].find_all("pre")
            if pre_blocks:
                pre_blocks[0].decompose()

        convo_texts.append({
            "role": role,
            "text": message.text,
            "code": code,
        })

    code2convos[file_code] = convo_texts

In [None]:
#code2convos

In [None]:
# let's see one of the conversations
pprint(code2convos["f2f18684-4a16-4c05-a2d1-c0f96d1de869"][0])

In [None]:
# let's see one of the conversations
pprint(code2convos["f2f18684-4a16-4c05-a2d1-c0f96d1de869"][1])

### Preprocess text data before feature engineering

In [None]:
# helper function for later use
def convert_list_to_str(my_list):
    """Join the items of ``my_list`` into one space-separated string.

    Each item is converted with ``str()`` first, so non-string items are
    accepted. An empty list gives an empty string.
    """
    return ' '.join(str(elem) for elem in my_list)

In [None]:
def remove_html_tags(text):
    """Return ``text`` with its HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" and only the text
    content of the resulting tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()



In [None]:
"""for history, conversations in code2convos.items():
    for conversation in conversations:
        conversation["text"] = remove_html_tags(conversation["text"])
        conversation["code"] = [remove_html_tags(line) for line in conversation["code"]]"""

In [None]:
def remove_non_ascii(text):
    """Drop non-ASCII characters and normalise whitespace.

    Characters that cannot be encoded as ASCII are discarded, every run of
    whitespace becomes a single space, and leading/trailing whitespace is
    stripped.
    """
    ascii_only = text.encode('ascii', 'ignore').decode('utf-8')
    collapsed = re.sub(r'\s+', ' ', ascii_only)
    return collapsed.strip()

# Clean the stored text of every turn in place (the "code" field is left as is).
for convs in code2convos.values():
    for c in convs:
        cleaned_text = remove_non_ascii(c["text"])
        c["text"] = cleaned_text

In [None]:
# Chats for which parsing produced no turns at all.
invalid_chats = [code for code, convs in code2convos.items() if len(convs) == 0]
        

In [None]:
# Remove the chats that produced no turns. pop() with a default keeps this cell
# safe to run again: keys that are already gone no longer raise KeyError.
for chat_id in invalid_chats:
    print("deleting ", chat_id)
    code2convos.pop(chat_id, None)

#### Preprocess scores dataset

In [None]:
# reading the scores
scores = pd.read_csv("scores.csv", sep=",")
scores

In [None]:
duplicate = scores[scores.duplicated('code')]
duplicate

In [None]:
scores = scores.drop_duplicates(subset=['code'])
scores

In [None]:
scores.info()

In [None]:
# Show the rows that have no grade, then keep only the graded rows.
has_no_grade = scores['grade'].isna()
row_to_drop = scores[has_no_grade].copy()
print(row_to_drop)
scores = scores.dropna(subset=['grade'])

In [None]:
# presumably the chat whose grade is missing in scores.csv - verify against the
# rows printed by the grade check. pop() with a default makes the cell re-runnable.
code2convos.pop("56c6f8dd-f37c-44d2-9820-9459aa34c8af", None)

In [None]:
# Rows of `scores` that belong to a chat discarded as empty
condition = scores["code"].isin(invalid_chats)
rows_of_invalid_chats = scores.index[condition]
scores.drop(index=rows_of_invalid_chats, inplace=True)

In [None]:
scores.info()

#### Things to do:
- Prompt matching with questions
- Feature Engineering
- Question Grades preparation
- Train/Test split
- Fitting a model for predicting the scores

#### Prompt Matching
> We want to match the prompts with the questions in the homework. Let's
> do it with a simple term-frequency vectorizing method. For each prompt,
> we will come up with a vector that represents it. We will do the same
> thing with each of the homework questions. Then, we will calculate the
> cosine similarity between the vectors to do the matching.

In [None]:
# `prompts` holds every user message across all chats (used to fit the
# vectorizer); `code2prompts` keeps the same messages grouped per chat.
prompts = []
code2prompts = defaultdict(list)
for code, convos in code2convos.items():
    user_prompts = [conv["text"] for conv in convos if conv["role"] == "user"]
    prompts.extend(user_prompts)
    code2prompts[code] = user_prompts

In [None]:
prompts[0]

In [None]:
questions = [
    """Initialize
*   First make a copy of the notebook given to you as a starter.
*   Make sure you choose Connect form upper right.
*   You may upload the data to the section on your left on Colab, than right click on the .csv file and get the path of the file by clicking on "Copy Path". You will be using it when loading the data.

""",
#####################
    """Load training dataset (5 pts)
    *  Read the .csv file with the pandas library
""",
#####################
"""Understanding the dataset & Preprocessing (15 pts)
Understanding the Dataset: (5 pts)
> - Find the shape of the dataset (number of samples & number of attributes). (Hint: You can use the **shape** function)
> - Display variable names (both dependent and independent).
> - Display the summary of the dataset. (Hint: You can use the **info** function)
> - Display the first 5 rows from training dataset. (Hint: You can use the **head** function)
Preprocessing: (10 pts)

> - Check if there are any missing values in the dataset. If there are, you can either drop these values or fill it with most common values in corresponding rows. **Be careful that you have enough data for training the  model.**

> - Encode categorical labels with the mappings given in the cell below. (Hint: You can use **map** function)
""",
"""Set X & y, split data (5 pts)

*   Shuffle the dataset.
*   Seperate your dependent variable X, and your independent variable y. The column health_metrics is y, the rest is X.
*   Split training and test sets as 80% and 20%, respectively.
""",
#####################
"""Features and Correlations (10 pts)

* Correlations of features with health (4 points)
Calculate the correlations for all features in dataset. Highlight any strong correlations with the target variable. Plot your results in a heatmap.

* Feature Selection (3 points)
Select a subset of features that are likely strong predictors, justifying your choices based on the computed correlations.

* Hypothetical Driver Features (3 points)
Propose two hypothetical features that could enhance the model's predictive accuracy for Y, explaining how they might be derived and their expected impact. Show the resulting correlations with target variable.

* __Note:__ You get can get help from GPT.
""",
#####################
"""Tune Hyperparameters (20 pts)
* Choose 2 hyperparameters to tune. You can use the Scikit learn decision tree documentation for the available hyperparameters *(Hyperparameters are listed under "Parameters" in the documentation)*. Use GridSearchCV for hyperparameter tuning, with a cross-validation value of 5. Use validation accuracy to pick the best hyper-parameter values. (15 pts)
-Explain the hyperparameters you chose to tune. *(What are the hyperparameters you chose? Why did you choose them?)* (5 pts)
""",
#####################
"""Re-train and plot the decision tree with the hyperparameters you have chosen (15 pts)
- Re-train model with the hyperparameters you have chosen in part 5). (10 pts)
- Plot the tree you have trained. (5 pts)
Hint: You can import the **plot_tree** function from the sklearn library.
""",
#####################
"""Test your classifier on the test set (20 pts)
- Predict the labels of testing data using the tree you have trained in step 6. (10 pts)
- Report the classification accuracy. (2 pts)
- Plot & investigate the confusion matrix. Fill the following blanks. (8 pts)
> The model most frequently mistakes class(es) _________ for class(es) _________.
Hint: You can use the confusion_matrix function from sklearn.metrics
""",
#####################
"""Find the information gain on the first split (10 pts)""",
#####################
]

In [None]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(prompts + questions)

In [None]:
questions_TF_IDF = pd.DataFrame(vectorizer.transform(questions).toarray(), columns=vectorizer.get_feature_names_out())
questions_TF_IDF.head()

In [None]:
# chat id -> DataFrame with one TF-IDF row per user prompt
code2prompts_tf_idf = dict()

# The vocabulary is fixed once the vectorizer is fitted, so fetch the column
# names once instead of on every iteration.
feature_names = vectorizer.get_feature_names_out()

for code, user_prompts in code2prompts.items():
    if len(user_prompts) == 0:
        # some files have issues
        print(code+".html")
        continue
    prompts_TF_IDF = pd.DataFrame(vectorizer.transform(user_prompts).toarray(), columns=feature_names)
    code2prompts_tf_idf[code] = prompts_TF_IDF

In [None]:
code2prompts_tf_idf["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].head()

In [None]:
code2prompts_tf_idf["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].shape

In [None]:

code2prompts_tf_idf["f2f18684-4a16-4c05-a2d1-c0f96d1de869"].head()

In [None]:
code2prompts_tf_idf["f2f18684-4a16-4c05-a2d1-c0f96d1de869"].shape

In [None]:
# chat id -> DataFrame of cosine similarities (rows: questions, columns: prompts)
code2cosine = {
    code: pd.DataFrame(cosine_similarity(questions_TF_IDF, user_prompts_tf_idf))
    for code, user_prompts_tf_idf in code2prompts_tf_idf.items()
}

In [None]:
questions[4]

In [None]:
# For each chat keep, per question, the best similarity reached by any prompt.
code2questionmapping = {
    code: cosine_scores.max(axis=1).tolist()
    for code, cosine_scores in code2cosine.items()
}

# One row per chat, one "Q_<i>" column per question, plus the chat id as "code".
question_mapping_scores = (
    pd.DataFrame(code2questionmapping).T
    .reset_index()
    .rename(columns={i: f"Q_{i}" for i in range(len(questions))})
    .rename(columns={"index" : "code"})
)

question_mapping_scores

# Feature Engineering
- Number of prompts that a user asked
- Number of complaints that a user makes e.g "the code gives this error!"
- User prompts average number of characters

In [None]:
# import necessary libraries
import nltk
nltk.download("punkt")

In [None]:
# chat id -> {feature name -> value}; unseen features start at 0
code2features = defaultdict(lambda : defaultdict(int))

keywords2search = ["error", "no", "thank", "next", "Entropy"]
keywords2search = [k.lower() for k in keywords2search]

for code, convs in code2convos.items():
    chat_features = code2features[code]

    # First pass: accumulate plain sums over all turns of the chat.
    for c in convs:
        text = c["text"].lower()
        if c["role"] == "user":
            # User Prompts

            # count the user prompts
            chat_features["#user_prompts"] += 1

            # count the keywords (whole-word matches on the lower-cased text)
            for kw in keywords2search:
                chat_features[f"#{kw}"] += len(re.findall(rf"\b{kw}\b", text))

            chat_features["prompt_avg_chars"] += len(text)
        else:
            # ChatGPT Responses
            chat_features["response_avg_chars"] += len(text)

    # Turn the sums into averages once per chat, after all turns were seen.
    # Dividing inside the turn loop re-divided the running sum on every turn and
    # failed with ZeroDivisionError when a chat did not start with a user turn.
    # Both averages are taken per user prompt, as before.
    n_user_prompts = chat_features["#user_prompts"]
    if n_user_prompts > 0:
        chat_features["prompt_avg_chars"] /= n_user_prompts
        chat_features["response_avg_chars"] /= n_user_prompts

New features
- #_hw_statements --> Useful to check whether the student mentioned that the tasks are for a homework in a Machine Learning course. ChatGPT may act accordingly when she knows the tasks will be presented in an academic setting.

- #_question_statements --> The way the student behaves might affect the responses of ChatGPT.

- #_understand_statements --> If a student is eager to learn, it is more likely that she will use verbs like "explain". The greater the willingness to learn, the more likely she is to get a high grade. 

In [None]:
#Add new features
code2newfeatures = defaultdict(lambda : defaultdict(int))
hw_statements = ["cs412", "machine learning", "course", "ML", "homework","412","cs", "assignment"]
question_statements = ["how", "why", "what", "where", "can you", "do you"]
understand_statements = ["explain", "reason", "example", "instance","demonstrate","describe", "proof" ,"prove","show", "I think"]
example_statements = ["for example", "for instance", "ex:", "here is an example", "see this", "like this", "similar to"]

# Matching is done on lower-cased text, so lower-case the keywords as well.
hw_statements = [k.lower() for k in hw_statements]
question_statements = [q.lower() for q in question_statements]
understand_statements = [u.lower() for u in understand_statements]
example_statements = [n.lower() for n in example_statements]

for code, convs in code2convos.items():
    for c in convs:
        if c["role"] != "user":
            continue
        # User Prompts
        text = c["text"].lower()

        # count the keywords (whole-word / whole-phrase matches)
        for kw in hw_statements:
            code2newfeatures[code]["#_hw_statements"] += len(re.findall(rf"\b{kw}\b", text))
        for qw in question_statements:
            code2newfeatures[code]["#_question_statements"] += len(re.findall(rf"\b{qw}\b", text))
        for uw in understand_statements:
            code2newfeatures[code]["#_understand_statements"] += len(re.findall(rf"\b{uw}\b", text))
        # This loop has to search for its own keyword `nw`; it previously reused
        # `uw`, i.e. the last "understand" keyword, for every example keyword.
        # NOTE(review): the trailing \b after "ex:" only matches when a word
        # character follows the colon directly, so "ex: foo" is not counted.
        for nw in example_statements:
            code2newfeatures[code]["#_example_statements"] += len(re.findall(rf"\b{nw}\b", text))

In [None]:
#Merge new features into features df
# Assign rather than add, so running this cell again does not double the counts.
for code, features in code2newfeatures.items():
    code2features[code].update(features)

In [None]:
df = pd.DataFrame(code2features).T
df.head(5)

- Average code length char by char

In [None]:
# count average code length in terms of number of chars in the code part of a response of ChatGPT

for code, convs in code2convos.items():
    # c["code"] is a single string, so its length is the number of characters.
    code_lengths = [len(c["code"]) for c in convs if c["role"] == "assistant"]

    chat_features = code2features[code]

    # A chat without assistant turns gets 0 instead of raising ZeroDivisionError.
    if code_lengths:
        chat_features["avg_code_length"] = sum(code_lengths) / len(code_lengths)
    else:
        chat_features["avg_code_length"] = 0.0

    # Same guard for chats whose responses have no text at all.
    response_avg_chars = chat_features["response_avg_chars"]
    if response_avg_chars:
        chat_features["code_response_ratio"] = chat_features["avg_code_length"] / response_avg_chars
    else:
        chat_features["code_response_ratio"] = 0.0
    

In [None]:
#Finding number of imperative sentences given by the student
import spacy
import re

nlp = spacy.load("en_core_web_sm")  # Load the English NLP model once

def is_imperative(text):
    """Heuristically decide whether ``text`` starts like an imperative sentence.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and only
    the first token is inspected. The result is True when that token

    1. is tagged as a verb with fine-grained tag "VB" or "VBP",
    2. is not labelled as a subject ("nsubj" / "nsubjpass"), and
    3. does not open a "Let's ..." / "Let us ..." suggestion.

    Args:
        text: one sentence (or any string) to classify.

    Returns:
        bool: True if the heuristic considers the sentence imperative.
    """
    doc = nlp(text)
    if len(doc) == 0:
        return False

    first_token = doc[0]

    # 1. Verb in base / non-3rd-person form at the beginning
    if first_token.pos_ != "VERB" or first_token.tag_ not in ("VB", "VBP"):
        return False

    # 2. The leading verb must not itself be a subject
    if first_token.dep_ in ("nsubj", "nsubjpass"):
        return False

    # 3. Exclude "Let's ..." suggestions. The earlier test compared the first
    #    token with "let's"; spaCy's English tokenizer is expected to split that
    #    contraction into "Let" + "'s" (TODO confirm with the installed model),
    #    in which case that test could never fire. Check the first two tokens.
    opens_with_lets = (
        len(doc) > 1
        and first_token.text.lower() == "let"
        and doc[1].text.lower() in ("'s", "us")
    )
    return not opens_with_lets



# Count, per chat, how many sentences written by the user look imperative.
# The spaCy pipeline `nlp` is already loaded earlier in this cell, so it is not
# loaded a second time here.
for code, convos in code2convos.items():
    imperative_sentence_count = 0

    for conv in convos:
        if conv["role"] != "user":
            continue

        # Split the prompt into sentences: break on whitespace that follows
        # ".", "?" or "!", except after patterns such as "e.g." or "Mr.".
        sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", conv["text"])
        imperative_sentence_count += sum(1 for sentence in sentences if is_imperative(sentence))

    code2features[code]["imperative_sentence_count"] = imperative_sentence_count




In [None]:
# Pasted Python errors contain the word "Traceback", so counting it in the
# user's messages estimates how many errors the student dealt with.
for code, convos in code2convos.items():
    code2features[code]["#given_errors"] = sum(
        conv["text"].count("Traceback")
        for conv in convos
        if conv["role"] == "user"
    )





In [None]:
df = pd.DataFrame(code2features).T
df.head(5)


In [None]:
# Flag the chats in which a single user message mentions both "negative" and
# "information gain", i.e. the student got a negative information gain and asked about it.
for code, convos in code2convos.items():
    code2features[code]["is_info_gain_negative"] = any(
        "negative" in conv["text"] and "information gain" in conv["text"]
        for conv in convos
        if conv["role"] == "user"
    )

- total number of sentences in prompts
- average number of sentences in prompts

In [None]:
# count the number of sentences in a prompt

def count_sentences(text):
    """Return how many sentences NLTK's sentence tokenizer finds in ``text``."""
    return len(nltk.sent_tokenize(text))



In [None]:
# example usage

text = """My dataset is 3425x11. How should I set max_depth and min_samples_split lists? # param_grid represents the hyperparameters we want to try (our search space)
param_grid = {
    'max_depth': [3, 5, 8, 12, 16],
    'min_samples_split': [5, 8, 14, 20]
}

# estimator is the model we are evaluating, Decision Tree in our case
estimator = DecisionTreeClassifier(criterion='entropy', random_state=42)

# scoring is the score used to choose the best model
scoring='accuracy'

# cv is the number of folds to use for cross validation
cv = 5

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv)

grid_search.fit(X_train, y_train)"""

sentence_count = count_sentences(text)
print(f"Number of sentences: {sentence_count}")

##### Although the number of chars is quite high, the number of sentences is small. This is because the last part is actually a code snippet. 

In [None]:
for code, convs in code2convos.items():
    # sentence count of every user prompt in this chat
    per_prompt_counts = [count_sentences(c["text"]) for c in convs if c["role"] == "user"]
    sentence_count = sum(per_prompt_counts)

    # A chat without user prompts gets an average of 0 instead of raising
    # ZeroDivisionError.
    if per_prompt_counts:
        code2features[code]["avg_sentences_in_prompts"] = sentence_count / len(per_prompt_counts)
    else:
        code2features[code]["avg_sentences_in_prompts"] = 0.0
    code2features[code]["#sentences_in_prompts"] = sentence_count


In [None]:
df = pd.DataFrame(code2features).T
df.head(5)

##### We observed that some students provide the dataset to ChatGPT. We can check whether it has an impact on the grade. If it is provided, it is done in the top 10 prompts. Let's check them

In [None]:
dataset_keywords = ["Island where the penguin was found (Biscoe, Dream, Torgensen)", "cs412_hw1_dataset.csv", "https://www.kaggle.com/datasets/samybaladram/palmers-penguin-dataset-extended/data", "species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,diet,life_stage,health_metrics,year"]


# A chat counts as "dataset given" when one of the user messages among its
# first 10 turns contains any of the dataset keywords.
for code, convs in code2convos.items():
    early_user_texts = [conv["text"] for conv in convs[:10] if conv["role"] == "user"]
    is_dataset_mentioned = any(
        keyword in text
        for text in early_user_texts
        for keyword in dataset_keywords
    )
    code2features[code]["is_dataset_given"] = is_dataset_mentioned


In [None]:
df = pd.DataFrame(code2features).T

In [None]:
print(df["is_dataset_given"].value_counts())
print("--------------------------------------")
df.head(5)

- Checking diverse similarities

In [None]:
# get the provided codes from ChatGPT to the students who took 100 points

stu_w_100 = scores[scores["grade"] == 100]
stu_w_100

In [None]:
# chat id -> non-empty code snippets returned by ChatGPT, for students graded 100
stu_w_100_codes = {
    student: [
        c["code"]
        for c in convs
        if c["role"] == "assistant" and (len(c["code"]) > 0)
    ]
    for student, convs in code2convos.items()
    if student in stu_w_100["code"].values
}
print(len(stu_w_100_codes))

In [None]:
# chat id -> non-empty code snippets returned by ChatGPT, for every student
stu_codes = {
    student: [
        c["code"]
        for c in convs
        if c["role"] == "assistant" and (len(c["code"]) > 0)
    ]
    for student, convs in code2convos.items()
}
print(len(stu_codes))


In [None]:
#stu_codes["fb8de815-224c-4d06-9fd4-7156d1a9920d"]

In [None]:
# One document per 100-scoring student: all of their code snippets joined by spaces.
stu_w_100_codes_list = [
    convert_list_to_str(snippets) for snippets in stu_w_100_codes.values()
]

print(len(stu_w_100_codes_list))
#print(stu_w_100_codes_list)

In [None]:
# Fit TF-IDF on the code of the 100-scoring students only; every student's
# code is then projected onto that vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_stu_w_100_codes = tfidf_vectorizer.fit_transform(stu_w_100_codes_list)

code_responses_similarity = {}
for student, code_responses in stu_codes.items():
    cur_stu_tf_idf = tfidf_vectorizer.transform([convert_list_to_str(code_responses)])

    # Cosine similarity against each 100-scoring student's code document
    similarity_scores = cosine_similarity(cur_stu_tf_idf, tfidf_matrix_stu_w_100_codes)

    # Summarise with the mean similarity for simplicity
    code_responses_similarity[student] = similarity_scores.mean()

# Table with a regular index and two columns: student id and mean similarity
similarity_with_stu_w_100_codes_df = (
    pd.DataFrame.from_dict(
        code_responses_similarity,
        orient='index',
        columns=['similarity_with_stu_w_100_codes'],
    )
    .reset_index()
    .set_axis(['student', 'similarity_with_stu_w_100_codes'], axis=1)
)

similarity_with_stu_w_100_codes_df.head(10)


In [None]:

# 'Unnamed: 0' causes a conflict when merging with df. errors="ignore" lets the
# cell run again after the column is already gone.
scores.drop(columns=['Unnamed: 0'], inplace=True, errors="ignore")
scores

In [None]:
df = df.reset_index()
df = df.rename(columns={"index": "code"})

In [None]:
df

In [None]:

# Attach the grade to each chat's features, drop rows that contain any missing
# value, and keep a single row per chat.
temp_df = (
    pd.merge(df, scores, on='code', how="left")
    .dropna()
    .drop_duplicates("code", keep="first")
)

#temp_df=df

temp_df.head()

Playground for Similarity with the ones who got 100

In [None]:
#tf-idf vectorizer with only user prompts

# Codes of the students who scored 100, collected once so each chat needs a
# single set lookup instead of a scan over temp_df.
top_scorer_codes = set(temp_df.loc[temp_df['grade'] == 100, 'code'])

# One document per 100-scoring student: all of their prompts joined by spaces.
conversations_100 = [
    " ".join(c["text"] for c in convos if c["role"] == "user")
    for code, convos in code2convos.items()
    if code in top_scorer_codes
]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_100 = tfidf_vectorizer.fit_transform(conversations_100)

code2similarity = {}
for code, convos in code2convos.items():
    # Extract the text for the current student's conversation
    student_convo = " ".join(c["text"] for c in convos if c["role"] == "user")

    # Vectorize it with the TF-IDF model fitted on the 100-scorers
    student_tfidf = tfidf_vectorizer.transform([student_convo])

    # Cosine similarity with each 100-scoring conversation. A 100-scorer is
    # also compared with their own document here.
    similarity_scores = cosine_similarity(student_tfidf, tfidf_matrix_100)

    # We take the average similarity score for simplicity
    code2similarity[code] = similarity_scores.mean()

# Store the similarities in a DataFrame with a regular index and separate
# columns for 'code' and 'similarity_with_100'
similarity_df = pd.DataFrame.from_dict(code2similarity, orient='index', columns=['similarity_with_100'])
similarity_df = similarity_df.reset_index()
similarity_df.columns = ['code', 'similarity_with_100']

similarity_df.head()


In [None]:
#tf-idf vectorizer with user and assistant convos

# Codes of the students who scored 100, collected once so each chat needs a
# single set lookup instead of a scan over temp_df.
top_scorer_codes = set(temp_df.loc[temp_df['grade'] == 100, 'code'])

# One document per 100-scoring student: user and assistant texts joined by spaces.
conversations_100 = [
    " ".join(c["text"] for c in convos if c["role"] == "user" or c["role"] == "assistant")
    for code, convos in code2convos.items()
    if code in top_scorer_codes
]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_100 = tfidf_vectorizer.fit_transform(conversations_100)

code2similarity = {}
for code, convos in code2convos.items():
    # Extract the text for the current student's conversation
    student_convo = " ".join(c["text"] for c in convos if c["role"] == "user" or c["role"] == "assistant")

    # Vectorize it with the TF-IDF model fitted on the 100-scorers
    student_tfidf = tfidf_vectorizer.transform([student_convo])

    # Cosine similarity with each 100-scoring conversation
    similarity_scores = cosine_similarity(student_tfidf, tfidf_matrix_100)

    # We take the average similarity score for simplicity
    code2similarity[code] = similarity_scores.mean()

# Store the similarities in a DataFrame with a regular index and separate
# columns for 'code' and 'similarity_with_100'
similarity_df2 = pd.DataFrame.from_dict(code2similarity, orient='index', columns=['similarity_with_100'])
similarity_df2 = similarity_df2.reset_index()
similarity_df2.columns = ['code', 'similarity_with_100']

similarity_df2.head()

In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import numpy as np
import pandas as pd
import re

# Step 1: Prepare the data for Word2Vec
# Every turn (user and assistant) becomes one whitespace-tokenized "sentence".
tokenized_conversations = []
for convos in code2convos.values():
    for c in convos:
        tokenized_conversations.append(c['text'].lower().split())

# Step 2: Train a Word2Vec model on the tokenized conversations
# Training is stochastic: fix the seed and use a single worker so the
# embeddings, and the similarities derived from them, are repeatable.
# NOTE(review): gensim also requires a fixed PYTHONHASHSEED for fully
# deterministic results across interpreter runs.
word2vec_model = Word2Vec(
    sentences=tokenized_conversations,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Step 3: Calculate the mean vector for conversations of students who scored 100
# First, collect the user-prompt words of the chats whose grade is 100
codes_with_grade_100 = temp_df[temp_df['grade'] == 100]['code']
conversations_100 = []

for code in codes_with_grade_100:
    convos = code2convos[code]
    for c in convos:
        if c['role'] == 'user':
            conversations_100.extend(c['text'].lower().split())

# Keep only the words that are in the model's vocabulary
conversations_100 = [word for word in conversations_100 if word in word2vec_model.wv]

# Calculate the mean vector for these words
mean_vector_100 = np.mean([word2vec_model.wv[word] for word in conversations_100], axis=0)

# Step 4: Compare each student's conversation with the mean vector
code2similarity = defaultdict(float)

for code, convos in code2convos.items():
    all_words = []
    for c in convos:
        if c['role'] == 'user':
            all_words.extend(c['text'].lower().split())

    valid_words = [word for word in all_words if word in word2vec_model.wv]

    if not valid_words:  # Skip if there are no valid words
        continue

    student_convo_vector = np.mean([word2vec_model.wv[word] for word in valid_words], axis=0)

    similarity_score = cosine_similarity([student_convo_vector], [mean_vector_100])[0][0]

    code2similarity[code] = similarity_score

# Step 5: Convert the similarity scores to a DataFrame
similarity_df_v2v = pd.DataFrame(list(code2similarity.items()), columns=['code', 'similarity_with_100_word2vec'])

similarity_df_v2v.head(100)




##### Let's compare different approaches for a data point

In [None]:


# Define the target code you want to print
your_target_code = '089eb66d-4c3a-4f58-b98f-a3774a2efb34'

# Locate the row with the target code in each DataFrame. Every frame is
# filtered with a mask built from its own 'code' column: the word2vec frame
# skips chats without valid words, so a mask taken from similarity_df does not
# necessarily line up with its rows.
target_row = similarity_df.loc[similarity_df['code'] == your_target_code]
target_row2 = similarity_df2.loc[similarity_df2['code'] == your_target_code]
target_rowv2v = similarity_df_v2v.loc[similarity_df_v2v['code'] == your_target_code]

# Check if the target code exists in the DataFrame
if not target_row.empty:
    # Print the rows containing the target code
    print(target_row)
    print(target_row2)
    print(target_rowv2v)
else:
    print("Target code not found in the similarity_df.")


In [None]:

# remove surrounding whitespace from every chat id
scores["code"] = scores["code"].map(lambda raw_code: raw_code.strip())

# keep only the two columns that are used from here on
wanted_columns = ["code", "grade"]
scores = scores[wanted_columns]

# show some examples
scores.head()

In [None]:
# Let's check grades distribution

plt.title('Histogram Grades')
plt.hist(scores["grade"], rwidth=.8, bins=np.arange(min(scores["grade"]), max(scores["grade"])+2) - 0.5)
plt.ylabel('Count')
plt.show()

#### Merging scores with features

In [None]:
temp_df = pd.merge(temp_df, question_mapping_scores, on="code", how="left")
temp_df.head()

In [None]:
# Merge temp_df with similarity_df on the 'code' column

#first update the column name
similarity_df.rename(columns={'similarity_with_100': 'similarity_with_stu_w_100_prompts'}, inplace=True)
temp_df = pd.merge(temp_df, similarity_df, on='code', how='left')

# Display the first few rows of the merged dataframe
temp_df.head()


In [None]:
# Merge temp_df with similarity_with_stu_w_100_codes_df on the 'code' column

#first update the column name
similarity_with_stu_w_100_codes_df.rename(columns={'student': 'code'}, inplace=True)
temp_df = pd.merge(temp_df, similarity_with_stu_w_100_codes_df, on='code', how='left')

# Display the first few rows of the merged dataframe
temp_df.head()

#### Handling With Left Skewed Data

In [None]:
df = temp_df.copy()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox

In [None]:
df_transform = df.copy()
df_transform['grade_log'] = np.log1p(df_transform['grade'])
df_transform['grade_boxcox'], _ = boxcox(df_transform['grade']+1)
df_transform['grade_sqrt'] = np.sqrt(df_transform['grade'])

In [None]:
# Plot original and transformed distributions
sns.histplot(df_transform['grade'], kde=True, label='Original')
plt.legend()
plt.show()

In [None]:
sns.histplot(df_transform['grade_log'], kde=True, label='Log-transformed')
plt.legend()
plt.show()

In [None]:
sns.histplot(df_transform['grade_boxcox'], kde=True, label='Boxcox-transformed')
plt.legend()
plt.show()

In [None]:
sns.histplot(df_transform['grade_sqrt'], kde=True, label='Sqrt-transformed')
plt.legend()
plt.show()

#### The grade distribution is closer to normal distribution when Boxcox transformation is applied.

In [None]:
# Replace 'grade' in df with 'grade_boxcox' from df_transform
df['grade'] = df_transform['grade_boxcox']


In [None]:
df.columns

#### Train/Test split

In [None]:
from sklearn.utils import shuffle

In [None]:
# Shuffle the rows reproducibly, then separate the target ('grade') from the
# predictors ('grade' and the 'code' identifier column are both dropped from X).
df_shuffled = shuffle(df, random_state=42)
cols_to_drop = ['grade', "code"]
y = df_shuffled['grade']
X = df_shuffled.drop(columns=cols_to_drop)

# Random 80/20 hold-out split with a fixed seed. Note that no `stratify`
# argument is passed, so this is a plain (non-stratified) split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the size of every partition
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

## Features and Correlations

In [None]:
# Work on a copy of df so the object -> numeric coercion does not touch df.
check_df = df.copy()

# Coerce every object-dtype column to numbers; values that cannot be parsed
# become NaN (errors='coerce').
object_cols = [name for name in check_df.columns if check_df[name].dtype == 'object']
for name in object_cols:
    check_df[name] = pd.to_numeric(check_df[name], errors='coerce')

# Show the dtypes after the conversion
print(check_df.dtypes)

In [None]:
# 'code' cannot be turned into a numeric column, so remove it from check_df.
check_df = check_df.drop('code', axis=1)
check_df

In [None]:
import seaborn as sns

# Pairwise correlations between all columns of check_df, plus every column's
# correlation with 'grade' ordered from highest to lowest.
correlation_matrix = check_df.corr()
target_correlation = correlation_matrix['grade'].sort_values(ascending=False)

# Draw the full correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Print each column's correlation with 'grade' (sorted in descending order
# where target_correlation is built).
print(target_correlation)

### Dimensionality Reduction

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the RobustScaler on the training rows only. Fitting it on the whole
# dataset X would let statistics of the held-out test rows leak into the
# preprocessing (and into anything fitted on X_scaled afterwards).
# X_scaled therefore holds the scaled *training* features.
robust_scaler = RobustScaler()
X_scaled = robust_scaler.fit_transform(X_train)


In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with at most 30 components on the scaled features and keep the
# share of variance explained by each component.
pca = PCA(n_components=30).fit(X_scaled)
exp_var_ratio = pca.explained_variance_ratio_

desired_var = 0.95
n_comps = 0
count_exp_var = 0

# Add up the explained variance component by component and stop as soon as
# the running total reaches the desired share.
for n_comps, ratio in enumerate(exp_var_ratio, start=1):
    count_exp_var += ratio
    if count_exp_var >= desired_var:
        break

print("Number of components needed:", n_comps)


In [None]:
# Running total of explained variance, plotted against the number of
# principal components kept (1-based on the x axis).
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, cumulative_variance_ratio.size + 1)

plt.plot(component_counts, cumulative_variance_ratio, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()

#### Good news: PC1 alone accounts for 0.95 of the variance.

In [None]:
# Project the training and testing sets onto the principal components.
# The PCA was fitted on RobustScaler output, so each split has to go through
# the same fitted scaler first; feeding unscaled features to pca.transform
# would project data that lives in a different feature scale.
# NOTE: this cell overwrites X_train / X_test and must only be run once per
# kernel session (re-run the train/test split cell before running it again).
X_train = pca.transform(robust_scaler.transform(X_train))
X_test = pca.transform(robust_scaler.transform(X_test))

In [None]:
# Hyperparameter tuning for the decision tree regressor

from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=['sqrt', 'log2', None],
)

# Seeded base estimator so repeated searches give the same trees
dt_regressor = DecisionTreeRegressor(random_state=42)

# Exhaustive search over the grid, scored by negated MSE with 5-fold CV
grid_search = GridSearchCV(
    dt_regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score (Negative Mean Squared Error):", best_score)


In [None]:
# Pairwise brute-force search: for every scoring metric, each *pair* of
# hyperparameters from param_grid is tuned jointly with GridSearchCV (all other
# settings stay at the model's defaults) and the best-scoring pair is kept.
# Running it for several metrics shows under which metric the data does best.
# Cost warning: C(11, 2) = 55 grid searches per metric, each with 5-fold CV.

from itertools import combinations

# Define the parameter grid for regression
# NOTE(review): the 'poisson' criterion needs a non-negative target; confirm
# the transformed grade satisfies that, otherwise those fits score NaN.
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'max_features': [None, 'sqrt', 'log2', 0.25, 0.5, 0.75],
    'random_state': [None, 42, 100],
    'max_leaf_nodes': [None, 5, 10, 15],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'ccp_alpha': [0.0, 0.1, 0.2]
}

# Create the model for regression
model = DecisionTreeRegressor(random_state=42)

# Define the scoring methods
scoring_methods = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

# Initialize a dictionary to store the best parameters and scores for each scoring method
best_results = {method: {'params': None, 'score': float('-inf')} for method in scoring_methods}

# Iterate through each scoring method
for scoring_method in scoring_methods:
    # Every scikit-learn scorer follows "greater is better" (error metrics are
    # negated, R² is used as-is), so the best score is always the maximum.
    # The previous version minimised R² and therefore reported the worst pair.
    best_params = {}
    best_score = float('-inf')

    # Iterate through each pair of hyperparameters
    for param1, param2 in combinations(param_grid, 2):
        param_set = {param1: param_grid[param1], param2: param_grid[param2]}
        grid_search = GridSearchCV(model, param_set, cv=5, scoring=scoring_method)
        grid_search.fit(X_train, y_train)
        cv_score = grid_search.best_score_

        # Keep the pair with the highest cross-validated score
        # (a NaN score never compares greater, so failed fits are skipped).
        if cv_score > best_score:
            best_score = cv_score
            best_params = grid_search.best_params_

    # Store the best results for the current scoring method
    best_results[scoring_method] = {'params': best_params, 'score': best_score}

# Print the best hyperparameters and scores for each scoring method
for method, result in best_results.items():
    print(f"Best Parameters for {method}: {result['params']}")
    print(f"Best Score for {method}: {result['score']}")


#### Fitting a model

#### Predicting and Analyzing 

In [None]:
# Seeded decision tree with squared-error splits, limited to a depth of 10,
# fitted on the training split.
regressor = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=10,
    random_state=0,
)
regressor.fit(X_train, y_train)

In [None]:
# tree_.impurity holds one impurity value per tree node; print each of them
# together with its node index.
extracted_MSEs = regressor.tree_.impurity
for idx, MSE in enumerate(extracted_MSEs):
    print("Node {} has MSE {}".format(idx, MSE))

In [None]:
# Names of the original (pre-PCA) feature columns, kept for reference.
original_feature_names = ['#user_prompts', '#error', '#no', '#thank', '#next', '#entropy',
       'prompt_avg_chars', 'response_avg_chars', '#_hw_statements',
       '#_question_statements', '#_understand_statements',
       '#_example_statements', 'avg_code_length', 'code_response_ratio',
       'imperative_sentence_count', '#given_errors', 'is_info_gain_negative',
       'avg_sentences_in_prompts', '#sentences_in_prompts', 'is_dataset_given', 'Q_0', 'Q_1', 'Q_2', 'Q_3', 'Q_4', 'Q_5', 'Q_6', 'Q_7', 'Q_8',
       'similarity_with_stu_w_100_prompts', 'similarity_with_stu_w_100_codes']

# The regressor is fitted on PCA output, so its inputs are principal
# components rather than the 31 original columns. export_graphviz requires
# exactly one name per model input, so label the inputs PC1..PCk instead of
# passing the original names (which would fail the length check and, even if
# the lengths matched, would mislabel the split nodes).
feature_names = [f"PC{i}" for i in range(1, regressor.n_features_in_ + 1)]

In [None]:
# Export the fitted tree as Graphviz DOT source (returned as a string because
# out_file is None) and render it under the base name "hw".
dot_data = tree.export_graphviz(
    regressor,
    feature_names=feature_names,
    out_file=None,
)
graph = graphviz.Source(dot_data)
graph.render("hw")

In [None]:
# Predict on both splits with the fitted tree
y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

# Error and goodness-of-fit on each split; comparing train vs. test exposes
# over-fitting.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print("MSE Train:", mse_train)
print("MSE TEST:", mse_test)

print("R2 Train:", r2_train)
print("R2 TEST:", r2_test)


Gradient Boosting and hyperparameter tuning

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Base gradient-boosting estimator with default settings
gb = GradientBoostingRegressor()

# Candidate values for each tuned hyperparameter
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search scored by negated MSE
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best hyperparameters: ", grid_search.best_params_)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: statistics are learned from the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fixed hyperparameters for the final Gradient Boosting model.
# NOTE(review): these are hard-coded rather than read from
# grid_search.best_params_ — confirm they still match the search result.
params = dict(
    learning_rate=0.01,
    max_depth=3,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=200,
)

# Build and train the model on the standardised training data
gb_model = GradientBoostingRegressor(**params)
gb_model.fit(X_train_scaled, y_train)

# Score the held-out split
y_pred = gb_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error on Test Set: ", mse)
print("R-squared (R2) Score on Test Set: ", r2)

Elastic Net Regression:

    Combines L1 and L2 regularization. It helps when there are many correlated features.


In [None]:
from sklearn.linear_model import ElasticNet

# Search space: overall penalty strength (alpha) and the L1/L2 mix (l1_ratio)
param_grid = dict(
    alpha=[0.01, 0.1, 1.0, 10.0],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)

# Elastic Net with default settings wrapped in a 5-fold grid search
elastic_net = ElasticNet()
grid_search = GridSearchCV(
    elastic_net,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Re-fit the existing scaler on the training split and apply it to both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Run the search on the standardised training data
grid_search.fit(X_train_scaled, y_train)

# Pull the winning values out of the search result
tuned = grid_search.best_params_
best_alpha = tuned['alpha']
best_l1_ratio = tuned['l1_ratio']

print("Best Hyperparameters: alpha =", best_alpha, ", l1_ratio =", best_l1_ratio)



In [None]:
# Final Elastic Net with fixed alpha / l1_ratio.
# NOTE(review): the values are hard-coded rather than taken from
# best_alpha / best_l1_ratio — confirm they match the grid-search result.
elastic_net = ElasticNet(l1_ratio=0.7, alpha=10.0)

# Train on the standardised training split
elastic_net.fit(X_train_scaled, y_train)

# Predict the held-out split and score the predictions
y_pred = elastic_net.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error on Test Set: ", mse)
print("R-squared (R2) Score on Test Set: ", r2)



