In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score

from sklearn import tree as treeViz
import graphviz
from IPython.display import display
from dataset_reader import DataReader

In [57]:
file_name = "training_data_clean.csv"

def process_multiselect(series, target_tasks):
    """Convert multiselect strings to lists, keeping only specified features"""
    processed = []
    for response in series:
        if pd.isna(response) or response == '':
            processed.append([])
        else:
            # Check which of the target tasks are present in the response
            present_tasks = [task for task in target_tasks if task in str(response)]
            processed.append(present_tasks)
    return processed

def extract_rating(response):
    """
    Extract numeric rating from responses like '3 - Sometimes'.
    Returns None for missing responses
    """
    match = re.match(r'^(\d+)', str(response))
    return int(match.group(1)) if match else None

def visualize_tree(model, max_depth=5):
    """ 
    Generate and return an image representing an Sklearn decision tree.

    Each node in the visualization represents a node in the decision tree.
    In addition, visualization for each node contains:
        - The feature that is split on
        - The entropy (of the outputs `t`) at the node
        - The number of training samples at the node
        - The number of training samples with true/false values
        - The majority class (heart disease or not)
    The colour of the node also shows the majority class and purity

    See here: https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html

    Parameters:
        `model` - An Sklearn decision tree model
        `max_depth` - Max depth of decision tree to be rendered in the notebook.
         This is useful since the tree can get very large if the max_depth is
         set too high and thus making the resulting figure difficult to interpret.
    """
    dot_data = treeViz.export_graphviz(model,
                                       feature_names=target_tasks,
                                       max_depth=max_depth,
                                       class_names=['ChatGPT', 'Claude', 'Gemini'],
                                       filled=True,
                                       rounded=True)
    return display(graphviz.Source(dot_data))

In [58]:
df = pd.read_csv(file_name)
df.dropna(inplace=True)

In [61]:
target_tasks = [
        'Math computations',
        'Writing or debugging code',
        'Data processing or analysis', 
        'Explaining complex concepts simply',
        'Converting content between formats (e.g., LaTeX)',
        'Writing or editing essays/reports',
        'Drafting professional text (e.g., emails, résumés)',
        'Brainstorming or generating creative ideas'
    ]
best_tasks_lists = process_multiselect(df['Which types of tasks do you feel this model handles best? (Select all that apply.)'], target_tasks)
suboptimal_tasks_lists = process_multiselect(df['For which types of tasks do you feel this model tends to give suboptimal responses? (Select all that apply.)'], target_tasks)


mlb_best = MultiLabelBinarizer()
mlb_subopt = MultiLabelBinarizer()
    
best_tasks_encoded = mlb_best.fit_transform(best_tasks_lists)
suboptimal_tasks_encoded = mlb_subopt.fit_transform(suboptimal_tasks_lists)

# Use some rating features
academic_numeric = df['How likely are you to use this model for academic tasks?'].apply(extract_rating)
subopt_numeric = df['Based on your experience, how often has this model given you a response that felt suboptimal?'].apply(extract_rating)
# Combine features
X = np.hstack([academic_numeric.values.reshape(-1, 1), subopt_numeric.values.reshape(-1, 1), best_tasks_encoded, suboptimal_tasks_encoded])
y = df['label'].values

n_train = int(0.7 * len(X))
X_train, y_train, X_test, y_test = X[:n_train], y[:n_train], X[n_train:], y[n_train:]

print(X.shape)


(734, 18)


In [None]:
"""dr = DataReader("training_data_clean.csv")
X, y = dr.to_numpy()
print(X.shape)
print(y.shape)
le = LabelEncoder()
y_encoded = le.fit_transform(y) 

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)
"""

(734, 6170)
(734,)
[0 1 1 ... 0 0 0]


In [90]:
n_estimators = [300, 400]

for i in n_estimators:
    print("Using " + str(i) + " trees:")
    model = RandomForestClassifier(
        n_estimators=i, 
        min_samples_split=20,
        min_samples_leaf=10, 
        #random_state=42,
    )

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred  = model.predict(X_test)

    print("train acc:", accuracy_score(y_train, y_train_pred))
    print("test  acc:", accuracy_score(y_test, y_test_pred))

#my stuff
#train_acc = model.score(X_train, y_train)
#test_acc = model.score(X_test, y_test)

#print(f"Training accuracy: {train_acc:.3f}")
#print(f"Test accuracy: {test_acc:.3f}")

Using 300 trees:
train acc: 0.6842105263157895
test  acc: 0.7647058823529411
Using 400 trees:
train acc: 0.6881091617933723
test  acc: 0.7647058823529411


In [None]:
# Access the individual decision trees
for i, tree in enumerate(model.estimators_):
    print(tree.get_params())