In [3]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("RJT1990/GeneralThoughtArchive")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.DataFrame(ds['train'])
print(df.head())

   question_id                                       question_url  \
0       806845  https://gr.inc/question/how-do-the-neural-resp...   
1      1730456  https://gr.inc/question/lets-consider-some-arr...   
2      3236068  https://gr.inc/question/what-are-the-primary-c...   
3      1717140  https://gr.inc/question/consider-a-football-to...   
4      3235833  https://gr.inc/question/given-the-context-of-h...   

                                            question  \
0     How do the neural respiratory centers operate?   
1  Let's consider some array A. The following alg...   
2  What are the primary criticisms Aristotle rais...   
3  Consider a football tournament where n teams p...   
4  Given the context of Heidegger's philosophy, p...   

                                    reference_answer prev_messages  \
0  In the medulla oblongata, respiratory neurons ...          None   
1                                               None          None   
2  Aristotle's criticisms of Plato's t

In [6]:
df.shape

(430788, 14)

In [7]:
# Check data types, non-null counts, memory usage
print(df.info())
print("\nColumn names:")
print(df.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430788 entries, 0 to 430787
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   question_id               430788 non-null  int64  
 1   question_url              430788 non-null  object 
 2   question                  430788 non-null  object 
 3   reference_answer          306178 non-null  object 
 4   prev_messages             102234 non-null  object 
 5   model_name                430788 non-null  object 
 6   model_answer              430788 non-null  object 
 7   model_reasoning           428151 non-null  object 
 8   task                      430788 non-null  object 
 9   question_license          429474 non-null  object 
 10  question_source           430788 non-null  object 
 11  community_answer_score    430788 non-null  int64  
 12  community_question_score  430788 non-null  int64  
 13  verifier_score            338377 non-null  f

In [8]:
# Missing value analysis
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nMissing value percentages:")
print((df.isnull().sum() / len(df)) * 100)


Missing values per column:
question_id                      0
question_url                     0
question                         0
reference_answer            124610
prev_messages               328554
model_name                       0
model_answer                     0
model_reasoning               2637
task                             0
question_license              1314
question_source                  0
community_answer_score           0
community_question_score         0
verifier_score               92411
dtype: int64

Missing value percentages:
question_id                  0.000000
question_url                 0.000000
question                     0.000000
reference_answer            28.926061
prev_messages               76.268141
model_name                   0.000000
model_answer                 0.000000
model_reasoning              0.612134
task                         0.000000
question_license             0.305022
question_source              0.000000
community_answer_score 

In [9]:
# Check if critical text fields are populated
print("\nSample of model_reasoning (first 3 rows):")
for i in range(min(3, len(df))):
    print(f"\n--- Row {i} ---")
    print(df.iloc[i]['model_reasoning'][:500])  # First 500 chars


Sample of model_reasoning (first 3 rows):

--- Row 0 ---
Okay, so I need to figure out how the neural respiratory centers work. Let me start by recalling what I know about breathing and the nervous system. I remember that breathing is controlled by parts of the brain, right? There's the medulla and the pons mentioned in some biology classes. But wait, how exactly do they work together?

First, there's the medulla oblongata. I think that's the primary control center. It has groups of neurons called the dorsal respiratory group and the ventral respirato

--- Row 1 ---
Okay, let's see. I need to find the maximum possible force of an array after reordering some elements. The force is calculated by summing the squares of the lengths of consecutive equal elements. So the problem is, given the array generated by a specific recurrence relation, how can I rearrange its elements to maximize this sum of squares?

Hmm. The key here is that the optimal arrangement groups as many same elements as p

In [10]:
# Check reasoning field completeness
print("\nModel reasoning missing:")
print(df['model_answer'].isnull().sum())

# Drop rows without reasoning (can't extract features)
df_clean = df[df['model_answer'].notna()].copy()
print(f"\nDataset size after removing missing reasoning: {df_clean.shape}")



Model reasoning missing:
0

Dataset size after removing missing reasoning: (430788, 14)


In [11]:
import re

# Token count
df_clean['feat_token_count'] = df_clean['model_answer'].apply(lambda x: len(str(x).split()))

# Sentence count (periods, exclamation, question marks)
df_clean['feat_sentence_count'] = df_clean['model_answer'].apply(
    lambda x: len(re.findall(r'[.!?]+', str(x)))
)

# Step markers (1., 2., Step 1, etc.)
df_clean['feat_step_markers'] = df_clean['model_answer'].apply(
    lambda x: len(re.findall(r'\b(?:step\s*\d+|^\d+\.|\n\d+\.)', str(x).lower()))
)

# Average sentence length
df_clean['feat_avg_sentence_len'] = (df_clean['feat_token_count'] / 
                                      (df_clean['feat_sentence_count'] + 1))

# Max sentence length (split by periods, find longest)
def max_sent_len(text):
    sentences = re.split(r'[.!?]+', str(text))
    if not sentences:
        return 0
    return max(len(s.split()) for s in sentences)

df_clean['feat_max_sentence_len'] = df_clean['model_answer'].apply(max_sent_len)


In [12]:
# LaTeX operator counts (sum, frac, etc.)
df_clean['feat_latex_operators'] = df_clean['model_answer'].apply(
    lambda x: len(re.findall(r'\\(?:frac|sum|int|times|div|sqrt)', str(x)))
)

# Digit ratio (proportion of characters that are digits)
df_clean['feat_digit_ratio'] = df_clean['model_answer'].apply(
    lambda x: sum(c.isdigit() for c in str(x)) / (len(str(x)) + 1)
)

# Equation sign density (=, +, -, *, /)
df_clean['feat_equation_signs'] = df_clean['model_answer'].apply(
    lambda x: len(re.findall(r'[=+\-*/]', str(x)))
)

# Operator variety (unique math operators)
def operator_variety(text):
    operators = set(re.findall(r'[+\-*/=<>≤≥]', str(text)))
    return len(operators)

df_clean['feat_operator_variety'] = df_clean['model_answer'].apply(operator_variety)

# Code block presence (``````)
df_clean['feat_has_code_block'] = df_clean['model_answer'].apply(
    lambda x: 1 if '```' in str(x) else 0
)


In [None]:
# Connective words (because, therefore, thus, so, hence)
logic_connectives = ['because', 'therefore', 'thus', 'hence', 'since', 'consequently']
df_clean['feat_logic_connectives'] = df_clean['model_answer'].apply(
    lambda x: sum(str(x).lower().count(word) for word in logic_connectives)
)

# Self-correction markers (wait, actually, correction, mistake)
correction_words = ['wait', 'actually', 'correction', 'mistake', 'error', 'wrong']
df_clean['feat_self_corrections'] = df_clean['model_answer'].apply(
    lambda x: sum(str(x).lower().count(word) for word in correction_words)
)

# Contradiction indicators (but, however near numbers)
def contradictions_near_numbers(text):
    text_lower = str(text).lower()
    # Find "but" or "however" within 10 words of a number
    matches = re.findall(r'(?:\d+.{0,50}(?:but|however))|(?:(?:but|however).{0,50}\d+)', text_lower)
    return len(matches)

df_clean['feat_contradiction_markers'] = df_clean['model_answer'].apply(contradictions_near_numbers)


In [None]:
# Question-to-reasoning overlap (Jaccard similarity)
def jaccard_similarity(text1, text2):
    set1 = set(str(text1).lower().split())
    set2 = set(str(text2).lower().split())
    if not set1 or not set2:
        return 0
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

df_clean['feat_question_overlap'] = df_clean.apply(
    lambda row: jaccard_similarity(row['question'], row['model_reasoning']), axis=1
)

# Extra numbers (numbers in reasoning not in question)
def extract_numbers(text):
    return set(re.findall(r'\d+\.?\d*', str(text)))

df_clean['feat_extra_numbers'] = df_clean.apply(
    lambda row: len(extract_numbers(row['model_reasoning']) - extract_numbers(row['question'])),
    axis=1
)

# Final answer formatting (presence of "Answer:", "Final answer:", etc.)
df_clean['feat_has_answer_marker'] = df_clean['model_reasoning'].apply(
    lambda x: 1 if re.search(r'\b(?:answer|conclusion|result):', str(x).lower()) else 0
)


In [None]:
# Type-token ratio (unique words / total words)
def type_token_ratio(text):
    words = str(text).lower().split()
    if not words:
        return 0
    return len(set(words)) / len(words)

df_clean['feat_type_token_ratio'] = df_clean['model_reasoning'].apply(type_token_ratio)

# Punctuation density
df_clean['feat_punctuation_density'] = df_clean['model_reasoning'].apply(
    lambda x: len(re.findall(r'[.,;:!?]', str(x))) / (len(str(x)) + 1)
)

# Parentheses balance (are they properly matched?)
def parentheses_balanced(text):
    count = 0
    for char in str(text):
        if char == '(':
            count += 1
        elif char == ')':
            count -= 1
        if count < 0:
            return 0  # Unbalanced
    return 1 if count == 0 else 0

df_clean['feat_parentheses_balanced'] = df_clean['model_reasoning'].apply(parentheses_balanced)

# Repeated n-gram rate (3-grams that appear more than once)
def repeated_trigrams(text):
    words = str(text).lower().split()
    if len(words) < 3:
        return 0
    trigrams = [' '.join(words[i:i+3]) for i in range(len(words)-2)]
    if not trigrams:
        return 0
    unique_trigrams = len(set(trigrams))
    return 1 - (unique_trigrams / len(trigrams))

df_clean['feat_repeated_trigrams'] = df_clean['model_reasoning'].apply(repeated_trigrams)
