# Abstract Algebra Dataset

In [2]:
import sys
import os
import pandas as pd
import json

from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Put the parent directory on the module search path.
# NOTE(review): no module from the parent directory is imported in the cells
# visible here -- confirm this is still needed.
parent_dir = os.path.abspath('../')
sys.path.append(parent_dir)

## Loading and preprocessing the data

### Putnam Bench (273)

- 25 rows remain after filtering for the `abstract_algebra` tag

In [1]:
# Download / load PutnamBench from the Hugging Face Hub.
# `load_dataset` comes from the import cell at the top of the notebook; it is
# not re-imported here so that all imports live in one place and the notebook
# runs top to bottom on a fresh kernel.
PUTNAM_BENCH_ID = "amitayusht/PutnamBench"
ds = load_dataset(PUTNAM_BENCH_ID)

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
# Materialise the 'train' split of the loaded dataset as a pandas DataFrame.
df = pd.DataFrame(ds['train'])

In [4]:
# Columns that are not needed for the Lean 4 export.
DROPPED_COLUMNS = ['coq_statement', 'isabelle_statement', 'informal_solution']


def _split_header(statement):
    """Split a Lean statement at the first occurrence of 'theorem'.

    Returns a ``(header, theorem)`` tuple: ``header`` is everything before the
    keyword and ``theorem`` starts with the keyword itself. When the keyword is
    absent the header is empty and the statement is returned unchanged, so the
    statement text is never duplicated into the header column.
    """
    # NOTE(review): this matches the substring 'theorem' anywhere (as the
    # previous split-based code did), not only a keyword at the start of a line.
    header, keyword, rest = statement.partition('theorem')
    if not keyword:
        return '', statement
    return header, keyword + rest


# NOTE: this cell rebinds `df`; run it exactly once after `df` is created.
# Re-running it raises a KeyError because the dropped columns are already gone.
df = (
    df.drop(columns=DROPPED_COLUMNS)
    .dropna()  # rows with any remaining missing value are discarded
    .rename(columns={'lean4_statement': 'formal_statement'})
)

# Single pass over the statements; plain lists keep this safe for an empty frame.
statement_parts = [_split_header(statement) for statement in df['formal_statement']]
df['header'] = [header for header, _ in statement_parts]
df['formal_statement'] = [theorem for _, theorem in statement_parts]

In [5]:
# Collect every distinct tag that occurs in the `tags` column.
# Each `tags` value is handled as the string form of a list, e.g.
# "['abstract_algebra', 'linear_algebra']": brackets and quotes are stripped
# and the remainder is split on commas.
all_tags = set()

for raw_tags in df.tags.unique():
    cleaned = raw_tags.replace('[', '').replace(']', '').replace('\'', '')
    # Skip empty pieces so an empty list string ("[]") does not add '' as a tag.
    all_tags.update(piece.strip() for piece in cleaned.split(',') if piece.strip())

print(all_tags)

{'set_theory', 'number_theory', 'abstract_algebra', 'combinatorics', 'analysis', 'linear_algebra', 'probability', 'geometry', 'algebra'}


In [6]:
# Tags to keep; a row is selected when its `tags` string contains any of them.
wanted_tags = ['abstract_algebra']

# `str.contains` treats the pattern as a regular expression by default, so the
# tags are combined into an alternation ("a|b|...").
tag_pattern = '|'.join(wanted_tags)
putnam_df = df[df.tags.str.contains(tag_pattern)]

# Derive the working frame from the same filter, so that editing `wanted_tags`
# really changes what gets exported (previously the tag was hard-coded a second
# time here). `.copy()` gives later cells an independent frame to add columns to.
# NOTE(review): the `NT` in the name does not match the abstract_algebra filter;
# the name is kept because later cells refer to it.
putnam_NT_df = putnam_df.copy()


In [7]:
# Fixed seed so the valid/test assignment is identical on every run.
SPLIT_SEED = 42

# Shuffle the rows. Sorting by `name` first makes the result independent of the
# incoming row order, so re-running this cell yields the same permutation.
# NOTE(review): assumes `name` values are unique -- confirm.
putnam_NT_df = (
    putnam_NT_df.sort_values('name')
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# First half of the shuffled rows (rounded up for odd counts) -> 'valid',
# the remainder -> 'test'.
n_rows = len(putnam_NT_df)
putnam_NT_df['split'] = ['valid' if i < n_rows / 2 else 'test' for i in range(n_rows)]

In [8]:
# Number of rows in the filtered, shuffled frame.
len(putnam_NT_df)

25

In [9]:
# Preview the first rows, including the derived `header` and `split` columns.
putnam_NT_df.head()

Unnamed: 0,name,formal_statement,informal_statement,tags,header,split
0,putnam_1985_b6,theorem putnam_1985_b6\n(n : ℕ)\n(npos : n > 0...,Let $G$ be a finite set of real $n\times n$ ma...,"['abstract_algebra', 'linear_algebra']",,valid
1,putnam_2009_a5,theorem putnam_2009_a5\n: (∃ (G : Type*) (_ : ...,Is there a finite abelian group $G$ such that ...,['abstract_algebra'],abbrev putnam_2009_a5_solution : Prop := sorry...,valid
2,putnam_1969_b2,theorem putnam_1969_b2\n(G : Type*)\n[Group G]...,Show that a finite group can not be the union ...,['abstract_algebra'],abbrev putnam_1969_b2_solution : Prop := sorry...,valid
3,putnam_1977_b6,theorem putnam_1977_b6\n[Group G]\n(H : Subgro...,Let $G$ be a group and $H$ be a subgroup of $G...,['abstract_algebra'],,valid
4,putnam_2012_a2,theorem putnam_2012_a2\n(S : Type*) [CommSemig...,Let $*$ be a commutative and associative binar...,['abstract_algebra'],,valid


#### Look at the data:

In [10]:
# Show the natural-language statement of the first row.
first_informal = putnam_NT_df['informal_statement'].iloc[0]
print(first_informal)

Let $G$ be a finite set of real $n\times n$ matrices $\{M_i\}$, $1 \leq i \leq r$, which form a group under matrix
multiplication. Suppose that $\sum_{i=1}^r \mathrm{tr}(M_i)=0$, where $\mathrm{tr}(A)$ denotes the trace of the matrix $A$. Prove that $\sum_{i=1}^r M_i$ is the $n \times n$ zero matrix.


In [11]:
# Show the formal statement of the same (first) row.
first_formal = putnam_NT_df['formal_statement'].iloc[0]
print(first_formal)

theorem putnam_1985_b6
(n : ℕ)
(npos : n > 0)
(G : Finset (Matrix (Fin n) (Fin n) ℝ))
(groupG : (∀ g ∈ G, ∀ h ∈ G, g * h ∈ G) ∧ 1 ∈ G ∧ (∀ g ∈ G, ∃ h ∈ G, g * h = 1))
(hG : ∑ M in G, Matrix.trace M = 0)
: (∑ M in G, M = 0) :=
sorry


In [12]:
# Inspect the raw `tags` value of the row at position 3.
NT_tag = putnam_NT_df['tags'].iloc[3]
print(NT_tag)

['abstract_algebra']


In [56]:
# Sanity checks before export. These replace an unconditional
# `raise Exception('stop')`, which made "Restart & Run All" abort before the
# export cells could run.
assert len(putnam_NT_df) > 0, 'no rows selected for export'
assert putnam_NT_df['split'].isin(['valid', 'test']).all(), 'unexpected split label'

## Export

In [13]:
# `final_df` is another name for the same frame (no copy is made).
final_df = putnam_NT_df

In [14]:
# Write one JSON Lines file per split (one record per line).
split_outputs = {
    'test': 'ADV_test.jsonl',
    'valid': 'ADV_validation.jsonl',
}
for split_label, out_path in split_outputs.items():
    split_rows = final_df[final_df['split'] == split_label]
    split_rows.to_json(out_path, orient='records', lines=True)