# Discrete Math Dataset

In [2]:
import sys
import os
import pandas as pd
import json

from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Make modules in the parent directory importable from this notebook.
# NOTE(review): no cell visible here imports from the parent directory — confirm this is still needed.
sys.path.append(os.path.abspath('../'))

## Loading and preprocessing the data

### Proof Net (?)

`hoskinson-center/proofnet` is written in Lean 3, so we use the Lean 4 translation from the *DeepSeek-Prover* paper.

There is not much discrete math here; maybe the `Ireland-Rosen` or `Rudin` subsets could be used?

In [15]:
# Load the ProofNet problems from the local JSON Lines file (one JSON record per line).
df = pd.read_json('proofnet.jsonl', lines=True)

In [16]:
# Prefix every problem name with its source dataset and discard the `goal` column.
df = df.assign(name='proofnet_' + df['name']).drop(columns=['goal'])

In [17]:
# Select the rows whose (prefixed) problem name contains the substring 'rosen'.
# NOTE(review): the recorded output is an empty frame — the names shown in this
# notebook look like `proofnet_exercise_...` and do not seem to carry the textbook
# name; confirm where the source book is stored before relying on this filter.
rosen_df = df[df['name'].str.contains('rosen')]
print(rosen_df)

Empty DataFrame
Columns: [name, split, informal_prefix, formal_statement, header]
Index: []


In [18]:
# Preview the first rows of the ProofNet frame.
df.head()

Unnamed: 0,name,split,informal_prefix,formal_statement,header
0,proofnet_exercise_1_13a,valid,/-- Suppose that $f$ is holomorphic in an open...,theorem exercise_1_13a {f : ℂ → ℂ} (Ω : Set ℂ)...,import Mathlib\n\nopen Complex Filter Function...
1,proofnet_exercise_1_13b,test,/-- Suppose that $f$ is holomorphic in an open...,theorem exercise_1_13b {f : ℂ → ℂ} (Ω : Set ℂ)...,import Mathlib\n\nopen Complex Filter Function...
2,proofnet_exercise_1_13c,valid,/-- Suppose that $f$ is holomorphic in an open...,theorem exercise_1_13c {f : ℂ → ℂ} (Ω : Set ℂ)...,import Mathlib\n\nopen Complex Filter Function...
3,proofnet_exercise_1_19a,test,/-- Prove that the power series $\sum nz^n$ do...,theorem exercise_1_19a (z : ℂ) (hz : abs z = 1...,import Mathlib\n\nopen Complex Filter Function...
4,proofnet_exercise_1_19b,valid,/-- Prove that the power series $\sum zn/n^2$ ...,theorem exercise_1_19b (z : ℂ) (hz : abs z = 1...,import Mathlib\n\nopen Complex Filter Function...


In [19]:
# Show the natural-language statement of the first ProofNet problem.
# (The previous name `putnam_NT_putnam_NT_df` is never defined in this notebook and
# raised NameError on a fresh kernel; the ProofNet frame at this point is `df`.)
print(df.iloc[0].informal_prefix)

/-- Suppose that $f$ is holomorphic in an open set $\Omega$. Prove that if $\text{Re}(f)$ is constant, then $f$ is constant.-/



In [20]:
# Show the formal statement of the first ProofNet problem.
print(df.iloc[0].formal_statement)

theorem exercise_1_13a {f : ℂ → ℂ} (Ω : Set ℂ) (a b : Ω) (h : IsOpen Ω)
  (hf : DifferentiableOn ℂ f Ω) (hc : ∃ (c : ℝ), ∀ z ∈ Ω, (f z).re = c) :
  f a = f b :=


In [21]:
# Gather every distinct line that appears in any problem's header.
topics = set()
for header_text in df['header']:
    topics.update(header_text.split('\n'))

# Print the distinct header lines, one per line (set order is arbitrary).
for topic in topics:
    print(topic)


open scoped BigOperators
| 0 => sqrt 2
  generateFrom {S : Set X | ∃ a b, a < b ∧ S = Ico a b}
def countably_compact (X : Type*) [TopologicalSpace X] :=
def rational (x : ℝ) := x ∈ range ((↑) : ℚ → ℝ)
  ∀ U : Set X, Infinite U → ∃ x ∈ U, ClusterPt x (𝓟 U)
noncomputable section
def lower_limit_topology (X : Type) [Preorder X] :=
  (∀ i, IsOpen (U i)) ∧ ((univ : Set X) ⊆ ⋃ i, U i) →
  (∃ t : Finset ℕ, (univ : Set X) ⊆ ⋃ i ∈ t, U i)
open Topology Filter Real Complex TopologicalSpace Finset
| (n + 1) => sqrt (2 + sqrt (f n))
open scoped BigOperators Topology
open Fintype Set Real Ideal Polynomial
  ∀ U : ℕ → Set X,
  univ ∈ T ∧
import Mathlib
  ({S : Set ℝ | ∃ a b, a < b ∧ S = Ioo a b} ∪ {S : Set ℝ | ∃ a b, a < b ∧ S = Ioo a b \ K})
open Complex Filter Function Metric Finset
def K : Set ℝ := {r | ∃ n : ℕ, r = 1 / n}
noncomputable def f : ℕ → ℝ
def g (n : ℕ) : ℝ := sqrt (n + 1) - sqrt n
set_option checkBinderAnnotations false
open Filter Real Function
open RingHom
--center of (G × H) equiv

In [23]:
# Deliberate hard stop at the end of the ProofNet exploration.
# NOTE(review): this cell always raises, so "Run All" will presumably halt here and
# never reach the sections below — remove or skip it to run the notebook end to end.
raise Exception('stop')

Exception: stop

### Putnam Bench (110 - 98)

- 110 rows for `{'set_theory', 'probability', 'number_theory', 'combinatorics'}`
- 98 rows for `number_theory`

In [46]:
# Load the PutnamBench dataset by its Hugging Face dataset id.
ds = load_dataset("amitayusht/PutnamBench")

Repo card metadata block was not found. Setting CardData to empty.


In [47]:
# Materialise the 'train' split as a pandas DataFrame (rebinds `df`).
df = pd.DataFrame(ds['train'])

In [48]:
# Drop the Coq/Isabelle statements and the informal solution, discard rows that
# have any missing value, and expose the Lean 4 statement as `formal_statement`.
df = (
    df.drop(columns=['coq_statement', 'isabelle_statement', 'informal_solution'])
      .dropna()
      .rename(columns={'lean4_statement': 'formal_statement'})
)

# Split each Lean source at the first occurrence of 'theorem': the text before it
# becomes `header`, the text from 'theorem' onwards stays in `formal_statement`.
# A source that does not contain 'theorem' is kept whole in both columns.
lean_source = df['formal_statement']
df['header'] = lean_source.map(lambda src: src.partition('theorem')[0])
df['formal_statement'] = lean_source.map(
    lambda src: src[src.find('theorem'):] if 'theorem' in src else src
)

In [49]:
# Each `tags` value is a string such as "['algebra', 'number_theory']": strip the
# brackets and quotes, split on commas, and collect the distinct tag names.
all_tags = {
    tag.strip()
    for raw_tags in df.tags.unique()
    for tag in raw_tags.replace('[', '').replace(']', '').replace('\'', '').split(',')
}

print(all_tags)

{'set_theory', 'number_theory', 'combinatorics', 'abstract_algebra', 'probability', 'analysis', 'algebra', 'linear_algebra', 'geometry'}


In [50]:
wanted_tags = ['set_theory', 'probability', 'number_theory', 'combinatorics']

# Rows whose tag string mentions at least one wanted tag (regex alternation).
wanted_mask = df.tags.str.contains('|'.join(wanted_tags))
putnam_df = df[wanted_mask]

# Rows whose tag string mentions number theory.
nt_mask = df.tags.str.contains('number_theory')
putnam_NT_df = df[nt_mask]


In [51]:
# Shuffle the rows with a fixed seed so that the valid/test assignment — and
# therefore the exported files — come out the same on every fresh run.
SPLIT_SEED = 42
putnam_NT_df = putnam_NT_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# The first half of the shuffled rows (rounded up) is labelled 'valid', the rest 'test'.
n_valid = (len(putnam_NT_df) + 1) // 2
putnam_NT_df['split'] = ['valid'] * n_valid + ['test'] * (len(putnam_NT_df) - n_valid)

In [59]:
# Preview the shuffled number-theory subset together with its split labels.
putnam_NT_df.head()

Unnamed: 0,name,formal_statement,informal_statement,tags,header,split
0,putnam_1992_a3,theorem putnam_1992_a3\n(m : ℕ)\n(mpos : m > 0...,"For a given positive integer $m$, find all tri...","['algebra', 'number_theory']",abbrev putnam_1992_a3_solution : ℕ → Set (ℕ × ...,valid
1,putnam_2000_b2,"theorem putnam_2000_b2\n: (∀ m n : ℕ, m ≥ 1 → ...","Prove that the expression\n\[\n\frac{gcd(m,n)}...","['number_theory', 'algebra']",,valid
2,putnam_2014_a1,theorem putnam_2014_a1\n(f : ℝ → ℝ)\n(hf : ∀ x...,Prove that every nonzero coefficient of the Ta...,"['analysis', 'number_theory']",,valid
3,putnam_1976_a3,"theorem putnam_1976_a3\n: {(p, r, q, s) : ℕ × ...","Find all integer solutions $(p, r, q, s)$ of t...",['number_theory'],abbrev putnam_1976_a3_solution : Set (ℕ × ℕ × ...,valid
4,putnam_2002_b6,theorem putnam_2002_b6\n(p : ℕ)\n(hp : Nat.Pri...,Let $p$ be a prime number. Prove that the dete...,"['linear_algebra', 'number_theory', 'algebra']",,valid


#### Look at the data:

In [54]:
# Show the informal (natural-language) statement of the first row.
print(putnam_NT_df.iloc[0].informal_statement)

For a given positive integer $m$, find all triples $(n, x, y)$ of positive integers, with $n$ relatively prime to $m$, which satisfy
\[
(x^2 + y^2)^m = (xy)^n.
\]


In [55]:
# Show the formal statement of the first row.
print(putnam_NT_df.iloc[0].formal_statement)

theorem putnam_1992_a3
(m : ℕ)
(mpos : m > 0)
(S : Set (ℕ × ℕ × ℕ))
(hS : ∀ n x y : ℕ, (n, x, y) ∈ S ↔ n > 0 ∧ x > 0 ∧ y > 0 ∧ Coprime n m ∧ (x ^ 2 + y ^ 2) ^ m = (x * y) ^ n)
: (S = putnam_1992_a3_solution m) :=
sorry


In [65]:
# Tag value given to rows that are purely number theory. It is written as the same
# string form the Putnam `tags` column uses (a stringified list). Spelling it out
# avoids reading it from a fixed position of the shuffled frame, where a different
# shuffle could return a multi-tag value such as "['algebra', 'number_theory']".
NT_tag = "['number_theory']"
print(NT_tag)

['number_theory']


In [56]:
# Deliberate hard stop at the end of the PutnamBench section.
# NOTE(review): this cell always raises, so "Run All" will presumably halt here and
# never reach the MiniF2F, aggregation and export cells — remove or skip it to run
# the notebook end to end.
raise Exception('stop')

Exception: stop

### Mini F2F (136)

Keep only rows about `number_theory`

In [84]:
# Load the MiniF2F problems from the local JSON Lines file (rebinds `df`).
df = pd.read_json('minif2f.jsonl', lines=True)

In [85]:
# Prefix the names with the source dataset, attach the number-theory tag, drop the
# `goal` column and align the informal-statement column name with the Putnam frame.
df = (
    df.assign(name='f2f_' + df['name'], tags=NT_tag)
      .drop(columns=['goal'])
      .rename(columns={'informal_prefix': 'informal_statement'})
)

# Keep only the problems whose name marks them as number theory.
is_number_theory = df['name'].str.contains('numbertheory')
df = df[is_number_theory]

# Put the columns in the same order as the Putnam frame.
column_order = ['name', 'formal_statement', 'informal_statement', 'tags', 'header',
                'split']
df = df[column_order]

In [86]:
# Tag every remaining MiniF2F row as number theory.
# NOTE(review): `tags` was already set to NT_tag in the previous cell, before the
# rows were filtered, so this assignment looks redundant.
df['tags'] = NT_tag

In [87]:
# Preview the number-theory MiniF2F rows.
df.head()

Unnamed: 0,name,formal_statement,informal_statement,tags,header,split
5,f2f_mathd_numbertheory_780,theorem mathd_numbertheory_780 (m x : ℤ) (h₀ :...,/-- Suppose $m$ is a two-digit positive intege...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
7,f2f_mathd_numbertheory_13,theorem mathd_numbertheory_13 (u v : ℕ) (S : S...,/-- What is the average of the two smallest po...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
8,f2f_mathd_numbertheory_169,theorem mathd_numbertheory_169 : Nat.gcd 20! 2...,/-- What is the greatest common factor of $20 ...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
14,f2f_mathd_numbertheory_149,theorem mathd_numbertheory_149 :\n (∑ k in Fi...,"/-- A group of $N$ students, where $N < 50$, i...",['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
20,f2f_mathd_numbertheory_221,theorem mathd_numbertheory_221 (S : Finset ℕ)\...,/-- How many natural numbers less than 1000 ha...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid


#### Look at the data:

In [88]:
# Show the informal statement of the first MiniF2F row.
print(df.iloc[0].informal_statement)

/-- Suppose $m$ is a two-digit positive integer such that $6^{-1}\pmod m$ exists and $6^{-1}\equiv 6^2\pmod m$. What is $m$? Show that it is 43.-/



In [89]:
# Show the header stored with the first MiniF2F row.
print(df.iloc[0].header)

import Mathlib
import Aesop

set_option maxHeartbeats 0

open BigOperators Real Nat Topology Rat




In [90]:
# Show the formal statement of the first MiniF2F row.
print(df.iloc[0].formal_statement)

theorem mathd_numbertheory_780 (m x : ℤ) (h₀ : 0 ≤ x) (h₁ : 10 ≤ m ∧ m ≤ 99) (h₂ : 6 * x % m = 1)
  (h₃ : (x - 6 ^ 2) % m = 0) : m = 43 := by



## Aggregating the data (234)

In [97]:
# Print the columns of both frames to check that they match before concatenating.
print(df.columns)
print(putnam_NT_df.columns)

Index(['name', 'formal_statement', 'informal_statement', 'tags', 'header',
       'split'],
      dtype='object')
Index(['name', 'formal_statement', 'informal_statement', 'tags', 'header',
       'split'],
      dtype='object')


In [96]:
# Stack the MiniF2F rows and the Putnam number-theory rows into one frame,
# replacing the original row labels with a fresh 0..n-1 index.
final_df = pd.concat([df, putnam_NT_df], ignore_index=True)

In [100]:
# Preview the combined frame.
final_df.head()

Unnamed: 0,name,formal_statement,informal_statement,tags,header,split
0,f2f_mathd_numbertheory_780,theorem mathd_numbertheory_780 (m x : ℤ) (h₀ :...,/-- Suppose $m$ is a two-digit positive intege...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
1,f2f_mathd_numbertheory_13,theorem mathd_numbertheory_13 (u v : ℕ) (S : S...,/-- What is the average of the two smallest po...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
2,f2f_mathd_numbertheory_169,theorem mathd_numbertheory_169 : Nat.gcd 20! 2...,/-- What is the greatest common factor of $20 ...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
3,f2f_mathd_numbertheory_149,theorem mathd_numbertheory_149 :\n (∑ k in Fi...,"/-- A group of $N$ students, where $N < 50$, i...",['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid
4,f2f_mathd_numbertheory_221,theorem mathd_numbertheory_221 (S : Finset ℕ)\...,/-- How many natural numbers less than 1000 ha...,['number_theory'],import Mathlib\nimport Aesop\n\nset_option max...,valid


## Export

In [103]:
# Write each split to its own JSON Lines file (one record per line).
for split_name, out_path in (('test', 'NT_test.jsonl'), ('valid', 'NT_validation.jsonl')):
    split_rows = final_df[final_df['split'] == split_name]
    split_rows.to_json(out_path, orient='records', lines=True)