In [None]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import random
from sklearn.feature_extraction.text import CountVectorizer

: 

In [160]:
# Path of the JSON file that the next cell loads into `df`.
# NOTE(review): absolute path (looks like a Colab runtime location) — adjust
# when running in another environment.
file_path = "/content/folio_parsed.json"

In [161]:
# Load the JSON file into a DataFrame. The encoding is pinned to UTF-8 because
# the FOL strings contain non-ASCII logic symbols (∀, ∧, →, ...) that would be
# mis-decoded, or fail to decode, under a non-UTF-8 platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

df = pd.DataFrame(data)

In [162]:
# Remove the typographic apostrophe (’) from every FOL string. The predicate
# regex in transform_fol_statement only lists the ASCII apostrophe (') as a
# valid name character, so the curly variant interferes with matching.
# regex=False makes this a literal (non-regex) replacement.
df['FOL'] = df['FOL'].str.replace("’", "", regex=False)

In [163]:
# Converts a FOL statement to its logical skeleton,
# e.g. isHuman(x) ∧ isOld(x)  ->  P(O) ∧ P(O)
def transform_fol_statement(fol_statement):
    """Replace every predicate application with an abstract placeholder.

    Each ``Name(arg1, ..., argN)`` whose argument list contains no nested
    parentheses becomes ``P(O, ..., O)`` with the same number of arguments.
    Quantifiers, connectives and grouping parentheses are left untouched.

    Parameters
    ----------
    fol_statement : str
        A first-order-logic formula.

    Returns
    -------
    str
        The formula with predicate names and arguments abstracted away.

    Notes
    -----
    Substitution is done per regex match rather than by rebuilding the
    predicate text and calling ``str.replace``. The latter rewrote substrings
    of longer predicates (``Player(x)`` inside ``NBAPlayer(x)``) and silently
    skipped predicates whose spacing differed from the rebuilt string
    (``Eat(a,b)``, ``Foo (x)``).

    Only the innermost application of a nested term is matched, so
    ``P(f(x))`` becomes ``P(P(O))``.
    """
    # Name: a letter/underscore followed by word chars, hyphens, apostrophes
    # or whitespace. Arguments: anything up to the first parenthesis.
    predicate_pattern = re.compile(r"([a-zA-Z_][-'\w\s]*)\(([^()]+)\)")

    def _abstract(match):
        # Arity is the number of comma-separated arguments in the match.
        arity = len(match.group(2).split(','))
        return f"P({', '.join(['O'] * arity)})"

    return predicate_pattern.sub(_abstract, fol_statement)

In [164]:
# Store the abstracted skeleton of every FOL formula in a new column; this is
# the column that gets vectorized for clustering.
df['Transformed_FOL'] = df['FOL'].apply(transform_fol_statement)

In [174]:
# Split a formula into its structural tokens
def fol_tokenizer(text):
    """Return the logical tokens of ``text`` in order of appearance.

    A token is either a run of word characters or one of the single symbols
    ∀ ∃ ¬ ∧ ∨ → ↔ ⊕ ( ) and the comma. Anything else (whitespace, other
    punctuation) is dropped.
    """
    token_pattern = r'\w+|[∀∃¬∧∨→↔⊕(),]'
    return [hit.group(0) for hit in re.finditer(token_pattern, text)]

In [166]:
# Bag-of-tokens counts over the abstracted formulas, tokenized by
# fol_tokenizer. token_pattern=None because the custom tokenizer replaces the
# default pattern; leaving the default in place makes scikit-learn emit a
# "token_pattern will not be used" UserWarning.
vectorizer = CountVectorizer(tokenizer=fol_tokenizer, token_pattern=None)
X = vectorizer.fit_transform(df['Transformed_FOL'])



In [167]:
# Cluster the token-count vectors into 5 groups. random_state is fixed so the
# cluster assignment is reproducible after a kernel restart; without it the
# cluster ids (and any rows picked from them) change on every run.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

In [168]:
# Draw up to 4 example rows from each cluster (despite the "triplet" name).
# - The cluster count is read from the fitted model instead of repeating the
#   literal 5.
# - The sample size is capped at the cluster size, since DataFrame.sample
#   raises ValueError when n exceeds the number of rows.
# - random_state makes the drawn rows reproducible.
sampled_triplets = []
for cluster in range(kmeans.n_clusters):
    cluster_df = df[df['Cluster'] == cluster]
    sample_size = min(4, len(cluster_df))
    sampled_triplet = cluster_df.sample(n=sample_size, random_state=42)
    sampled_triplets.append(sampled_triplet)

In [183]:
sampled_triplets[0]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
487,Daniel’s dad and older sister both graduated f...,GraduatedFromStanfordUniversity(danielsOlderS...,P(O) ∧ P(O),0


In [170]:
sampled_triplets[1]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
1340,"If Stephen Curry is not an NBA player, then St...",¬NBAPlayer(stephencurry) → ¬(NBAPlayer(stephen...,¬P(O) → ¬(P(O) ⊕ P(O)),1


In [184]:
sampled_triplets[2]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
1467,All planes in Delta are of type Boeing-737.,∀x (Delta(x) → Boeing737(x)),∀x (P(O) → P(O)),2


In [185]:
sampled_triplets[3]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
1886,Harry is smarter than before.,Smarter(harry),P(O),3


In [186]:
sampled_triplets[4]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
1279,Benjamin only ate oranges and grapes at the pa...,"Eat(orange, benjamin) ∧ Eat(grape, benjamin) ∧...","P(O, O) ∧ P(O, O) ∧ ¬P(O, O) ∧ ¬P(O, O) ∧ ¬P(O...",4


In [182]:
# Row positions of the hand-picked few-shot examples; they match the index
# values shown in the per-cluster samples displayed above.
few_shots = [487, 1340, 1467, 1886, 1279]
for idx in few_shots:
    row = df.iloc[idx]
    # Select by column label: an integer key on a label-indexed Series
    # (row[0], row[1]) relies on positional fallback, which pandas 2.1
    # deprecates.
    print(row['NL'])
    print(row['FOL'])
    print()

Daniel’s dad and older sister both graduated from Stanford University.
 GraduatedFromStanfordUniversity(danielsOlderSister) ∧ GraduatedFromStanfordUniversity(danielsDad)

If Stephen Curry is not an NBA player, then Stephen Curry is not an NBA player or a soccer player.
¬NBAPlayer(stephencurry) → ¬(NBAPlayer(stephencurry) ⊕ SoccerPlayer(stephencurry))

All planes in Delta are of type Boeing-737. 
∀x (Delta(x) → Boeing737(x))

Harry is smarter than before.
Smarter(harry)

Benjamin only ate oranges and grapes at the party.
Eat(orange, benjamin) ∧ Eat(grape, benjamin) ∧ ¬Eat(blueberry, benjamin) ∧ ¬Eat(cherry, benjamin) ∧ ¬Eat(strawberry, benjamin)

