In [26]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import random
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Location of the parsed JSON dataset that the next cell loads.
# NOTE(review): hard-coded absolute path — presumably a hosted-notebook (e.g. Colab)
# location; adjust when running elsewhere.
file_path = "/content/folio_parsed.json"

In [None]:
# Read the parsed JSON dataset. The encoding is pinned to UTF-8 because the FOL
# strings contain non-ASCII logic symbols (∀, ∃, ∧, →, ...) and the platform's
# default text encoding is not guaranteed to decode them.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Tabular view of the loaded records; later cells read and extend the 'FOL' column.
df = pd.DataFrame(data)

In [None]:
# Remove every typographic apostrophe (’) from the FOL strings as a literal,
# non-regex replacement. The character interferes with the author's regex —
# presumably the predicate pattern in transform_fol_statement, whose character
# class allows the straight apostrophe (') but not the curly one.
df['FOL'] = df['FOL'].str.replace("’", "", regex=False)

In [29]:
# Matches one predicate application: a name (letters, digits, underscores,
# hyphens, apostrophes, embedded whitespace) followed by a parenthesised
# argument list that contains no nested parentheses.
_PREDICATE_PATTERN = re.compile(r"([a-zA-Z_][-'\w\s]*)\(([^()]+)\)")


# converts fol statement from isHuman(x) ∧ isOld(x) to P(O) ∧ P(O)
def transform_fol_statement(fol_statement):
    """Reduce a FOL formula to its logical skeleton.

    Every predicate application is rewritten as ``P(O, ..., O)`` with one ``O``
    per comma-separated argument. Quantifiers, connectives and grouping
    parentheses are left untouched.

    Args:
        fol_statement: The formula as a string.

    Returns:
        The formula with each matched predicate application replaced by its
        abstract ``P(...)`` form.
    """
    def _abstract(match):
        # Arity = number of comma-separated arguments inside the parentheses.
        arity = len(match.group(2).split(','))
        return f"P({', '.join(['O'] * arity)})"

    # Substitute each match where it was found, in a single pass. Rebuilding
    # the predicate text and calling str.replace on the whole formula would
    # (a) corrupt a longer name that ends with a shorter predicate
    #     ("GradStudent(x)" -> "GradP(O)" once "Student(x)" is replaced), and
    # (b) miss occurrences whose spacing differs from the rebuilt text,
    #     e.g. "F(x,y)" or "F (x)".
    return _PREDICATE_PATTERN.sub(_abstract, fol_statement)

In [None]:
# Store the abstracted form of each formula (as produced by
# transform_fol_statement) in a new column, keeping the original 'FOL' intact.
df['Transformed_FOL'] = df['FOL'].apply(transform_fol_statement)

In [None]:
# Display the frame to eyeball the FOL -> Transformed_FOL mapping.
df

Unnamed: 0,NL,FOL,Transformed_FOL
0,All people who regularly drink coffee are depe...,∀x (Drinks(x) → Dependent(x)),∀x (P(O) → P(O))
1,People either regularly drink coffee or joke a...,∀x (Drinks(x) ⊕ Jokes(x)),∀x (P(O) ⊕ P(O))
2,No one who jokes about being addicted to caffe...,∀x (Jokes(x) → ¬Unaware(x)),∀x (P(O) → ¬P(O))
3,Rina is either a student and unaware that caff...,(Student(rina) ∧ Unaware(rina)) ⊕ ¬(Student(ri...,(P(O) ∧ P(O)) ⊕ ¬(P(O) ∨ P(O))
4,If Rina is not a person dependent on caffeine ...,¬(Dependent(rina) ∧ Student(rina)) → (Dependen...,¬(P(O) ∧ P(O)) → (P(O) ∧ P(O)) ⊕ ¬(P(O) ∨ P(O))
...,...,...,...
2190,No one playing for Nautico is Brazilian.,"∀x (PlaysFor(x, nautico) → ¬Brazilian(x))","∀x (P(O, O) → ¬P(O))"
2191,Ailton Silva foes not play for a football club.,"∀x (FootballClub(x) → ¬PlaysFor(ailtonsilva, x))","∀x (P(O) → ¬P(O, O))"
2192,Ailton was not loaned out to a football club.,"∀x (FootballClub(x) → ¬LoanedTo(ailton, x))","∀x (P(O) → ¬P(O, O))"
2193,Ailton Silva played for Fluminense.,"PlaysFor(ailtonsilva, fluminense)","P(O, O)"


In [None]:
# Split a formula into word tokens and single-character logic/punctuation tokens
def fol_tokenizer(text):
    """Return the tokens of ``text`` in order of appearance.

    A token is either a run of word characters or exactly one of the symbols
    ∀ ∃ ¬ ∧ ∨ → ↔ ⊕ ( ) and the comma. Anything else (whitespace, other
    punctuation) is dropped.
    """
    return [hit.group(0) for hit in re.finditer(r'\w+|[∀∃¬∧∨→↔⊕(),]', text)]

In [None]:
# Bag-of-tokens counts over the abstracted formulas.
# token_pattern=None states explicitly that fol_tokenizer is the only
# tokenisation rule; leaving the default pattern in place alongside a custom
# tokenizer makes scikit-learn warn that the pattern will not be used.
# Note: CountVectorizer lowercases its input by default, so fol_tokenizer
# receives lowercased text.
vectorizer = CountVectorizer(tokenizer=fol_tokenizer, token_pattern=None)
X = vectorizer.fit_transform(df['Transformed_FOL'])



In [None]:
# Cluster the formulas on their token counts. random_state is pinned so the
# cluster ids are the same on every run: later cells select clusters by number
# (e.g. df['Cluster'] == 1), which is only meaningful if the labels are stable.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

In [None]:
# Draw one row per cluster; sampled_triplets[i] is a one-row DataFrame taken
# from cluster i.
# - The number of clusters is read from the fitted model rather than repeating
#   the literal used when the model was created.
# - random_state fixes which row is drawn for a given clustering.
sampled_triplets = [
    df[df['Cluster'] == cluster].sample(n=1, random_state=42)
    for cluster in range(kmeans.n_clusters)
]

In [None]:
# Inspect the row sampled for cluster 0.
sampled_triplets[0]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
8707,An architecturally interesting building featur...,∀x (Building(x) ∧ UniqueDesignElements(x) ∧ Vi...,∀x (P(O) ∧ P(O) ∧ P(O) ∧ P(O) → P(O)),0


In [None]:
# Inspect the row sampled for cluster 1.
sampled_triplets[1]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
19026,A successful marketing campaign reaches a wide...,∀x∀y∀z∀w (MarketingCampaign(x) ∧ Audience(y) ∧...,∀x∀y∀z∀w (P(O) ∧ P(O) ∧ P(O) ∧ P(O) ∧ P(O) ∧ P...,1


In [None]:
# Inspect the row sampled for cluster 2.
sampled_triplets[2]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
1366,"An activity is either indoor or outdoor, and n...",∀x (Activity(x) → (Indoor(x) ⊕ Outdoor(x))),∀x (P(O) → (P(O) ⊕ P(O))),2


In [None]:
# Inspect the row sampled for cluster 3.
sampled_triplets[3]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
24118,"A coat provides warmth, an umbrella protects f...",∀x ∀y ∀z (Coat(x) → ProvidesWarmth(x)) ∧ (Umbr...,∀x ∀y ∀z (P(O) → P(O)) ∧ (P(O) → P(O)) ∧ (P(O)...,3


In [None]:
# Inspect the row sampled for cluster 4.
sampled_triplets[4]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
24831,An engine x powers a vehicle y if x converts e...,∀x ∀y (Engine(x) ∧ Vehicle(y) ∧ ConvertsEnergy...,"∀x ∀y (P(O) ∧ P(O) ∧ P(O) ∧ P(O, O) → P(O, O))",4


In [None]:
# Inspect the row sampled for cluster 5.
sampled_triplets[5]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
16017,A telescope detects a celestial object if it h...,∀x (Telescope(x) ∧ PowerfulLens(x) ∧ ClearSkyC...,∀x (P(O) ∧ P(O) ∧ P(O) → P(O)),5


In [None]:
# Inspect the row sampled for cluster 6.
sampled_triplets[6]

Unnamed: 0,NL,FOL,Transformed_FOL,Cluster
14788,A game is addictive if it has challenging leve...,∀x (Game(x) ∧ ((ChallengingLevels(x) ∧ Impress...,∀x (P(O) ∧ ((P(O) ∧ P(O)) ∨ (P(O) ∧ P(O))) → P...,6


In [None]:
# Print the natural-language sentence and its FOL translation for the
# hand-picked few-shot rows (row positions in df).
few_shots = [8707, 19026, 1366, 24118, 24831, 16017, 14788]
for idx in few_shots:
  row = df.iloc[idx]
  # Select by column name: integer keys on a label-indexed Series (row[0]) rely
  # on a positional fallback that pandas has deprecated.
  print(row['NL'])
  print(row['FOL'])
  print()

An architecturally interesting building features unique design elements, visually appealing materials, and innovative construction techniques.
∀x (Building(x) ∧ UniqueDesignElements(x) ∧ VisuallyAppealingMaterials(x) ∧ InnovativeConstructionTechniques(x) → ArchitecturallyInteresting(x))

A successful marketing campaign reaches a wide audience, generates brand awareness, and drives customer engagement.
∀x∀y∀z∀w (MarketingCampaign(x) ∧ Audience(y) ∧ BrandAwareness(z) ∧ CustomerEngagement(w) ∧ Wide(y) ∧ Generates(x, z) ∧ Drives(x, w) → Successful(x))

An activity is either indoor or outdoor, and not both.
∀x (Activity(x) → (Indoor(x) ⊕ Outdoor(x)))

A coat provides warmth, an umbrella protects from rain, and sunglasses shield from sunlight.
∀x ∀y ∀z (Coat(x) → ProvidesWarmth(x)) ∧ (Umbrella(y) → ProtectsFromRain(y)) ∧ (Sunglasses(z) → ShieldsFromSunlight(z))

An engine x powers a vehicle y if x converts energy into mechanical force that propels y.
∀x ∀y (Engine(x) ∧ Vehicle(y) ∧ ConvertsE

Advantages: Does a good job at identifying diversity of logical operators, number of predicates, and number of arguments.

Limitations: Does not do a great job of capturing diversity between universally and existentially quantified statements, or statements that have neither quantifier. That diversity is added manually with the code below.

In [None]:
# Manually pick one existentially quantified formula from cluster 1.
# .iloc[0] on the one-row sample yields the row itself; routing the sampled
# index label through df.iloc would only be right while labels equal positions.
# random_state fixes which matching row is drawn.
existential_example_of_cluster_1 = (
    df[df['FOL'].str.contains("∃") & (df['Cluster'] == 1)]
    .sample(n=1, random_state=42)
    .iloc[0]
)
# Access by column name rather than by deprecated integer keys on the row.
print(existential_example_of_cluster_1['NL'])
print(existential_example_of_cluster_1['FOL'])


A rainbow appears when sunlight is refracted, reflected, and dispersed by water droplets in the atmosphere.
∀x (Rainbow(x) ↔ ∃y ∃z (Sunlight(y) ∧ WaterDroplets(z) ∧ Refracted(y, z) ∧ Reflected(y, z) ∧ Dispersed(y, z)))


In [None]:
# Manually pick one formula from cluster 2 that contains no quantifier
# (neither ∃ nor ∀; the pattern is a regex character class).
# .iloc[0] on the one-row sample yields the row itself; routing the sampled
# index label through df.iloc would only be right while labels equal positions.
# random_state fixes which matching row is drawn.
no_variable_object_example_cluster_2 = (
    df[~df['FOL'].str.contains('[∃∀]') & (df['Cluster'] == 2)]
    .sample(n=1, random_state=42)
    .iloc[0]
)
# Access by column name rather than by deprecated integer keys on the row.
print(no_variable_object_example_cluster_2['NL'])
print(no_variable_object_example_cluster_2['FOL'])

A cake is ready to be served if it's cooled and the frosting is applied.
ReadyToServe(cake) ↔ (Cooled(cake) ∧ FrostingApplied(cake))


In [None]:
# Manually pick one existentially quantified formula from cluster 4.
# .iloc[0] on the one-row sample yields the row itself; routing the sampled
# index label through df.iloc would only be right while labels equal positions.
# random_state fixes which matching row is drawn.
existential_example_of_cluster_4 = (
    df[df['FOL'].str.contains("∃") & (df['Cluster'] == 4)]
    .sample(n=1, random_state=42)
    .iloc[0]
)
# Access by column name rather than by deprecated integer keys on the row.
print(existential_example_of_cluster_4['NL'])
print(existential_example_of_cluster_4['FOL'])

Hospitals treat patients with medical professionals and appropriate equipment.
∀x∀y (Hospital(x) ∧ Patient(y) → ∃z∃w (MedicalProfessional(z) ∧ Equipment(w) ∧ TreatsWith(x, y, z, w)))


In [None]:
# Manually pick one formula from cluster 5 that contains no quantifier
# (neither ∃ nor ∀; the pattern is a regex character class).
# .iloc[0] on the one-row sample yields the row itself; routing the sampled
# index label through df.iloc would only be right while labels equal positions.
# random_state fixes which matching row is drawn.
no_variable_object_example_cluster_5 = (
    df[~df['FOL'].str.contains('[∃∀]') & (df['Cluster'] == 5)]
    .sample(n=1, random_state=42)
    .iloc[0]
)
# Access by column name rather than by deprecated integer keys on the row.
print(no_variable_object_example_cluster_5['NL'])
print(no_variable_object_example_cluster_5['FOL'])

A dessert is delicious if it's sweet but not overly sweet and has a pleasant texture.
DeliciousDessert(x) ↔ (Sweet(x) ∧ ¬OverlySweet(x) ∧ PleasantTexture(x))
