In [1]:
import pandas as pd

# Load the raw Titanic passenger table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/Titanic-Dataset.csv")

# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Print dtypes and non-null counts, then display the number of missing values per column.
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
import numpy as np

def detect_errors(df, fare_z_threshold=3):
    """Flag data-quality problems row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age', 'Cabin', 'Embarked', 'Sex',
        'Pclass' and 'Fare'.
    fare_z_threshold : float, default 3
        A fare is flagged as an outlier when the absolute value of its
        z-score is strictly greater than this threshold.

    Returns
    -------
    pandas.DataFrame
        Boolean frame sharing ``df``'s index, with one column per check:
        'missing_age', 'missing_cabin', 'missing_embarked', 'invalid_sex',
        'invalid_pclass' and 'fare_outlier'. ``True`` marks a problem.
    """
    errors = pd.DataFrame(index=df.index)

    # Missing values
    errors['missing_age'] = df['Age'].isnull()
    errors['missing_cabin'] = df['Cabin'].isnull()
    errors['missing_embarked'] = df['Embarked'].isnull()

    # Invalid 'Sex' values; a missing value is not in the allowed set,
    # so it is flagged as invalid too.
    errors['invalid_sex'] = ~df['Sex'].isin(['male', 'female'])

    # Invalid 'Pclass' values (should be 1, 2, or 3)
    errors['invalid_pclass'] = ~df['Pclass'].isin([1, 2, 3])

    # Outlier detection in Fare via z-score, using the column's own mean and
    # standard deviation. A NaN z-score (missing fare, or zero spread)
    # compares False and is therefore never flagged.
    fare = df['Fare']
    z_fare = (fare - fare.mean()) / fare.std()
    errors['fare_outlier'] = z_fare.abs() > fare_z_threshold

    return errors

In [4]:
# Run every check on the raw data; summing the boolean flags counts affected rows per check.
errors = detect_errors(df)
errors.sum()  


missing_age         177
missing_cabin       687
missing_embarked      2
invalid_sex           0
invalid_pclass        0
fare_outlier         20
dtype: int64

In [5]:
def repair_data(df, fare_cap_quantile=0.95):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Repairs applied:

    * 'Age'      - missing values take the median age of the passenger's
                   (Sex, Pclass) group; if that group has no observed age,
                   the overall median age is used instead.
    * 'Embarked' - missing values take the most frequent value.
    * 'Cabin'    - missing values become the string 'Unknown'.
    * 'Fare'     - values above the ``fare_cap_quantile`` quantile are
                   capped at that quantile.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Sex', 'Pclass', 'Age', 'Embarked', 'Cabin', 'Fare'.
    fare_cap_quantile : float, default 0.95
        Quantile of 'Fare' used as the upper cap.

    Returns
    -------
    pandas.DataFrame
        The repaired copy.
    """
    df_clean = df.copy()

    # Repair missing Age — use median grouped by Sex and Pclass.
    # Only missing entries are filled (fillna), so an observed age is kept
    # even when the row's Sex/Pclass key is itself missing and the row
    # therefore belongs to no group.
    observed_age = df_clean['Age']
    group_median_age = df_clean.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df_clean['Age'] = (
        observed_age
        .fillna(group_median_age)
        # Fallback for groups with no observed age at all.
        .fillna(observed_age.median())
    )

    # Fill missing Embarked with the most common value. mode() is empty when
    # the whole column is missing; in that case there is nothing to fill with.
    embarked_modes = df_clean['Embarked'].mode()
    if not embarked_modes.empty:
        df_clean['Embarked'] = df_clean['Embarked'].fillna(embarked_modes.iloc[0])

    # Handle Cabin — mark missing ones as 'Unknown'
    df_clean['Cabin'] = df_clean['Cabin'].fillna('Unknown')

    # Cap Fare outliers at the chosen percentile (95th by default)
    fare_cap = df_clean['Fare'].quantile(fare_cap_quantile)
    df_clean['Fare'] = np.where(df_clean['Fare'] > fare_cap, fare_cap, df_clean['Fare'])

    return df_clean

In [6]:
# Build the cleaned copy; `df` itself is left as loaded.
df_clean = repair_data(df)

# Check if issues were fixed: every column should now report zero missing values.
df_clean.isnull().sum()


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [7]:
# Summary statistics before vs after cleaning.
# The cell's value is a 2-tuple: (describe() of the raw frame, describe() of the cleaned frame).
df[['Age', 'Fare']].describe(), df_clean[['Age', 'Fare']].describe()


(              Age        Fare
 count  714.000000  891.000000
 mean    29.699118   32.204208
 std     14.526497   49.693429
 min      0.420000    0.000000
 25%     20.125000    7.910400
 50%     28.000000   14.454200
 75%     38.000000   31.000000
 max     80.000000  512.329200,
               Age        Fare
 count  891.000000  891.000000
 mean    29.112424   27.720486
 std     13.304424   29.225083
 min      0.420000    0.000000
 25%     21.500000    7.910400
 50%     26.000000   14.454200
 75%     36.000000   31.000000
 max     80.000000  112.079150)

In [9]:
import pandas as pd
import os

# Read the passenger table from disk
df = pd.read_csv("../data/Titanic-Dataset.csv")  # adjust path if needed

# Make sure the output directory is there before any file is written
os.makedirs("mln", exist_ok=True)

# ---------------------------
# 1. rules.mln – predicate declarations and weighted rules
# ---------------------------
mln_rules = """
// Predicate declarations
Sex(person, gender)
Pclass(person, class)
MissingAge(person)
HasAge(person, age)

// Soft rules (example weights, tweak as needed)
MissingAge(x) ^ Sex(x, female) ^ Pclass(x, 1) => HasAge(x, 35.0) ^0.9
MissingAge(x) ^ Sex(x, female) ^ Pclass(x, 3) => HasAge(x, 22.0) ^0.8
MissingAge(x) ^ Sex(x, male) ^ Pclass(x, 1) => HasAge(x, 40.0) ^0.85
MissingAge(x) ^ Sex(x, male) ^ Pclass(x, 3) => HasAge(x, 25.0) ^0.7
"""

# Surrounding blank lines of the template are stripped before writing.
with open("mln/rules.mln", "w") as rules_file:
    rules_file.write(mln_rules.strip())

# ---------------------------
# 2. facts.db – one block of facts per passenger
# ---------------------------
with open("mln/facts.db", "w") as facts_file:
    for idx, passenger in df.iterrows():
        person = f"p{idx}"  # passenger id is derived from the row's index label
        facts_file.write(f"Sex({person}, {passenger['Sex']})\n")
        facts_file.write(f"Pclass({person}, {int(passenger['Pclass'])})\n")
        # Each passenger gets exactly one age fact: the known age or a MissingAge marker.
        age_fact = (
            f"MissingAge({person})\n"
            if pd.isnull(passenger['Age'])
            else f"HasAge({person}, {passenger['Age']})\n"
        )
        facts_file.write(age_fact)

# ---------------------------
# 3. query.db – one HasAge query per passenger whose age is missing
# ---------------------------
with open("mln/query.db", "w") as query_file:
    # Select the index labels of rows with no recorded age, in file order.
    for idx in df.index[df['Age'].isnull()]:
        person = f"p{idx}"
        query_file.write(f"HasAge({person}, a)\n")
