# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


# Data Loader Class Definition

In [2]:
class DataLoader:
    """
    Load a delimited text dataset into pandas and expose quick inspections.

    The frame produced by the most recent successful ``load_data`` call is
    kept on ``self.df``; it is ``None`` before the first load and after a
    failed one. Every inspection method prints "Data not loaded yet." and
    returns ``None`` when no frame is available.
    """

    def __init__(self, filepath, sep='|', parse_dates=None):
        """
        filepath: path to the dataset file
        sep: field separator passed to ``pd.read_csv`` (default '|')
        parse_dates: columns to parse as dates, passed to ``pd.read_csv``
        """
        self.filepath = filepath
        self.sep = sep
        self.parse_dates = parse_dates
        self.df = None

    def _is_loaded(self):
        """Return True if a frame is available; otherwise print a notice and return False."""
        if self.df is None:
            print("Data not loaded yet.")
            return False
        return True

    def load_data(self):
        """
        Read the file into ``self.df`` and return it.

        Errors are reported by printing, not raising. After a failed read both
        ``self.df`` and the return value are ``None``.
        """
        # Drop any previously loaded frame first, so a failed reload cannot
        # hand back stale data as though it had just been read.
        self.df = None
        try:
            self.df = pd.read_csv(self.filepath, sep=self.sep, parse_dates=self.parse_dates, low_memory=False)
            print(f"Data loaded: {self.df.shape[0]} rows × {self.df.shape[1]} columns.")
        except Exception as e:
            print(f"Error loading data: {e}")
        return self.df

    def get_shape(self):
        """Return tuple (n_rows, n_cols) or None if not loaded."""
        if not self._is_loaded():
            return None
        return self.df.shape

    def get_dtypes(self):
        """Return data types of columns or None if not loaded."""
        if not self._is_loaded():
            return None
        return self.df.dtypes

    def head(self, n=5):
        """Return first n rows or None if not loaded."""
        if not self._is_loaded():
            return None
        return self.df.head(n)

    def missing_summary(self):
        """Return missing counts per column sorted descending, or None if not loaded."""
        if not self._is_loaded():
            return None
        return self.df.isna().sum().sort_values(ascending=False)

    def describe_numeric(self):
        """
        Return descriptive stats for numeric columns only, or None if not loaded.

        The result has one row per numeric column and one column per statistic.
        If the frame has no numeric columns, an empty frame with the usual
        statistic columns is returned.
        """
        if not self._is_loaded():
            return None
        # Select numeric columns explicitly: a bare describe() also summarises
        # datetime columns on recent pandas, and falls back to non-numeric
        # columns when no numeric ones exist.
        numeric = self.df.select_dtypes(include="number")
        if numeric.shape[1] == 0:
            return pd.DataFrame(columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"])
        return numeric.describe().T


# Load Data

In [3]:
filepath = '../data/raw/MachineLearningRating_v3.txt'
loader = DataLoader(filepath, sep='|', parse_dates=['TransactionMonth'])
df = loader.load_data()

# load_data() reports read errors by printing and returning None, so stop here
# with a clear message instead of failing further down on `None.head(...)`.
if df is None:
    raise RuntimeError(f"Could not load {filepath!r}; see the error printed above.")

# Basic inspections
print("Shape:", loader.get_shape())
print("\nColumn dtypes:\n", loader.get_dtypes())
print("\nFirst rows:\n", loader.head(5))
print("\nMissing values (top 10):\n", loader.missing_summary().head(10))
print("\nNumeric summary:\n", loader.describe_numeric().head(10))


Data loaded: 1000098 rows × 52 columns.
Shape: (1000098, 52)

Column dtypes:
 UnderwrittenCoverID                  int64
PolicyID                             int64
TransactionMonth            datetime64[ns]
IsVATRegistered                       bool
Citizenship                         object
LegalType                           object
Title                               object
Language                            object
Bank                                object
AccountType                         object
MaritalStatus                       object
Gender                              object
Country                             object
Province                            object
PostalCode                           int64
MainCrestaZone                      object
SubCrestaZone                       object
ItemType                            object
mmcode                             float64
VehicleType                         object
RegistrationYear                     int64
make               