#### Required Modules

In [9]:
import os
import pandas as pd
from scipy.io import arff

from typing import Tuple

#### Load Datasets

In [10]:
# Expect a `data` directory relative to the current working directory.
DATA_DIRECTORY = "./data/"

def load_from_arff(filename: str) -> pd.DataFrame:
    """
    Load an ARFF file from the data directory into a Pandas DataFrame.

    Args:
        filename: Name of the ARFF file, relative to `DATA_DIRECTORY`.
            An absolute path is used as-is.

    Returns:
        A DataFrame built from the parsed ARFF records. The metadata
        returned alongside the records by the loader is discarded.
    """
    # os.path.join keeps the path valid even if DATA_DIRECTORY is changed
    # to a value without a trailing separator.
    path = os.path.join(DATA_DIRECTORY, filename)
    data, _ = arff.loadarff(path)
    return pd.DataFrame(data)

#### Dataset Selection

| Dataset | Description | Issues | Selected |
|---------|-------------|--------|----------|
| Poisonous Mushrooms | Manually compiled features of foraged mushrooms. | No pre-processing required. | No |
| Airline Departure Delays | Time and location of flight departures. | No pre-processing required. | No |
| Mortgage Application Outcomes | Applicant information and application process results. | All data is numeric with no keys for categories. Some categorical data is type `float`. | No |
| Wine Reviews | Reviews and ratings of wines from a specified location. | ARFF has format issues in `str` column. | Yes |

#### Wine Review Data

The file was manually reformatted by converting the ARFF feature declarations into a CSV column-name (header) row.

In [None]:
WINE_ARFF = "wine.arff"
WINE_CSV = "wine.csv"

# The ARFF file is not loaded directly (format issues in its string column);
# the manually converted CSV is read instead.
# Fields are quoted with single quotes and use a backslash as escape character.
# NOTE(review): missing prices appear as "?" in the data; consider passing
# na_values="?" once downstream handling is confirmed.
df = pd.read_csv(
    filepath_or_buffer=DATA_DIRECTORY + WINE_CSV,
    engine="python",
    sep=",",
    quotechar="'",
    escapechar="\\",
)
df.head()

Unnamed: 0,country,description,points,price,province,variety
0,US,"This is a tight, black-fruited wine, with gene...",92,28.0,Washington,Cabernet Franc
1,US,A nice starter Pinot for the price. Will intro...,86,14.0,California,Pinot Noir
2,Spain,Raw aromas of rhubarb and scratchy raspberry o...,83,15.0,Northern Spain,Tempranillo
3,Germany,This wine comes out of the bottle a little cra...,90,18.0,Mosel-Saar-Ruwer,Riesling
4,South Africa,Winemaker: Jacques Borman. There's a ton of he...,90,?,Stellenbosch,Shiraz
