In [47]:
import numpy as np
import csv

class Dataset:
    """In-memory tabular dataset: a feature matrix ``X`` plus one label vector ``y``.

    The constructor and setters store references to whatever the caller passes;
    nothing is copied or validated. ``replace_missing_values`` therefore writes
    into the caller's array.
    """

    def __init__(self, X=None, y=None, feature_names=None, label_name=None):
        """Store the given data and names as-is.

        Args:
            X: feature matrix, indexed as ``X[row, column]`` by the methods below.
            y: label values, one per row of ``X``.
            feature_names: sequence of column names for ``X``.
            label_name: name of the label column.
        """
        self.X = X
        self.y = y
        self.feature_names = feature_names
        self.label_name = label_name
        # Not read or written by any method of this class.
        self.discrete_values = {}

    def get_X(self):
        """Return the stored feature matrix (the same object, not a copy)."""
        return self.X

    def get_y(self):
        """Return the stored label vector (the same object, not a copy)."""
        return self.y

    def get_feature_names(self):
        """Return the stored feature names."""
        return self.feature_names

    def get_label_name(self):
        """Return the stored label column name."""
        return self.label_name

    def set_X(self, X):
        """Replace the feature matrix."""
        self.X = X

    def set_y(self, y):
        """Replace the label vector."""
        self.y = y

    def set_feature_names(self, feature_names):
        """Replace the feature names."""
        self.feature_names = feature_names

    def set_label_name(self, label_name):
        """Replace the label column name."""
        self.label_name = label_name

    def _features_as_float(self):
        """Return ``X`` as a float64 ndarray (no copy if it already is one)."""
        return np.asarray(self.X, dtype=np.float64)

    def read_csv(self, filepath, delimiter=',', has_header=True):
        """Load features and labels from a delimited text file.

        The last column is the label; every other column is a feature. Blank
        rows are skipped. Attributes are only updated once the whole file has
        been parsed, so a failed read leaves the object unchanged.

        Args:
            filepath: path of the file to read.
            delimiter: field separator.
            has_header: when True the first row supplies ``feature_names`` and
                ``label_name``; when False those attributes are left untouched.

        Raises:
            ValueError: if the header or all data rows are missing, or a field
                cannot be parsed as a float.
        """
        # newline='' is what the csv module requires so that it can do its own
        # line-ending handling.
        with open(filepath, 'r', newline='') as file:
            reader = csv.reader(file, delimiter=delimiter)
            header = None
            if has_header:
                header = next(reader, None)
                if not header:
                    raise ValueError(f"{filepath!r} has no header row")
            # csv.reader yields [] for blank lines; they would make the array ragged.
            rows = [row for row in reader if row]
        if not rows:
            raise ValueError(f"{filepath!r} contains no data rows")

        data = np.array(rows, dtype=object)
        features = data[:, :-1].astype(np.float64)
        # write_csv stacks y next to a float X, so labels are stored as floats
        # (e.g. "0.0"); parse as float first, then round back to integers.
        labels = np.round(data[:, -1].astype(np.float64)).astype(np.int64)

        if header is not None:
            self.feature_names = header[:-1]
            self.label_name = header[-1]
        self.X = features
        self.y = labels

    def write_csv(self, filepath, delimiter=','):
        """Write ``X`` and ``y`` (as the last column) to a delimited text file.

        A header row is written only when both ``feature_names`` and
        ``label_name`` are set.

        Args:
            filepath: path of the file to create or overwrite.
            delimiter: field separator.
        """
        data = np.column_stack((self.X, self.y))
        # Without newline='' the writer's "\r\n" terminator becomes "\r\r\n" on
        # Windows, which reads back as a blank row after every record.
        with open(filepath, 'w', newline='') as file:
            writer = csv.writer(file, delimiter=delimiter)
            if self.feature_names is not None and self.label_name is not None:
                # list(...) so tuples and arrays of names work, not only lists.
                writer.writerow(list(self.feature_names) + [self.label_name])
            writer.writerows(data)

    def describe(self):
        """Return per-column ``(means, stds, min_values, max_values)`` of ``X``.

        Plain (not NaN-aware) reductions are used, so a column that contains
        NaN yields NaN for each statistic.
        """
        features = self._features_as_float()
        means = np.mean(features, axis=0)
        stds = np.std(features, axis=0)
        min_values = np.min(features, axis=0)
        max_values = np.max(features, axis=0)

        return means, stds, min_values, max_values

    def find_missing_values(self):
        """Return a boolean array shaped like ``X``: True where the entry is NaN."""
        return np.isnan(self._features_as_float())

    def count_missing_values(self):
        """Return the number of NaN entries in each column of ``X``."""
        return np.sum(self.find_missing_values(), axis=0)

    def replace_missing_values(self, constant='mean'):
        """Fill NaN entries of ``X`` in place, column by column.

        Args:
            constant: ``'mean'`` or ``'median'`` to use that NaN-ignoring
                statistic of each column; any other value is written verbatim.
        """
        missing = self.find_missing_values()
        features = self._features_as_float()
        # Only visit columns that actually have something to fill.
        for i in np.flatnonzero(missing.any(axis=0)):
            if constant == 'mean':
                replacement = np.nanmean(features[:, i])
            elif constant == 'median':
                replacement = np.nanmedian(features[:, i])
            else:
                replacement = constant
            self.X[missing[:, i], i] = replacement

In [48]:
from sklearn import datasets

# Demo: wrap the scikit-learn iris data in a Dataset, summarise it, and
# round-trip it through a CSV file in the current working directory.
iris = datasets.load_iris()

# The iris bunch supplies the feature names; the label column is named here.
feature_names = iris.feature_names
label_name = "species"

dataset = Dataset(
    X=iris.data,
    y=iris.target,
    feature_names=feature_names,
    label_name=label_name,
)

# Per-column summary statistics of the in-memory data.
means, stds, min_values, max_values = dataset.describe()
print("Means:", means)
print("Standard deviations:", stds)
print("Minimum values:", min_values)
print("Maximum values:", max_values)

# NaN mask (same shape as X) and the per-column NaN totals.
missing_values = dataset.find_missing_values()
missing_count = dataset.count_missing_values()
print("Missing values:", missing_values)
print("Missing values count:", missing_count)

# Round trip: write the dataset out, then load it into a second, empty Dataset.
dataset.write_csv("iris.csv")
dataset_from_csv = Dataset()
dataset_from_csv.read_csv("iris.csv", delimiter=',', has_header=True)

# The statistics of the reloaded data should match the ones printed above.
means, stds, min_values, max_values = dataset_from_csv.describe()
print("Means (from CSV):", means)
print("Standard deviations (from CSV):", stds)
print("Minimum values (from CSV):", min_values)
print("Maximum values (from CSV):", max_values)

Means: [5.84333333 3.05733333 3.758      1.19933333]
Standard deviations: [0.82530129 0.43441097 1.75940407 0.75969263]
Minimum values: [4.3 2.  1.  0.1]
Maximum values: [7.9 4.4 6.9 2.5]
Missing values: [[False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False False False]
 [False False

In [46]:
import unittest
import numpy as np
from sklearn import datasets
import os
import tempfile

class TestDataset(unittest.TestCase):
    """Unit tests for ``Dataset``, using the scikit-learn iris data as the main fixture."""

    def setUp(self):
        """Build a fresh iris-backed Dataset before every test."""
        iris = datasets.load_iris()
        self.dataset = Dataset(X=iris.data, y=iris.target, feature_names=iris.feature_names, label_name="species")

    def _temp_csv_path(self):
        """Create an empty temp file and return its path.

        Removal is registered with ``addCleanup`` so the file is deleted even
        when an assertion in the test fails.
        """
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
            temp_filepath = temp_file.name
        self.addCleanup(os.remove, temp_filepath)
        return temp_filepath

    def _assert_same_dataset(self, expected, actual):
        """Assert that two Dataset objects hold equal data and names."""
        self.assertTrue(np.array_equal(expected.get_X(), actual.get_X()))
        self.assertTrue(np.array_equal(expected.get_y(), actual.get_y()))
        self.assertEqual(expected.get_feature_names(), actual.get_feature_names())
        self.assertEqual(expected.get_label_name(), actual.get_label_name())

    @staticmethod
    def _dataset_with_missing_values():
        """Return a new 3x3 Dataset with one NaN in column 1 and one in column 2."""
        X = np.array([[1, np.nan, 3], [4, 5, np.nan], [7, 8, 9]])
        y = np.array([0, 1, 2])
        return Dataset(X=X, y=y, feature_names=['a', 'b', 'c'], label_name='d')

    def test_getters_and_setters(self):
        """Getters return what the constructor stored and what setters assign."""
        self.assertIsNotNone(self.dataset.get_X())
        self.assertIsNotNone(self.dataset.get_y())
        self.assertIsNotNone(self.dataset.get_feature_names())
        self.assertIsNotNone(self.dataset.get_label_name())

        self.dataset.set_X(np.array([[1, 2], [3, 4]]))
        self.dataset.set_y(np.array([1, 0]))
        self.dataset.set_feature_names(['a', 'b'])
        self.dataset.set_label_name('c')

        self.assertTrue(np.array_equal(self.dataset.get_X(), np.array([[1, 2], [3, 4]])))
        self.assertTrue(np.array_equal(self.dataset.get_y(), np.array([1, 0])))
        self.assertEqual(self.dataset.get_feature_names(), ['a', 'b'])
        self.assertEqual(self.dataset.get_label_name(), 'c')

    def test_read_write_csv(self):
        """A dataset written with the default delimiter reads back unchanged."""
        # A temp file is used so the working directory is not littered.
        temp_filepath = self._temp_csv_path()
        self.dataset.write_csv(temp_filepath)

        dataset_from_csv = Dataset()
        dataset_from_csv.read_csv(temp_filepath, delimiter=',', has_header=True)

        self._assert_same_dataset(self.dataset, dataset_from_csv)

    def test_write_csv(self):
        """A custom delimiter is honoured: header row is correct and the data round-trips."""
        temp_filepath = self._temp_csv_path()
        self.dataset.write_csv(temp_filepath, delimiter=';')

        # The iris column names contain no ';' or quotes, so a plain split is enough.
        with open(temp_filepath, 'r', newline='') as written:
            header = written.readline().strip().split(';')
        self.assertEqual(header, list(self.dataset.get_feature_names()) + ["species"])

        dataset_from_csv = Dataset()
        dataset_from_csv.read_csv(temp_filepath, delimiter=';', has_header=True)

        self._assert_same_dataset(self.dataset, dataset_from_csv)

    def test_describe(self):
        """describe() returns the per-column mean, std, min and max of X."""
        means, stds, min_values, max_values = self.dataset.describe()
        X = self.dataset.get_X()

        np.testing.assert_allclose(means, X.mean(axis=0))
        np.testing.assert_allclose(stds, X.std(axis=0))
        np.testing.assert_allclose(min_values, X.min(axis=0))
        np.testing.assert_allclose(max_values, X.max(axis=0))

    def test_missing_values(self):
        """On complete data the mask and counts have the right shape and are all clear."""
        missing_values = self.dataset.find_missing_values()
        self.assertEqual(missing_values.shape, self.dataset.get_X().shape)
        self.assertFalse(np.any(missing_values))

        missing_count = self.dataset.count_missing_values()
        self.assertEqual(missing_count.shape, (self.dataset.get_X().shape[1],))
        self.assertTrue(np.array_equal(missing_count, np.zeros(self.dataset.get_X().shape[1])))

    def test_replace_missing_values(self):
        """NaNs are filled with the column mean, the column median, or a given constant."""
        # Columns 1 and 2 each have two known values: (5, 8) and (3, 9).
        # With two values the mean and the median coincide.
        expected_by_strategy = {
            'mean': np.array([[1, 6.5, 3], [4, 5, 6], [7, 8, 9]]),
            'median': np.array([[1, 6.5, 3], [4, 5, 6], [7, 8, 9]]),
            0: np.array([[1, 0, 3], [4, 5, 0], [7, 8, 9]]),
        }
        for strategy, expected in expected_by_strategy.items():
            with self.subTest(constant=strategy):
                # Fresh data each time: replacement happens in place.
                dataset_with_missing_values = self._dataset_with_missing_values()
                dataset_with_missing_values.replace_missing_values(constant=strategy)

                self.assertFalse(np.any(dataset_with_missing_values.find_missing_values()))
                np.testing.assert_allclose(dataset_with_missing_values.get_X(), expected)

    def test_find_missing_values(self):
        """The NaN mask marks exactly the missing entries."""
        missing_values = self._dataset_with_missing_values().find_missing_values()

        expected_missing_values = np.array([[False, True, False], [False, False, True], [False, False, False]])
        self.assertTrue(np.array_equal(missing_values, expected_missing_values))

    def test_count_missing_values(self):
        """Per-column NaN counts match the fixture."""
        missing_count = self._dataset_with_missing_values().count_missing_values()

        expected_missing_count = np.array([0, 1, 1])
        self.assertTrue(np.array_equal(missing_count, expected_missing_count))

# Run the tests in-process when this cell is executed.
if __name__ == '__main__':
    # An explicit argv stops unittest from parsing the kernel's own command
    # line (element 0 is only a program-name placeholder), and exit=False
    # stops it from calling sys.exit() once the run finishes.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

........
----------------------------------------------------------------------
Ran 8 tests in 0.028s

OK
