# Chapter 01 - Load Data From CSV

In [1]:
# The reader() function is part of the CSV module
from csv import reader 

## Functions to load CSV files

#### First version of the load_csv() function. Bug: empty lines are loaded as empty rows

In [2]:
# Load a CSV file
def load_csv_v01(filename):
    """Load a CSV file into a list of rows (first, deliberately naive version).

    Every line of the file becomes one entry of the result, so a blank line
    in the file shows up as an empty list ``[]``. That is the known bug of
    this version; ``load_csv_v02`` skips such rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of lists. The outer list holds the rows of the file and each
        inner list holds the column values of that row, as strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if parsing raises.
    # newline="" is what the csv module asks for, so that the reader can
    # handle line endings (including ones inside quoted fields) itself.
    with open(filename, "r", newline="") as file:
        # reader() yields one list of strings per line; list() drains it
        # while the file is still open.
        return list(reader(file))

#### Test version 01

In [3]:
# Load the diabetes dataset with the first version of the loader
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv_v01(filename)
# Report the number of rows and the number of columns (taken from the first row)
print('Loaded data file {0} with {1} rows and {2} columns.'
      .format(filename, len(dataset), len(dataset[0])))

Loaded data file ./data/pima-indians-diabetes.csv with 768 rows and 9 columns.


#### Second version of the load_csv() function. Bug fixed: empty lines are skipped

In [4]:
# Load a CSV file
def load_csv_v02(filename):
    """Load a CSV file into a list of rows, skipping empty lines.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of lists. The outer list holds the non-empty rows of the file
        and each inner list holds the column values of that row, as strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "with" closes the file when the block ends, like try...finally.
    # newline='' lets the csv reader interpret line endings itself, so a
    # line break inside a quoted field is kept exactly as written.
    with open(filename, 'r', newline='') as file:
        # reader() yields [] for a blank line; an empty list is falsy, so
        # the filter drops exactly those rows.
        return [row for row in reader(file) if row]

#### Test version 02

In [5]:
# Load the diabetes dataset with the second version of the loader
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv_v02(filename)
# Report the number of rows and the number of columns (taken from the first row)
print('Loaded data file {0} with {1} rows and {2} columns.'
      .format(filename, len(dataset), len(dataset[0])))

Loaded data file ./data/pima-indians-diabetes.csv with 768 rows and 9 columns.


### Convert strings to float

* Machine learning algorithms prefer to work with floating point numbers.

In [6]:
# Show the first row: the loader returns every value as a string
print(dataset[0])

['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']


In [7]:
def str_column_to_float(dataset, column):
    """Convert one column of the dataset from strings to floats, in place.

    Args:
        dataset: List of rows, each row being a list of values.
        column: Index of the column to convert.

    Returns:
        None. The rows of ``dataset`` are modified directly.
    """
    for record in dataset:
        text = record[column]
        # strip() removes any surrounding whitespace before the conversion
        record[column] = float(text.strip())

In [8]:
# Reload the dataset so that every value is a string again
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv_v02(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'
      .format(filename, len(dataset), len(dataset[0])))

# First row before the conversion
print(dataset[0])

# Testing the str_column_to_float() function:
# convert every column, using the first row to get the column count
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# First row after the conversion
print(dataset[0])

Loaded data file ./data/pima-indians-diabetes.csv with 768 rows and 9 columns.
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


### Convert string to integer

In [9]:
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Each distinct value gets its own integer, numbered from 0 in order of
    first appearance in the dataset. Numbering by first appearance keeps the
    mapping identical from run to run; iterating over a set of strings does
    not, because string hashing is randomized per interpreter process.

    Args:
        dataset: List of rows, each row being a list of values.
        column: Index of the column to encode (typically the class label).

    Returns:
        A dict mapping every original value to the integer that replaced it.
    """
    # First pass: build the value -> code table without touching the rows,
    # so an error while reading the column leaves the dataset unmodified.
    lookup = dict()
    for row in dataset:
        # setdefault() only inserts unseen values; len(lookup) is then the
        # next free code.
        lookup.setdefault(row[column], len(lookup))
    # Second pass: swap every value for its integer code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [10]:
# Load the iris dataset
filename = './data/iris.csv'
dataset = load_csv_v02(filename)
print('Loaded data file {0} with {1} rows and {2} columns.'
      .format(filename, len(dataset), len(dataset[0])))

# First row before the conversions
print(dataset[0])

# Convert the first four columns (the numeric measurements) to float
for i in range(4):
    str_column_to_float(dataset, i)

# Testing the str_column_to_int() function on column 4 (the class label)
lookup = str_column_to_int(dataset, 4)

# First row after the conversions, followed by the label -> int mapping
print(dataset[0])
print(lookup)

Loaded data file ./data/iris.csv with 150 rows and 5 columns.
['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 0]
{'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


### Future work

* Detect and remove empty lines at the top or bottom of the file.
* Detect and handle missing values in a column.
* Detect and handle rows that do not match expectations for the rest of the file.
* Support for other delimiters such as pipe (|) or white space.
* Support more efficient data structures such as arrays.

Libraries to use in real-world projects: pandas and NumPy.