In [1]:
from csv import reader

In [7]:
#Function for loading a CSV.
def load_csv(filename):
    file = open(filename, "r")
    lines = reader(file)
    dataset = list(lines)
    return dataset


In [10]:
# Load dataset
filename='pima-indians-diabetes.csv'
dataset  =load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
                                                                  len(dataset[0])))

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns


In [11]:
# Limitation of the above function is that it will load empty lines from data files
# and add to our list of rows. we can overcome this by adding rows of data one at a time to our dataset
# and skipping empty rows.

In [14]:
# Example of loading pima indians csv dataset
from csv import reader

# load a csv file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


# Load dataset
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
                                                                  len(dataset[0])))

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns


# Basic Data conversion

## Converting String to Floats

In [15]:
# Our code for loading a csv file returns a dataset as a list of lists, 
# but each value is a string

In [16]:
print(dataset[0])

['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']


In [18]:
# A small function to convert specific columns of our loaded dataset to floating point values.
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip()) #careful to strip any whitespace from the value before making the conversion

In [19]:
# Example of loading pima indians csv dataset
from csv import reader

# load a csv file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# coverting string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())
        
        
        
# Load dataset    
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
                                                                  len(dataset[0])))
print(dataset[0])

# converting string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

print(dataset[0])

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns
['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]


## Converting Strings to Integers

##### Some machine learning algorithms prefer all values to be numeric, including the outcome or predicted value.

#### Steps:
<ol> <li> First, we locate all of the unique class values, which happen to be: 
    <ul><li>Iris-setosa</li>
        <li>Iris-versicolor</li>
        <li>Iris-virginica</li></ul>
    <li>Next, we assign an integer value to weac, such as: 0, 1 and 2.</li>
    <li> Finally, we replace all occurrences of class string values with their corresponding integer values</li>

In [21]:
# Just like previous function str_column_float(), function str_column_int()
# it also operates on a single column in the dataset.

In [27]:
# Example of loading pima indians csv dataset
from csv import reader

# load a csv file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# coverting string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())
        
        
# coverting string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

        
        
        
# Load dataset    
filename = 'iris.data'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
                                                                  len(dataset[0])))
print(dataset[0])

# converting string columns to float
for i in range(4):
    str_column_to_float(dataset, i)
    
# convert class column to int
lookup = str_column_to_int(dataset, 4)
print(dataset[0])

print(dataset[0])
print(lookup)

Loaded data file iris.data with 150 rows and 5 columns
['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 1]
[5.1, 3.5, 1.4, 0.2, 1]
{'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}
