### CSV Part 2
Reading and writing CSV files by hand first (no pandas, no `csv` module), then with the standard-library `csv` module.

In [1]:
# Lap times for the track session: each inner list (row) is one athlete,
# and each value in it is that athlete's time for one of the 3 laps.
track_times = [
    [13.10, 13.59, 13.44],
    [13.93, 13.85, 13.47],
    [14.12, 14.41, 13.89],
    [14.42, 13.55, 13.43]
]
track_times

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

In [4]:
# Serialise the nested list as CSV text.
# Inner join: the lap times of one athlete become "t1,t2,t3".
# Outer join: the athlete rows are separated by "\n". Because join() only
# places the separator *between* items, the last row gets no trailing newline.
track_times_csv = "\n".join(
    ",".join(str(lap_time) for lap_time in athlete_times)
    for athlete_times in track_times
)

print(track_times_csv)  # print() renders each "\n" as a real line break

13.1,13.59,13.44
13.93,13.85,13.47
14.12,14.41,13.89
14.42,13.55,13.43


In [5]:
# Store the data in a CSV file.
# The string is already CSV-formatted, so plain file I/O is enough here
# and the csv library does not need to be imported.
with open("track_times.csv", "w") as out_file:
    out_file.write(track_times_csv)

In [6]:
# Read the whole file back into a single string to inspect its raw contents
with open("track_times.csv") as in_file:
    track_times_csv_string_from_disk = in_file.read()

track_times_csv_string_from_disk  # bare expression: the repr shows the "\n" separators

'13.1,13.59,13.44\n13.93,13.85,13.47\n14.12,14.41,13.89\n14.42,13.55,13.43'

In [7]:
# Reverse the process: rebuild the original list of lists from the file

track_times_from_disk = []

with open("track_times.csv") as f:  # f is the file object
    for row in f:  # iterating over a file yields one line (a string) at a time
        if not row.strip():
            # Skip blank lines (e.g. one left by a trailing newline):
            # float("") would raise ValueError.
            continue
        # Split the line on "," and convert each piece to float.
        # float() ignores surrounding whitespace, so float("13.44\n") is fine.
        times = [float(time) for time in row.split(",")]
        track_times_from_disk.append(times)  # add this row (a list of floats) to the outer list

track_times_from_disk

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

In [8]:
track_times_from_disk == track_times # True: the data read back from disk equals the original nested list

True

Summary 

| Step             | What it Does                      |
| ---------------- | --------------------------------- |
| `open()`         | Opens the CSV file for reading    |
| `for row in f`   | Reads line-by-line                |
| `row.split(",")` | Splits string into list           |
| `float(time)`    | Converts strings to float         |
| `append(times)`  | Adds list of floats to outer list |


More challenges:

data type checking and appropriate conversion with different data types in each column

processing headers (column names and metadata preceding the actual tabular data)

properly handling text data inside a CSV, e.g. if your data contains the text `"Hello, World!"` you want to make sure that the `,` is treated as part of the contents of that cell, not treated as a delimiter separating the columns

representing your data in different types of data structures (e.g., a list of dicts as opposed to a list of lists)

In [9]:
# Example with the csv module: same result with less code

import csv

# newline="" is what the csv documentation recommends for any file object
# handed to csv.reader, so the module does its own newline handling.
with open("track_times.csv", newline="") as f:
    # Pass the file to a reader object. QUOTE_NONNUMERIC means: a quoted
    # field is kept as a string, an unquoted field is converted to float
    # (every value in this dataset is unquoted).
    # The reader is an iterator; each item is one processed line as a list.
    reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)

    # list() consumes the iterator and collects every row
    track_times_with_csv_module = list(reader)

track_times_with_csv_module

[[13.1, 13.59, 13.44],
 [13.93, 13.85, 13.47],
 [14.12, 14.41, 13.89],
 [14.42, 13.55, 13.43]]

| Argument           | Description                                                |
| ------------------ | ---------------------------------------------------------- |
| `csvfile`          | The file object you opened (e.g., `f = open(...)`)         |
| `dialect`          | Preset formatting rules like `"excel"` (default), `"unix"` |
| `delimiter`        | Character that separates values (default is `','`)         |
| `quotechar`        | Character used to quote fields (default is `'"'`)          |
| `quoting`          | Controls how quotes are interpreted                        |
| `skipinitialspace` | If `True`, skips space after delimiter                     |
| `strict`           | If `True`, raises error on malformed CSV rows              |


# Example with Dictionary

In [11]:
# newline='' is the csv documentation's recommendation for files passed to csv.reader
with open("olympic_medals.csv", encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Print only the header and the first 5 rows of data.
    # zip() with range(6) caps the preview at 6 lines and simply stops early
    # if the file is shorter, where repeated next(reader) would raise StopIteration.
    for _, row in zip(range(6), reader):
        print(row)

['Gender', 'Event', 'Location', 'Year', 'Medal', 'Name', 'Nationality', 'Result']
['M', '10000M Men', 'Rio', '2016', 'G', 'Mohamed FARAH', 'GBR', '25:05.17']
['M', '10000M Men', 'Rio', '2016', 'S', 'Paul Kipngetich TANUI', 'KEN', '27:05.64']
['M', '10000M Men', 'Rio', '2016', 'B', 'Tamirat TOLA', 'ETH', '27:06.26']
['M', '10000M Men', 'Beijing', '2008', 'G', 'Kenenisa BEKELE', 'ETH', '27:01.17']
['M', '10000M Men', 'Beijing', '2008', 'S', 'Sileshi SIHINE', 'ETH', '27:02.77']


In [13]:
# newline='' is the csv documentation's recommendation for files passed to the csv readers
with open("olympic_medals.csv", encoding='utf-8', newline='') as f:
    # Pass the file object to DictReader instead of reader: it yields one
    # dict per data row, keyed by the column names from the header line
    reader = csv.DictReader(f)
    # Consume the iterator so the rows stay available after the file is closed
    olympics_data = list(reader)

# Print the first 5 rows of data; slicing is safe even when there are fewer
# than 5 rows, whereas indexing olympics_data[index] would raise IndexError
for row in olympics_data[:5]:
    print(row)

{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'G', 'Name': 'Mohamed FARAH', 'Nationality': 'GBR', 'Result': '25:05.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'S', 'Name': 'Paul Kipngetich TANUI', 'Nationality': 'KEN', 'Result': '27:05.64'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Rio', 'Year': '2016', 'Medal': 'B', 'Name': 'Tamirat TOLA', 'Nationality': 'ETH', 'Result': '27:06.26'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'G', 'Name': 'Kenenisa BEKELE', 'Nationality': 'ETH', 'Result': '27:01.17'}
{'Gender': 'M', 'Event': '10000M Men', 'Location': 'Beijing', 'Year': '2008', 'Medal': 'S', 'Name': 'Sileshi SIHINE', 'Nationality': 'ETH', 'Result': '27:02.77'}


In [14]:
# Filter by gold medals: keep every row whose "Medal" column is "G"
gold_medals = [medal for medal in olympics_data if medal["Medal"] == "G"]

print(f"""Out of {len(olympics_data)} total medals, this dataset 
contains information about {len(gold_medals)} gold medals""")

Out of 2394 total medals, this dataset 
contains information about 799 gold medals


In [15]:
# Gold medals won by the USA in 2016, keeping only the event and the name.
# "Year" is compared with the string "2016" because DictReader leaves every
# value as text.
usa_2016_gold_medals = [
    {"Event": medal["Event"], "Name": medal["Name"]}
    for medal in olympics_data
    if medal["Medal"] == "G"
    and medal["Nationality"] == "USA"
    and medal["Year"] == "2016"
]
usa_2016_gold_medals

[{'Event': '1500M Men', 'Name': 'Matthew CENTROWITZ'},
 {'Event': '400M Hurdles Men', 'Name': 'Kerron CLEMENT'},
 {'Event': '4X400M Relay Men', 'Name': 'null'},
 {'Event': 'Decathlon Men', 'Name': 'Ashton EATON'},
 {'Event': 'Long Jump Men', 'Name': 'Jeff HENDERSON'},
 {'Event': 'Shot Put Men', 'Name': 'Ryan CROUSER'},
 {'Event': 'Triple Jump Men', 'Name': 'Christian TAYLOR'},
 {'Event': '100M Hurdles Women', 'Name': 'Brianna ROLLINS'},
 {'Event': '400M Hurdles Women', 'Name': 'Dalilah MUHAMMAD'},
 {'Event': '4X100M Relay Women', 'Name': 'null'},
 {'Event': '4X400M Relay Women', 'Name': 'null'},
 {'Event': 'Long Jump Women', 'Name': 'Tianna BARTOLETTA'},
 {'Event': 'Shot Put Women', 'Name': 'Michelle CARTER'}]

In [17]:
# Write the filtered rows to a new CSV file.
# newline='' stops an extra blank line from appearing between rows.
# encoding='utf-8' matches the encoding used to read olympic_medals.csv,
# so the output does not depend on the platform's default encoding.
with open("usa_2016_gold_medals.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=["Event", "Name"])
    writer.writeheader()                   # first line: Event,Name
    writer.writerows(usa_2016_gold_medals) # one line per dict, in list order