## Flujo de trabajo de un proyecto de Machine Learning

# 1. Conseguir y cargar datos

## Conseguir datos:
1. Alguien nos entrega los datos (YAY!)
2. Datasets públicos: 
    - [scikit-learn](https://scikit-learn.org/stable/datasets/index.html)
    - [Kaggle](https://www.kaggle.com/)
    - [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets.html)
3. Internet of Things
4. Web crawling: [Scrapy](https://scrapy.org/)

# Cargar datos
- Texto
    - CSV
    - JSON
    - XML
- Imagen
    - Pillow
    - OpenCV
- Audio
    - Wavio
    - PyAudio

## CSV

In [2]:
# Read the CSV as plain lists: every row comes back as a list of string values
import csv

# newline='' leaves line-ending handling to the csv module itself
with open('datasets/some.csv', newline='') as source:
    for fields in csv.reader(source):
        print(fields)

['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address']
['1', 'Brennan', 'Blesli', 'bblesli0@theatlantic.com', 'Non-binary', '94.202.160.87']
['2', 'Abraham', 'Engel', 'aengel1@techcrunch.com', 'Agender', '106.178.90.215']
['3', 'Skipp', 'Dawtrey', 'sdawtrey2@scientificamerican.com', 'Female', '96.142.23.72']
['4', 'Cynde', 'Vivyan', 'cvivyan3@quantcast.com', 'Agender', '204.13.116.129']
['5', 'Johnnie', 'Ragbourne', 'jragbourne4@yale.edu', 'Agender', '143.90.146.248']
['6', 'Thorny', 'Conroy', 'tconroy5@examiner.com', 'Male', '86.203.62.141']
['7', 'Franni', 'Perroni', 'fperroni6@nhs.uk', 'Genderqueer', '128.243.103.50']
['8', 'Layla', 'Menco', 'lmenco7@bbb.org', 'Genderqueer', '66.106.194.116']
['9', 'Zachary', 'Sleightholm', 'zsleightholm8@bloglovin.com', 'Male', '207.69.5.19']
['10', 'Stephanie', 'Yankov', 'syankov9@theguardian.com', 'Non-binary', '169.163.50.147']


In [4]:
# Read the CSV as dictionaries: every row maps {field_name: field_value}
import csv

# newline='' is what the csv module asks for when it is handed a file object,
# so that newlines embedded inside quoted fields are interpreted correctly.
with open('datasets/some.csv', mode='r', newline='') as csv_file:
    # No fieldnames are passed, so DictReader takes them from the first row
    csv_reader = csv.DictReader(csv_file)

    for row in csv_reader:
        print(row)

{'id': '1', 'first_name': 'Brennan', 'last_name': 'Blesli', 'email': 'bblesli0@theatlantic.com', 'gender': 'Non-binary', 'ip_address': '94.202.160.87'}
{'id': '2', 'first_name': 'Abraham', 'last_name': 'Engel', 'email': 'aengel1@techcrunch.com', 'gender': 'Agender', 'ip_address': '106.178.90.215'}
{'id': '3', 'first_name': 'Skipp', 'last_name': 'Dawtrey', 'email': 'sdawtrey2@scientificamerican.com', 'gender': 'Female', 'ip_address': '96.142.23.72'}
{'id': '4', 'first_name': 'Cynde', 'last_name': 'Vivyan', 'email': 'cvivyan3@quantcast.com', 'gender': 'Agender', 'ip_address': '204.13.116.129'}
{'id': '5', 'first_name': 'Johnnie', 'last_name': 'Ragbourne', 'email': 'jragbourne4@yale.edu', 'gender': 'Agender', 'ip_address': '143.90.146.248'}
{'id': '6', 'first_name': 'Thorny', 'last_name': 'Conroy', 'email': 'tconroy5@examiner.com', 'gender': 'Male', 'ip_address': '86.203.62.141'}
{'id': '7', 'first_name': 'Franni', 'last_name': 'Perroni', 'email': 'fperroni6@nhs.uk', 'gender': 'Genderquee

In [None]:
# Create the .eoi_solutions folder, if it does not exist yet, to hold the
# files generated by the following cells
import os

# exist_ok=True turns the call into a no-op when the folder is already there,
# which avoids the gap between a separate "exists?" check and the creation.
os.makedirs('.eoi_solutions', exist_ok=True)

In [5]:
# Write rows from lists of values
import csv

# newline='' keeps the text layer from translating the '\r\n' terminator the
# csv writer emits; without it, platforms that write '\r\n' for '\n' would
# produce '\r\r\n' and the file would show a blank line after every row.
with open('.eoi_solutions/employee.csv', mode='w', newline='') as csv_file:
    employee_writer = csv.writer(csv_file)

    employee_writer.writerow(['John Smith', 'Accounting', 'November'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March'])

In [None]:
# Write rows from dictionaries {field_name: field_value}
import csv

# newline='' keeps the text layer from translating the '\r\n' terminator the
# csv writer emits; without it, platforms that write '\r\n' for '\n' would
# produce '\r\r\n' and the file would show a blank line after every row.
with open('.eoi_solutions/employee2.csv', mode='w', newline='') as csv_file:
    # fieldnames fixes the column order; the key order of each row dict is irrelevant
    fieldnames = ['emp_name', 'dept', 'birth_month']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # First line of the file: the column names
    writer.writeheader()
    writer.writerow({'emp_name': 'John Smith', 'dept': 'Accounting', 'birth_month': 'November'})
    writer.writerow({'birth_month': 'March', 'dept': 'IT', 'emp_name': 'Erica Meyers'})

## JSON

In [6]:
import json
from pprint import pprint

# Read the file. The encoding is stated explicitly so the result does not
# depend on the platform's default text encoding (JSON text is UTF-8).
with open('datasets/some.json', 'r', encoding='utf-8') as json_file:
    # Both names are bound to the same parsed object
    json_content = data = json.load(json_file)

# The content is fully loaded at this point, so the file can already be closed
pprint(json_content)


[{'email': 'avignaux0@unc.edu',
  'first_name': 'Anne',
  'gender': 'Genderqueer',
  'id': 1,
  'ip_address': '63.46.253.42',
  'last_name': 'Vignaux'},
 {'email': 'byepiskov1@fc2.com',
  'first_name': 'Brittani',
  'gender': 'Bigender',
  'id': 2,
  'ip_address': '149.228.251.148',
  'last_name': 'Yepiskov'},
 {'email': 'midenden2@patch.com',
  'first_name': 'Mirna',
  'gender': 'Male',
  'id': 3,
  'ip_address': '56.78.154.172',
  'last_name': 'Idenden'},
 {'email': 'ylicquorish3@freewebs.com',
  'first_name': 'Yvor',
  'gender': 'Female',
  'id': 4,
  'ip_address': '99.105.249.213',
  'last_name': 'Licquorish'},
 {'email': 'scrannell4@marketwatch.com',
  'first_name': 'Sayre',
  'gender': 'Genderqueer',
  'id': 5,
  'ip_address': '115.126.112.163',
  'last_name': 'Crannell'},
 {'email': 'jcollacombe5@springer.com',
  'first_name': 'Jacquenetta',
  'gender': 'Non-binary',
  'id': 6,
  'ip_address': '182.166.87.162',
  'last_name': 'Collacombe'},
 {'email': 'mkrahl6@myspace.com',
  'f

In [7]:
import json

# Records to serialise: one dict per employee
json_content = [
    {
        'emp_name': 'John Smith',
        'dept': 'Accounting',
        'birth_month': 'November',
    },
    {
        'birth_month': 'March',
        'dept': 'IT',
        'emp_name': 'Erica Meyers',
    },
]

# Serialise to a string first, then write it out in one go
serialised = json.dumps(json_content)
with open(".eoi_solutions/employee.json", "w") as target:
    target.write(serialised)

## XML (deprecado)

In [8]:
import xml.etree.ElementTree as ET

# Parse the document; keep both the tree and its root element
tree = ET.parse('datasets/some.xml')
root = tree.getroot()

# Show tag and attributes of the root first, then of each of its direct
# children (iterating an element yields its children)
for node in (root, *root):
    print(node.tag, node.attrib)

data {}
country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [None]:
# Print the attribute dict of every <neighbor> element, at any depth of the
# tree that starts at root (root comes from the parsing cell)
for attributes in (node.attrib for node in root.iter('neighbor')):
    print(attributes)

In [None]:
# For every <country> that is a direct child of root, print its "name"
# attribute followed by the text of its <rank> child
for country in root.findall('country'):
    name = country.get('name')
    # findtext() gives None when a country has no <rank> child, whereas
    # find('rank').text would raise AttributeError and abort the loop
    rank = country.findtext('rank')
    print(name, rank)

In [None]:
from xml.etree.ElementTree import Element, SubElement, tostring

# Rows to serialise: one dict per employee, {field_name: field_value}
dict_content = [
    {'emp_name': 'John Smith', 'dept': 'Accounting', 'birth_month': 'November'},
    {'birth_month': 'March', 'dept': 'IT', 'emp_name': 'Erica Meyers'}
]

# One serialised <employee> fragment per row; collected in a list and joined
# once at the end instead of growing a string with += inside the loop
fragments = []

for row in dict_content:
    elem = Element('employee')

    # Each dict key becomes a child tag whose text is the stringified value
    for key, val in row.items():
        SubElement(elem, key).text = str(val)

    # tostring() returns bytes; decode() already yields a str
    fragments.append(tostring(elem).decode("utf-8"))

xml_content = ''.join(fragments)

# Wrap all fragments in a single <data> root, preceded by the XML declaration
with open(".eoi_solutions/employee.xml", "w") as xml_file:
    xml_file.write(f'<?xml version="1.0"?><data>{xml_content}</data>')

## Pillow

## OpenCV 

## Wavio

## PyAudio