# Pizza: Cleaning up the raw yeast datasets

## Setting up

In [1]:
import os

# Name of the environment variable this notebook uses as a flag: the cells
# below set it to "false" before the working directory has been changed and
# to "true" afterwards, so re-running them does not change directory twice.
CD_KEY = "--DEMO01_IN_ROOT"

In [2]:
# First run in this process: the flag is absent, so start it out as "false".
# An existing value (from an earlier run of this notebook) is left untouched.
if os.environ.get(CD_KEY) is None:
    os.environ[CD_KEY] = "false"

In [3]:
if (
    CD_KEY not in os.environ
    or os.environ[CD_KEY] is None
    or len(os.environ[CD_KEY]) == 0
    or os.environ[CD_KEY] == "false"
):
    %cd ..
else:
    print(os.getcwd())
    
os.environ[CD_KEY] = "true"

/mnt/data/projekti/osobno/pizza


### Packages

In [4]:
import csv

from src.yeast_prediction.preprocessing.dataset import (
    preprocess_raw_yeast_dataset
)

### Constants

In [5]:
# The four tuples below are parallel: position i in each one describes the
# same dataset (where to read it, how it is delimited, where to write it,
# and how the written file is delimited).

# Inputs.
source_file_paths = (
    "data/yeast_prediction/20130807_yeast-raw.tsv",
    "data/yeast_prediction/20180110_yeast-raw.tsv",
)
source_delimiters = ("\t", "\t")

# Outputs.
destination_file_paths = (
    "data/yeast_prediction/20130807_yeast-preprocessed.tsv",
    "data/yeast_prediction/20180110_yeast-preprocessed.tsv",
)
destination_delimiters = ("\t", "\t")

## Functionality

Be careful: this will overwrite any existing files at the destination.

In [6]:
# Preprocess each raw dataset and write the returned rows to the matching
# destination path. The four tuples are walked in lockstep, so entry i of
# each one belongs to the same dataset.
for (
    source_file_path,
    source_delimiter,
    destination_file_path,
    destination_delimiter,
) in zip(
    source_file_paths,
    source_delimiters,
    destination_file_paths,
    destination_delimiters,
):
    new_rows = preprocess_raw_yeast_dataset(
        source_file_path, delimiter=source_delimiter
    )

    # newline="" is required by the csv module: the writer emits its own line
    # terminators, and letting the file object translate them too produces
    # doubled line endings on Windows. Plain "w" is enough because the file
    # is only written, never read back, inside this block.
    with open(destination_file_path, mode="w", newline="") as file:
        writer = csv.writer(file, delimiter=destination_delimiter)
        writer.writerows(new_rows)

## Examples

In [7]:
# Print the first rows of each destination file as a quick sanity check.
for destination_file_path, delimiter in zip(
    destination_file_paths, destination_delimiters
):
    print(f"First 5 entries of {destination_file_path}:")

    # newline="" lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with open(destination_file_path, newline="") as file:
        # Use this loop's own delimiter. The previous version read
        # `destination_delimiter`, a variable left behind by an earlier
        # cell's loop, so every file was parsed with the last dataset's
        # delimiter and the cell depended on that earlier cell having run.
        reader = csv.reader(file, delimiter=delimiter)

        # zip() stops as soon as either side runs out, so files with fewer
        # than five rows simply print fewer lines.
        for _, row in zip(range(5), reader):
            print(f"\t{row}")

    print()

First 5 entries of data/yeast_prediction/20130807_yeast-preprocessed.tsv:
	['temperature_kelvin', 'cake_yeast_percentage', 'fermentation_hours']
	['274.81666666666666', '0.01', '2731']
	['274.81666666666666', '0.02', '1357']
	['274.81666666666666', '0.03', '902']
	['274.81666666666666', '0.05', '538']

First 5 entries of data/yeast_prediction/20180110_yeast-preprocessed.tsv:
	['temperature_kelvin', 'cake_yeast_percentage', 'fermentation_hours']
	['274.81666666666666', '0.3', '167']
	['274.81666666666666', '0.4', '136']
	['274.81666666666666', '0.5', '115']
	['274.81666666666666', '0.6', '101']

