# 07 Test Train Split

Data is split into a set for model training and a held-out set for validation.
Validating only on data the model was not trained on gives an honest performance estimate and makes overfitting detectable.

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit

In [9]:
# Rectangular tables are loaded as DataFrames. int_spectra has no header row,
# so its columns are numbered; the other two take their column names from the
# first line of the file.
int_spectra = pd.read_csv(
    './training_data/06_int_spectra.csv',
    header=None,
)
fit_params_standard = pd.read_csv(
    './training_data/06_fit_params_standard.csv'
)
elements = pd.read_csv(
    './training_data/06_elements.csv'
)

# Rows of the fit and resize spectra have differing lengths, so they cannot be
# held in a DataFrame; keep each file as a list of raw text lines instead
# (line terminators are retained).
with open('./training_data/06_fit_spectra.csv') as handle:
    fit_spectra = list(handle)
with open('./training_data/06_resize_spectra.csv') as handle:
    resize_spectra = list(handle)

In [10]:
# Sanity check: every dataset should report the same number of rows, because
# the split below selects rows from all of them by position.
for dataset in (int_spectra, fit_spectra, resize_spectra,
                elements, fit_params_standard):
    print(len(dataset))

33247
33247
33247
33247
33247


In [11]:
# Draw one shuffled 80/20 train/test partition of row positions. The resulting
# index arrays are positional, so the same indices select matching rows from
# every dataset loaded above.
x = int_spectra
y = fit_params_standard
# A fixed random_state makes the partition reproducible: without it, every run
# of this notebook would overwrite the 07_* files with a different split.
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
# split() is a generator of (train, test) index arrays; n_splits=1 means the
# first item is the only one.
train_index, test_index = next(shuffle_split.split(x, y))

In [12]:
# Write the train and test partitions of int_spectra. Each selected row becomes
# one comma-separated line; no header is written (the source was read with
# header=None).
int_spectra_outputs = [
    ('./training_data/07_train_int_spectra.csv', train_index),
    ('./training_data/07_test_int_spectra.csv', test_index),
]
for filename, split_index in int_spectra_outputs:
    with open(filename, 'w') as out_file:
        for row_pos in split_index:
            spectrum = int_spectra.iloc[row_pos]
            # Format each cell with str() exactly as it comes out of the row.
            cells = [str(spectrum.iloc[col]) for col in range(len(spectrum))]
            out_file.write(','.join(cells))
            out_file.write('\n')

In [13]:
# fit_spectra and resize_spectra are lists of raw text lines (their rows have
# differing lengths), so the selected lines are copied through unchanged
# rather than being re-serialised.
lines = [fit_spectra, resize_spectra]
split_outputs = [
    (train_index, ['./training_data/07_train_fit_spectra.csv',
                   './training_data/07_train_resize_spectra.csv']),
    (test_index, ['./training_data/07_test_fit_spectra.csv',
                  './training_data/07_test_resize_spectra.csv']),
]
for split_index, filenames in split_outputs:
    for filename, source_lines in zip(filenames, lines):
        with open(filename, 'w') as f:
            for j in split_index:
                line = source_lines[j]
                # The last line of an input file may lack a trailing newline.
                # Rows are written in shuffled order, so that line can land in
                # the middle of the output and would otherwise be glued to the
                # row after it.
                if not line.endswith('\n'):
                    line += '\n'
                f.write(line)

In [14]:
# Write the train and test partitions of the tables that carry column names.
# Each output file starts with a comma-separated header of the column names,
# followed by one comma-separated line per selected row.
dataframes = [fit_params_standard, elements]
table_outputs = [
    (train_index, ['./training_data/07_train_fit_params_standard.csv',
                   './training_data/07_train_elements.csv']),
    (test_index, ['./training_data/07_test_fit_params_standard.csv',
                  './training_data/07_test_elements.csv']),
]
for split_index, filenames in table_outputs:
    for filename, df in zip(filenames, dataframes):
        with open(filename, 'w') as out_file:
            # Header line: column names are written as-is.
            out_file.write(','.join(df.columns))
            out_file.write('\n')
            for row_pos in split_index:
                record = df.iloc[row_pos]
                # Format each cell with str() exactly as it comes out of the row.
                cells = [str(record.iloc[col]) for col in range(len(record))]
                out_file.write(','.join(cells))
                out_file.write('\n')