In [None]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import KernelPCA

In [None]:
# Load the cleaned measurement table; rows are matched to images via 'Roll_Pic'.
LH = pd.read_csv('data/LHDataClean.csv')

# Collect the images that KPCA will be run on, keyed by the first five
# characters of the file name (matched against 'Roll_Pic' below).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = "/Users/cameronhudson/Documents/Masters/Research/Snow/256res_LH"
valid_images = [".jpg",".gif",".png",".tga"]
imgs = {}
# os.listdir returns entries in arbitrary order; sorting makes it deterministic
# which file is kept when two names share the same five-character prefix.
for f in sorted(os.listdir(path)):
    ext = os.path.splitext(f)[1]
    if ext.lower() not in valid_images:
        continue
    # Image.open is lazy and keeps the underlying file open. Copy the pixels
    # out inside a context manager so the handle is released right away
    # instead of holding one open file per image.
    with Image.open(os.path.join(path, f)) as opened:
        imgs[f[0:5]] = opened.copy()

img_df = pd.DataFrame(imgs.items(), columns=['Roll_Pic', 'images'])

# Keep only the rows that have both measurements and an image.
LH = pd.concat([LH.set_index('Roll_Pic'),img_df.set_index('Roll_Pic')], axis=1, join='inner').reset_index()

# Bad data point: drag coef. is around 10 times the magnitude of the rest.
# NOTE(review): 111 is a row label of the joined frame, so it silently points at
# a different row if the CSV or the image set changes -- confirm when data changes.
LH = LH.drop([111])
# Re-number the rows; the old labels are kept in an 'index' column, which is
# dropped further down together with the other bookkeeping columns.
LH = LH.reset_index()

# Encode the Magono & Lee shape class as one integer code per row.
# Each one-hot row has exactly one non-zero entry, so the position of that
# entry (argmax along the row) is the category's column index in the encoder.
# This replaces slicing the number out of the printed form of the sparse
# matrix, which depends on SciPy's print format and only worked for indices
# of at most two digits.
shape_onehot = OneHotEncoder().fit_transform(LH[['Shape Class (Magono & Lee)']])
shapes = pd.Series(
    np.asarray(shape_onehot.argmax(axis=1)).ravel().astype(int),
    index=LH.index,
    name='shapes',
)

LH = LH.drop(['Roll_Pic'], axis=1)
# Append the integer shape code as the 'shapes' column (aligned on the row index).
LH = LH.join(shapes, how='inner')
LH = LH.rename(columns={'Mass [kg]': 'mass', 'Diameter [m]': 'diameter', 'Fall Velocity [m/s]': 'fall_velocity', 'Air Density [kg/m^3]': 'air_density', 'Air Viscosity[N*s/m^2]': 'air_viscosity'})
target = LH['Drag Coefficient']
# Drop the raw shape label and the bookkeeping columns left over from the CSV
# export ('Unnamed: 0') and the earlier reset_index() ('index').
LH = LH.drop(['Shape Class (Magono & Lee)', 'Unnamed: 0', 'index'], axis=1)

# Getting data ready for KPCA dimensional reduction.
# Convert every image to single-channel grayscale ('L') and resample it to
# 1750 x 1750 so that all images have the same number of pixels.
LH['images'] = [picture.convert('L').resize((1750, 1750)) for picture in LH['images']]

# Replace each image by its pixel array, and keep a flattened (1-D) copy per row.
LH['images'] = LH['images'].apply(lambda picture: np.array(picture))
LH['flattened_image'] = LH['images'].apply(lambda pixels: pixels.flatten())

# Stack the flattened images into one matrix: one row per image.
image_data = np.stack(LH['flattened_image'].tolist())

# Standardise the pixel columns so the black background doesn't have a great
# effect on KPCA.
scaler = StandardScaler()
scaled_image_data = scaler.fit_transform(image_data)

# Number of kernel principal components to extract. Tested with 1, 3, 5.
n_components = 3
# random_state is fixed so the result is reproducible when scikit-learn picks
# (or is asked to use) the ARPACK or randomized eigensolver, both of which
# start from random values; it has no effect with the dense solver.
kpca = KernelPCA(n_components=n_components, kernel='rbf', random_state=0)
image_features = kpca.fit_transform(scaled_image_data)

# Name the KPCA columns KPCA1..KPCAk from the number of components actually
# produced, so nothing here has to be edited by hand when n_components changes.
n_kpca = image_features.shape[1]
kpca_columns = ['KPCA' + str(i) for i in range(1, n_kpca + 1)]
df = pd.DataFrame(image_features, columns=kpca_columns)

# Attach the components to the measurement rows (aligned on the row index).
LH = LH.join(df, how='outer')

# Keep the target, the physical features, the shape code and the KPCA components.
feature_columns = ['Drag Coefficient', 'mass', 'diameter', 'fall_velocity', 'air_density', 'air_viscosity', 'Reynolds Number', 'shapes']
filtered_LH = LH[feature_columns + kpca_columns]

# Save new KPCA data in desired folder; the file name records the number of
# components (e.g. LH_KPCA_3_highRes.csv for three components).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
filtered_LH.to_csv('/Users/cameronhudson/Documents/Masters/Research/Snow/data/LH_KPCA_' + str(n_kpca) + '_highRes.csv')