In [3]:
import pandas as pd
import glob # glob module is needed to find all pathnames matching a specified pattern
import re # re module is needed for regular expressions in the parse_filename function 
import os # os module is needed for path normalization in the parse_filename function
from io import StringIO  # Correct import for StringIO

# Define the relative path to the datasets
data_path = './Bakery/*.s1p' # The * is a wildcard character that matches any character(s) in the filename
data_files = glob.glob(data_path) # glob.glob returns a list of pathnames that match the pattern

# Helper function to parse filenames for labels
def parse_filename(filepath):
    filepath = os.path.normpath(filepath) 
    filename = os.path.basename(filepath) 
    match = re.match(r'([A-B])_([1-3])_([1-9][0-9]*).s1p', filename)  
    if match:
        product_type = match.group(1)
        storage_condition = int(match.group(2))
        replicate = int(match.group(3))
        return product_type, storage_condition, replicate
    else:
        raise ValueError(f"Filename format not recognized: {filename}")

# List to collect all data
data_list = [] 

# Load and process each file
for file in data_files:
    file = os.path.normpath(file)
    
    # Read the file and skip any lines starting with '#' (comment lines)
    with open(file, 'r') as f:
        # Read all lines, skipping those starting with '#'
        lines = [line for line in f if not line.startswith('#')]
    
    # Load the filtered lines into a DataFrame
    df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
    
    # Parse labels from the filename
    product_type, storage_condition, replicate = parse_filename(file)
    
    # Add labels to the DataFrame
    df['product_type'] = product_type
    df['storage_condition'] = storage_condition
    df['replicate'] = replicate
    
    # Append to the main data list
    data_list.append(df)

# Concatenate all dataframes into one DataFrame
all_data = pd.concat(data_list, ignore_index=True)

# Encode product type as 0 (bread) and 1 (cookies)
all_data['product_type'] = all_data['product_type'].map({'A': 0, 'B': 1})

# One-hot encode storage condition
all_data = pd.get_dummies(all_data, columns=['storage_condition'], prefix='storage')

# Display the preprocessed data
all_data.head()


  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), d

Unnamed: 0,frequency,gain,phase,product_type,replicate,storage_1,storage_2,storage_3
0,300000000,0.987971,-0.033453,0,2,False,True,False
1,306000000,0.987575,-0.033633,0,2,False,True,False
2,312000000,0.987458,-0.03321,0,2,False,True,False
3,318000000,0.988083,-0.033681,0,2,False,True,False
4,324000000,0.988038,-0.033631,0,2,False,True,False


In [22]:
all_data.shape

(6060, 8)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prepare features (X) and target (y)
X = all_data[['frequency', 'gain', 'phase', 'storage_1', 'storage_2', 'storage_3']]
y = all_data['product_type']  # Change this to `storage_condition` for storage classification

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier (you can choose other classifiers here)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.971947194719472
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       885
           1       0.96      0.99      0.97       933

    accuracy                           0.97      1818
   macro avg       0.97      0.97      0.97      1818
weighted avg       0.97      0.97      0.97      1818



In [5]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Define the model
model = Sequential([
    Dense(32, input_shape=(X_train.shape[1],)), # 32 neurons, input shape is the number of features
    Activation('relu'), # ReLU activation function
    Dense(1), # 1 neuron
    Activation('sigmoid'), # Sigmoid activation function
])

2024-11-14 01:48:59.093395: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-14 01:48:59.093429: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-11-14 01:49:00.620091: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-11-14 01:49:00.620125: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2024-11-14 01:49:00.620362: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Yohannes21): /proc/driver/nvidia/version does not exist
2024-11-14 01:49:00.620713: I tensorflow/core/platform/cpu_feature