# Apply PCA to input data (50 emission and 50 dispersion)

In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
#import pandas as pd
import csv

# File path with samples
file_path = '/Users/andresmr/Documents/Glyphosate_sensor_CFATA/Old_NN_files/EntrenaNNTare/sep17v2.csv'

# Open file with samples
with open(file_path, 'r') as file:
    reader = csv.reader(file)
    n = 0
    data = []
    for i,row in enumerate(reader):
        data.append(row)
        # if n<28:
        #     data.append(row)
        # n+=1
len(data)

# X and y data lists
_inputs = []
_targets = []

# Add features (X,y) to list varibles
for i in range(len(data)):
        row_data = []
        for j in range(3,104):
                if j!= 53:
                        row_data.append(float(data[i][j]))
        _inputs.append(row_data)
        _targets.append(float(data[i][1]))

# Convert list to arrays
X_data = np.array(_inputs)
y_data= np.array(_targets)

print(f"Original shape of training data: {X_data.shape}")

# --- Step 1: Scale the Data ---
# PCA is sensitive to feature scales, so we standardize the data first.
# IMPORTANT: You fit the scaler ONLY on the training data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_data)

# --- Step 2: Apply PCA ---
# We'll tell PCA to automatically select enough components to explain 95% of the variance.
pca = PCA(n_components=0.95)

# Fit PCA on the scaled training data and then transform it.
X_train_pca = pca.fit_transform(X_train_scaled)

print(f"\nNumber of features after PCA: {pca.n_components_}")
print(f"Shape of data after PCA: {X_train_pca.shape}")

print(pca.components_)
# X_train_pca is your new, lower-dimensional training dataset.
# You can now use this to train your regression or MLP model.

# --- Step 3: Handling New/Test Data ---
# When you need to make predictions on a test set (X_test), you must use
# the SAME scaler and pca objects you already fitted.

# Example:
# X_test = ... your test data ...
# X_test_scaled = scaler.transform(X_test)  # Note: just .transform(), not .fit_transform()
# X_test_pca = pca.transform(X_test_scaled)    # Note: just .transform(), not .fit_transform()

Original shape of training data: (56, 100)

Number of features after PCA: 8
Shape of data after PCA: (56, 8)
[[-5.90635179e-02 -3.22040108e-02 -6.09216571e-02 -4.23339715e-02
  -2.41747260e-03 -5.81994644e-02  1.91665328e-02  3.27493453e-02
   4.22431634e-02  7.35269033e-02  1.29947610e-01  1.31186013e-01
   1.29421006e-01  1.33421333e-01  1.33891953e-01  1.34594011e-01
   1.35920006e-01  1.35895064e-01  1.36161718e-01  1.36853969e-01
   1.35867788e-01  1.35011546e-01  1.36134126e-01  1.37373680e-01
   1.37092209e-01  1.37608217e-01  1.37286195e-01  1.37042241e-01
   1.37417867e-01  1.37832048e-01  1.37280019e-01  1.37176088e-01
   1.37086681e-01  1.37104799e-01  1.37246762e-01  1.36844297e-01
   1.36407537e-01  1.36275211e-01  1.36617351e-01  1.36391241e-01
   1.36788347e-01  1.35699159e-01  1.35471379e-01  1.36076995e-01
   1.35249325e-01  1.35367935e-01  1.35029043e-01  1.35553821e-01
   1.35447012e-01  1.34658669e-01 -5.99878381e-02 -5.71924859e-02
  -6.09067793e-02 -5.13347052e-02

### Outliers

In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# First, let's create some sample training data.
# In your code, you would use your actual X_train data.
np.random.seed(42)
X_train = pd.DataFrame(X_data)

# Now, let's add some obvious outliers to the data for demonstration
outliers_to_add = pd.DataFrame(np.random.uniform(low=-10, high=10, size=(5, 10)))
X_train = pd.concat([X_train, outliers_to_add]).reset_index(drop=True)

print(f"Total shape of the training data (including outliers): {X_train.shape}")

# --- Step 1: Initialize and Fit the Model ---
# The 'contamination' parameter is the expected proportion of outliers.
# 'auto' is a good starting point. You can also set a specific value, like 0.05 for 5%.
iso_forest = IsolationForest(contamination='auto', random_state=42)

# Fit the model to your data
iso_forest.fit(X_train)

# --- Step 2: Predict the Outliers ---
# The model.predict() method returns -1 for outliers and 1 for inliers.
predictions = iso_forest.predict(X_train)
X_train['is_outlier'] = predictions

# --- Step 3: Analyze the Results ---
# Count the number of outliers found
outlier_count = (predictions == -1).sum()
print(f"\nNumber of outliers detected: {outlier_count}")

# You can now easily separate the outliers from the clean data
outliers = X_train[X_train['is_outlier'] == -1]
clean_data = X_train[X_train['is_outlier'] == 1]

print("\nShowing the first 5 detected outliers:")
print(outliers.head())

print(f"\nShape of the data after removing outliers: {clean_data.shape}")

Total shape of the training data (including outliers): (61, 100)

Number of outliers detected: 18

Showing the first 5 detected outliers:
      0     1     2    3     4     5     6     7     8     9  ...     91  \
0   0.0  -3.0  14.0  1.0  -2.0   4.0   8.0  -9.0  -2.0 -12.0  ...   23.0   
10  7.0  20.0  17.0  0.0  19.0  19.0 -10.0 -26.0 -13.0 -27.0  ...   63.0   
11  0.0  12.0  12.0 -6.0  -6.0  -3.0  14.0 -12.0  -3.0  -8.0  ...  -40.0   
12 -4.0   3.0  13.0  4.0  -7.0   4.0  11.0  -8.0  -2.0 -18.0  ... -104.0   
13 -3.0  -5.0  23.0  5.0 -13.0  18.0  -1.0 -10.0   3.0 -20.0  ... -107.0   

      92     93     94     95     96     97     98     99  is_outlier  
0   38.0   80.0   68.0   90.0   59.0   90.0  100.0   84.0          -1  
10  84.0   77.0  104.0   98.0  144.0  133.0  120.0  127.0          -1  
11  55.0  118.0  140.0  169.0  184.0  154.0  150.0  164.0          -1  
12 -64.0    9.0   38.0  125.0  132.0  114.0  117.0  101.0          -1  
13  95.0   37.0   93.0   85.0  131.0  142.0  

### Read samples and get number of features per file

In [7]:
import os
import csv

def get_first_row_column_counts(folder_path: str) -> dict:
    """
    Reads all .csv and .txt files in a folder and returns the number of
    columns in the first row for each file.

    Args:
        folder_path: The path to the folder you want to scan.

    Returns:
        A dictionary mapping filenames to their first-row column count.
        Includes an error message if a file cannot be read.
    """
    column_counts = {}
    
    # Ensure the provided path is a valid directory
    if not os.path.isdir(folder_path):
        return {"error": "The provided path is not a valid directory."}

    # Loop through each item in the specified directory
    for filename in os.listdir(folder_path):
        # Check if the file is a .csv or .txt file
        if filename.lower().endswith(('.csv', '.txt')):
            file_path = os.path.join(folder_path, filename)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Read only the first line to be efficient
                    first_line = f.readline()

                    # Handle empty files
                    if not first_line:
                        column_counts[filename] = 0
                        continue
                    
                    # For CSV files, use the csv module to correctly handle
                    # potential quoting (e.g., "a,b",c -> 2 columns)
                    if filename.lower().endswith('.csv'):
                        reader = csv.reader([first_line])
                        header = next(reader)
                        column_counts[filename] = len(header)
                    
                    # For TXT files, guess the delimiter (comma, then tab)
                    else:
                        if ',' in first_line:
                            delimiter = ','
                        elif '\t' in first_line:
                            delimiter = '\t'
                        else:
                            # Fallback to splitting by whitespace for other cases
                            delimiter = None 
                        
                        columns = first_line.strip().split(delimiter)
                        column_counts[filename] = len(columns)

            except Exception as e:
                column_counts[filename] = f"Error: {e}"
                
    return column_counts

In [9]:
# Call the function with the path to your folder
file_column_info = get_first_row_column_counts('/Users/andresmr/Documents/Glyphosate_sensor_CFATA/samples')

# Print the results
for filename, count in file_column_info.items():
    print(f"'{filename}': {count} columns")

'a16f.csv': 104 columns
'19julv2.csv': 104 columns
'jul21.csv': 104 columns
'16ago.csv': 104 columns
'ago18.csv': 104 columns
'mocho3.csv': 104 columns
'ago26.csv': 104 columns
'mocho2.csv': 104 columns
'mochos.csv': 104 columns
'last.csv': 104 columns
'19jul.csv': 104 columns
'20julv1.csv': 104 columns
'mocho4.csv': 104 columns
'mochos5.csv': 104 columns
'45med20mar.csv': 45 columns
's220jul.csv': 104 columns
'ago26M4.csv': 104 columns
'16agov2.csv': 104 columns
'45med26marzo2024.csv': 45 columns
'45med9marzo.txt': 45 columns
