In [6]:
# Import necessary libraries
import pandas as pd
from pathlib import Path
import os

# Define paths
# Everything is anchored at the parent of the current working directory;
# adjust `project_root` if the notebook is launched from somewhere else.
project_root = Path().resolve().parent
output_dir = project_root / "data"  # directory where the split datasets are expected
data_path = output_dir / "processed_data.parquet"  # input for the training script
script_path = project_root / "src" / "model_training.py"  # script executed in Step 2

# Step 1: Check if the processed data file exists
# Fail fast: without the processed data there is nothing for the script to split.
print("Checking if processed data file exists...")
if not data_path.exists():
    raise FileNotFoundError(f"Processed data not found at {data_path}")
print(f"Processed data found: {data_path}")

# Step 2: Run the `model_training.py` script
# The script is executed inside this notebook's namespace (as before), so any
# names it defines remain available to the following steps.
print("\nRunning model_training.py script...")
try:
    # Read the raw bytes so the file handle is closed immediately and so that
    # `compile` applies Python's own source-encoding rules (UTF-8 by default,
    # BOM / coding cookie honoured) instead of the platform locale encoding.
    script_source = script_path.read_bytes()
    # Compiling with the real filename makes tracebacks point into the script
    # rather than at an anonymous "<string>".
    exec(compile(script_source, str(script_path), "exec"))
    print("model_training.py executed successfully.")
except Exception as e:
    # Report the failure in the cell output, then re-raise so the notebook
    # stops here with the full traceback.
    print("Error running model_training.py:")
    print(e)
    raise

# Step 3: Verify output files
# Every split dataset listed below must exist in `output_dir`; each file's
# status is reported individually before deciding whether to abort.
print("\nVerifying output files...")
required_files = ["X_train.parquet", "X_test.parquet", "y_train.parquet", "y_test.parquet"]
missing_files = []
for file in required_files:
    file_path = output_dir / file
    if file_path.exists():
        print(f"File exists: {file}")
    else:
        print(f"Missing file: {file}")
        missing_files.append(file)

if missing_files:
    # Print the troubleshooting hints *before* raising: once the exception is
    # raised the rest of the cell does not run, so hints placed after this
    # point would never be shown.
    print("\nDebugging suggestions:")
    print("- Ensure that the `model_training.py` script writes output files using `to_parquet`.")
    print(f"- Confirm the output directory: {output_dir}")
    print("- Check for typos in file names or paths.")
    print("- Check for permission issues or write errors.")
    raise FileNotFoundError(f"Missing output files: {missing_files}")

print("All output files are present.")

# Step 4: Inspect and validate output files
# Load the four split datasets, show their shapes and a preview, and make sure
# none of them contains null values.
print("\nLoading and inspecting output files...")
try:
    X_train = pd.read_parquet(output_dir / "X_train.parquet")
    X_test = pd.read_parquet(output_dir / "X_test.parquet")
    y_train = pd.read_parquet(output_dir / "y_train.parquet")
    y_test = pd.read_parquet(output_dir / "y_test.parquet")

    # Pair each frame with its display name so the reports below share one loop.
    named_frames = (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )

    # Shapes of every split
    for label, frame in named_frames:
        print(f"{label} shape: {frame.shape}")

    # Preview of the training features and target
    print("\nFirst few rows of X_train:")
    print(X_train.head())
    print("\nFirst few rows of y_train:")
    print(y_train.head())

    # Null check: stops at the first split that contains a missing value
    for label, frame in named_frames:
        assert not frame.isnull().values.any(), f"Null values found in {label}"
    print("No null values detected in the split datasets.")

    # Column dtypes are printed for manual review only; nothing is enforced here.
    print("\nData types in X_train:")
    print(X_train.dtypes)
except Exception as exc:
    # Surface the problem in the cell output, then re-raise to halt the notebook.
    print("Error loading or validating output files:")
    print(exc)
    raise

# Step 5: Debugging helper in case of failure
# NOTE(review): Step 3 raises FileNotFoundError whenever `missing_files` is
# non-empty, so as the cell stands this branch is not reached after a failed
# verification.
if missing_files:
    troubleshooting_hints = (
        "- Ensure that the `model_training.py` script writes output files using `to_parquet`.",
        f"- Confirm the output directory: {output_dir}",
        "- Check for typos in file names or paths.",
        "- Check for permission issues or write errors.",
    )
    print("\nDebugging suggestions:")
    for hint in troubleshooting_hints:
        print(hint)

print("\nTesting completed successfully.")


Checking if processed data file exists...
Processed data found: C:\Users\yings\OneDrive\桌面\D100-D400_Project\data\processed_data.parquet

Running model_training.py script...
Loading cleaned data...
Cleaned data loaded successfully with shape: (1518, 141)
Splitting data into features (X) and target (y)...
Features shape: (1518, 140), Target shape: (1518,)
Splitting dataset into training and testing sets...
Training set shape: (1214, 140), Testing set shape: (304, 140)
Saving split datasets to C:\Users\yings\OneDrive\桌面\D100-D400_Project\data...
Datasets saved successfully.
First few rows of training features:
      Kilometer    Engine    Length     Width    Height  Seating Capacity  \
1315   0.382148  0.361741  0.497339  0.563830  0.584211               5.0   
1300   0.096538  0.539891  0.851567  0.742553  0.301754               5.0   
843    0.574961  0.302811  0.523950  0.653191  0.698246               5.0   
1343   0.072643  0.183137  0.246008  0.468085  0.408772               5.0   