# Task 4: Binary Classification - Data Consolidation & Preprocessing

Combine the two training datasets (`Training_part1.csv` and `Training_part2.csv`) using the `id` column, preprocess the data, split into train/test sets, and save the result.

In [None]:
import pandas as pd

# Locations of the two halves of the training data.
file_path1 = 'data/Training_part1.csv'
file_path2 = 'data/Training_part2.csv'

# Both files are read with ';' as the field separator.
_csv_options = {"sep": ";"}
df1 = pd.read_csv(file_path1, **_csv_options)
df2 = pd.read_csv(file_path2, **_csv_options)

# Preview each frame (first rows, then column dtypes / non-null counts) so the
# structure of both inputs is visible before they are merged.
_previews = (
    ("First DataFrame (df1) Head:", "\nFirst DataFrame (df1) Info:", df1),
    ("\nSecond DataFrame (df2) Head:", "\nSecond DataFrame (df2) Info:", df2),
)
for _head_title, _info_title, _frame in _previews:
    print(_head_title)
    print(_frame.head())
    print(_info_title)
    _frame.info()

In [None]:
# Join the two halves on their shared 'id' key.
# The join is an inner join, so only ids present in BOTH frames are kept.
merged_df = df1.merge(df2, how='inner', on='id')

# Quick look at the combined frame.
print("\nMerged DataFrame Head:")
print(merged_df.head())
print("\nMerged DataFrame Info:")
merged_df.info()

# Sanity check: compare the input row counts with the merged row count and the
# number of distinct ids, to spot dropped or duplicated rows.
_n_left = len(df1)
_n_right = len(df2)
_n_merged = len(merged_df)
_n_unique_ids = merged_df['id'].nunique()
print(f"\nNumber of rows in df1: {_n_left}")
print(f"Number of rows in df2: {_n_right}")
print(f"Number of rows in merged_df: {_n_merged}")
print(f"Number of unique IDs in merged_df: {_n_unique_ids}")

# Data Preprocessing

Before building the classification model, we need to preprocess the data. The steps typically include:

1.  **Handling Missing Values:** Identify and decide how to handle any missing data (e.g., imputation, removal).
2.  **Feature Selection/Engineering:** Decide which features to use. The `id` column is likely not useful for prediction and should be dropped. We might also create new features if needed.
3.  **Encoding Categorical Features:** Convert any non-numeric features into a numeric format suitable for machine learning models (e.g., one-hot encoding).
4.  **Feature Scaling:** Scale numerical features to have a similar range (e.g., using StandardScaler) to prevent features with larger values from dominating the model.
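5.  **Splitting Data:** Divide the dataset into training and testing sets to evaluate the model's performance on unseen data. In the code below the split is done *before* imputation and scaling, so the imputer and scaler are fit on the training set only and no information from the test set leaks into them.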

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # Added import

# Preprocessing pipeline for the merged data:
#   clean -> encode -> split -> impute -> scale.
# Produces X_train_scaled / X_test_scaled (DataFrames) and y_train / y_test.

# Work on a copy so the original merged dataframe is left untouched.
processed_df = merged_df.copy()

# 1. Handling Missing Values (Initial Check)
print("\nMissing values per column (initial):")
print(processed_df.isnull().sum())

# 2. Feature Selection
# Drop the 'id' column as it's just an identifier
processed_df = processed_df.drop('id', axis=1)

# Separate features (X) and target (y)
y = processed_df['Class']
X = processed_df.drop('Class', axis=1)

# 3. Encoding Categorical Features and Data Cleaning
print("\nData types of features before encoding/cleaning:")
print(X.dtypes)

# 3a. Recover numeric columns that were read as text.
# A numeric column that contains a few stray tokens (e.g. 'iii') is loaded with
# dtype object. If it were passed to get_dummies it would be exploded into one
# dummy column per distinct number, and the numeric coercion further down would
# never see the original values. Such columns are therefore coerced to numeric
# BEFORE the categorical columns are selected; the stray tokens become NaN and
# are imputed after the split.
# Heuristic: an object column is treated as numeric when at least this share of
# its non-missing values parses as a number.
NUMERIC_PARSE_THRESHOLD = 0.5
recovered_numeric_cols = []
# Only 'object' columns are probed: a 'category' dtype was set deliberately.
for col in X.select_dtypes(include=['object']).columns:
    parsed = pd.to_numeric(X[col], errors='coerce')
    n_present = X[col].notna().sum()
    if n_present > 0 and parsed.notna().sum() / n_present >= NUMERIC_PARSE_THRESHOLD:
        X[col] = parsed
        recovered_numeric_cols.append(col)
print(f"\nText columns coerced to numeric before encoding: {recovered_numeric_cols}")

# 3b. Identify the remaining categorical columns based on dtype
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
print(f"\nIdentified categorical columns for one-hot encoding: {list(categorical_cols)}")

# Apply one-hot encoding to these categorical columns
if not categorical_cols.empty:
    # NOTE: with dummy_na=False a missing categorical value is encoded as all
    # zeros, which (because drop_first=True) is the same encoding as the
    # dropped first category.
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True, dummy_na=False)
    print("\nFeatures after one-hot encoding:")
    print(X.head())
    print("\nData types after one-hot encoding:")
    print(X.dtypes)
else:
    print("\nNo explicit (object/category) categorical columns found for one-hot encoding.")

# Safety net: make sure every remaining column is numeric; anything that still
# cannot be parsed becomes NaN and is handled by the imputer.
print("\nAttempting to convert all columns to numeric, coercing errors...")
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

print("\nData types after coercing all columns to numeric:")
print(X.dtypes)

print("\nMissing values after coercing to numeric (includes original NaNs and coerced strings):")
missing_values_summary = X.isnull().sum()
print(missing_values_summary[missing_values_summary > 0]) # Show only columns with missing values

# 5. Splitting Data (BEFORE imputation and scaling)
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# A column with no observed value in the training split has no mean to learn.
# SimpleImputer discards such columns on transform, which would make the
# imputed array narrower than the saved column labels and break the DataFrame
# reconstruction below, so they are removed from both splits explicitly.
empty_train_cols = X_train.columns[X_train.isnull().all().to_numpy()]
if not empty_train_cols.empty:
    print(f"\nDropping columns with no observed training values: {list(empty_train_cols)}")
    X_train = X_train.drop(columns=empty_train_cols)
    X_test = X_test.drop(columns=empty_train_cols)

# Impute missing values (those created by 'coerce' or originally present)
# Fit imputer ONLY on X_train to prevent data leakage
imputer = SimpleImputer(strategy='mean') # Using mean, can be changed to median, most_frequent

# Save column names and index before imputation, as imputer returns numpy array
X_train_columns = X_train.columns
X_train_index = X_train.index
X_test_columns = X_test.columns
X_test_index = X_test.index

print("\nImputing missing values...")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Convert back to DataFrame
X_train = pd.DataFrame(X_train, columns=X_train_columns, index=X_train_index)
X_test = pd.DataFrame(X_test, columns=X_test_columns, index=X_test_index)

print("\nMissing values in X_train after imputation (should be 0):")
print(X_train.isnull().sum().sum())
print("Missing values in X_test after imputation (should be 0):")
print(X_test.isnull().sum().sum())

# 4. Feature Scaling (Now X_train and X_test should be purely numeric and imputed)
scaler = StandardScaler()
# Fit scaler ONLY on training data
X_train_scaled_np = scaler.fit_transform(X_train)
# Apply the training-set statistics to the test data
X_test_scaled_np = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames
X_train_scaled = pd.DataFrame(X_train_scaled_np, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled_np, columns=X_test.columns, index=X_test.index)

print("\nScaled Training Data Head:")
print(X_train_scaled.head())

In [None]:
# Re-attach the target to each split and tag every row with the split it
# belongs to. `.values` strips the Series index, so the target is assigned
# positionally rather than by index alignment.
train_df = X_train_scaled.assign(Class=y_train.values, split='train')
test_df = X_test_scaled.assign(Class=y_test.values, split='test')

# Stack train on top of test. The original row index is kept (no re-numbering),
# so rows can still be traced back to the merged frame.
final_processed_df = pd.concat([train_df, test_df])

# Destination of the preprocessed dataset.
output_parquet_path = 'data/preprocessed_data.parquet'

# Write the result as Parquet; failures are reported but do not abort the run.
print(f"\nSaving preprocessed data to {output_parquet_path}...")
try:
    # If 'Class' is not numeric it may need mapping before modelling, e.g.
    # 'y'/'n' -> 1/0 via .map({'y': 1, 'n': 0}). It is saved as-is here; the
    # info dump below shows the dtypes that end up in the file.
    print("\nFinal DataFrame Info before saving:")
    final_processed_df.info()

    # index=True stores the original row index alongside the columns.
    final_processed_df.to_parquet(output_parquet_path, index=True)
    print(f"Successfully saved preprocessed data to {output_parquet_path}.")
except ImportError:
    # pandas needs a Parquet engine to be installed for to_parquet.
    print("\nError: 'pyarrow' or 'fastparquet' package is required to write to Parquet format.")
    print("Please install it using: pip install pyarrow")
except Exception as e:
    print(f"\nAn error occurred while saving to Parquet: {e}")