In [None]:
# Importing all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# LOADING THE DATASET

In [None]:
# Read the raw dataset from the CSV file (path is relative to the notebook's working directory).
csv_path = "TASK-ML-INTERN.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the top rows of the dataset
head_rows = df.head()
display(head_rows)

In [None]:
# Print the `DataFrame.info()` summary for the loaded dataset
# (column names, non-null counts, dtypes and memory usage).

df.info()

In [None]:
# Descriptive statistics as produced by the default `DataFrame.describe()`
summary_stats = df.describe()
display(summary_stats)

In [None]:
# Count nulls per column and report only the columns that actually have some
missing_values = df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("Missing Values:\n", columns_with_gaps)

# DATA PREPROCESSING

In [None]:
# Impute gaps in the numeric columns with each column's median
# (non-numeric columns have no entry in the medians Series, so they are left as they are).
numeric_medians = df.select_dtypes(include=[np.number]).median()
df = df.fillna(numeric_medians)



In [None]:
# Normalize spectral data to the [0, 1] range, one column at a time.
# The sample id ('hsi_id') and the target ('vomitoxin_ppb') are excluded and stay untouched.
# NOTE(review): the scaler is fitted on every row of `df`, so this in-place scaling uses
# min/max statistics from rows that later land in the test split.
spectral_columns = [col for col in df.columns if col not in ('vomitoxin_ppb', 'hsi_id')]
scaler = MinMaxScaler()
df[spectral_columns] = scaler.fit_transform(df[spectral_columns])


In [None]:
# Box plots showing how each normalized spectral feature is distributed
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[spectral_columns], ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # feature names are long; rotate so they do not overlap
ax.set_title("Distribution of Normalized Spectral Features")
plt.show()

# DIMENSIONALITY REDUCTION

In [None]:
# Applying PCA for dimensionality reduction
pca = PCA(n_components=50)  # Keep the first 50 principal components (only the first two are plotted)
pca_result = pca.fit_transform(df[spectral_columns])

In [None]:
# Explained variance: fraction of the total variance captured by each retained component
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance by Components: {explained_variance}")

In [None]:
# Plot the first two principal components, coloured by the vomitoxin level of each sample
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    pca_result[:, 0],
    pca_result[:, 1],
    c=df['vomitoxin_ppb'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Vomitoxin (ppb)')
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("PCA Visualization of Spectral Data")
plt.show()


# MODEL TRAINING

In [None]:
# Split data into training and testing sets, then fit the scalers on the training rows only.
# Fitting a scaler before the split lets min/max statistics of the test rows leak into training.
feature_matrix = df[spectral_columns].to_numpy()
target_ppb = df[['vomitoxin_ppb']].to_numpy()  # kept 2-D: scikit-learn scalers expect (n_samples, 1)

features_train, features_test, target_train, target_test = train_test_split(
    feature_matrix, target_ppb, test_size=0.2, random_state=42
)

# Min-max scaling is an affine map per column, so re-fitting on the training rows of the
# already-normalized frame is equivalent to fitting on the raw training rows (for every
# column that varies within the training rows). Test values may now fall slightly outside [0, 1].
x_scaler = MinMaxScaler()
n_bands = len(spectral_columns)
X_train = x_scaler.fit_transform(features_train).reshape(-1, n_bands, 1)  # Reshape for CNN input
X_test = x_scaler.transform(features_test).reshape(-1, n_bands, 1)

# Target scaler: fitted on the training targets, applied unchanged to the test targets.
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(target_train)
y_test = y_scaler.transform(target_test)


In [None]:
# Build an optimized CNN model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout masks and batch
# shuffling are repeatable between runs (some GPU kernels may still be non-deterministic).
keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit input layer: one sequence step per spectral column, a single channel.
    keras.Input(shape=(X_train.shape[1], 1)),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    BatchNormalization(),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear')  # Regression output
])

# Huber loss for the regression objective; MAE is tracked as an additional metric.
model.compile(optimizer=Adam(learning_rate=0.001), loss=Huber(), metrics=['mae'])


In [None]:
# Train the model
# Validation uses a slice held out from the training data, so the test set stays unseen until
# the evaluation cell. Keras takes the last 20% of the (already shuffled) training arrays.
training_callbacks = [
    # Stop once validation loss has not improved for 10 epochs and roll back to the best weights.
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    # Halve the learning rate when validation loss plateaus for 5 epochs.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
]
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    callbacks=training_callbacks,
    verbose=1,
)


# MODEL EVALUATION

In [None]:
# Predict on the held-out test set; `predict` returns shape (n, 1), flattened here to (n,).
y_pred = model.predict(X_test).flatten()
# Flatten the targets the same way so the true and predicted arrays have matching shapes.
y_true = np.asarray(y_test).ravel()

# All metrics are computed on the min-max-scaled target, not on raw ppb values.
mae = mean_absolute_error(y_true, y_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
r2 = r2_score(y_true, y_pred)

In [None]:

# Report the evaluation metrics, each rounded to four decimal places
report_lines = [
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
]
print("\n".join(report_lines))

In [None]:
# Scatter plot of actual vs. predicted values.
# `y_test` and `y_pred` are in min-max-scaled units; convert both back to ppb so the plotted
# values match the axis labels.
actual_ppb = y_scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1)).ravel()
predicted_ppb = y_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel()
# Array min/max instead of the builtins, which would iterate row-by-row over a 2-D array.
ppb_range = [actual_ppb.min(), actual_ppb.max()]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(actual_ppb, predicted_ppb, alpha=0.7, color='blue')
ax.plot(ppb_range, ppb_range, '--', color='red')  # Ideal fit line (predicted == actual)
ax.set_xlabel("Actual Vomitoxin (ppb)")
ax.set_ylabel("Predicted Vomitoxin (ppb)")
ax.set_title("Actual vs. Predicted Values")
plt.show()

## 📊 Model Performance Summary

### ✅ **Performance Metrics**
- **Mean Absolute Error (MAE):** 0.0338
- **Root Mean Squared Error (RMSE):** 0.0798
- **R² Score:** 0.6092

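The model shows **moderate predictive accuracy**, with an **R² score of 0.6092**, meaning it explains about **60.9% of the variance** in the data. The **MAE and RMSE values are relatively low**, indicating that the predictions are not too far from actual values. Note that MAE and RMSE are computed on the **min-max-scaled target**, so they are fractions of the target's range rather than values in ppb.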

### ⚠️ **Limitations & Future Improvements**
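1. **Data Dimensionality:** PCA was used here only for exploration and visualization; the CNN is trained on the full set of normalized spectral features. Feeding it PCA-reduced features would lower the dimensionality but might cause some **loss of information**.  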
2. **Model Complexity:** The CNN model might benefit from **further hyperparameter tuning** or **a hybrid approach (CNN + LSTM)**.  
3. **Dataset Size:** More data samples could improve model generalization.  
4. **Alternative Models:** Exploring **transformer-based models** (e.g., Attention Networks) might improve performance.  

### 🚀 **Future Enhancements**
- **Hyperparameter tuning** (Grid Search for best filters, kernel sizes).  
- **Alternative architectures** (LSTM, Transformer models).  
- **Data augmentation** to enhance the dataset for better learning.  
