## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
# Install a global filter that ignores every warning raised from here on.
# NOTE(review): this also hides convergence and deprecation warnings from the
# libraries imported above — consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Confirmation banner so the cell produces visible output when it succeeds.
print(" Libraries imported successfully!")

## Step 2: Load Data

In [None]:
# Read the three competition CSV files from the shared upload directory
DATA_DIR = '/mnt/user-data/uploads'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Report the size of each split and the training schema
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("\nColumns:", train.columns.tolist())

## Step 3: Explore Data

In [None]:
# Preview the first rows of the training data. This must stay the last
# expression of the cell so the notebook renders the returned frame.
train.head()

In [None]:
# Count missing entries per training column
print("Missing values:")
missing_per_column = train.isna().sum()
print(missing_per_column)

# Summary statistics of the prediction target
print("\nTarget (Humidity) stats:")
humidity_summary = train['Humidity'].describe()
print(humidity_summary)

In [None]:
# Side-by-side overview: target histogram (left), feature box plots (right)
fig, (ax_target, ax_features) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: distribution of the target
ax_target.hist(train['Humidity'], bins=50, edgecolor='black', alpha=0.7)
ax_target.set_xlabel('Humidity')
ax_target.set_ylabel('Frequency')
ax_target.set_title('Humidity Distribution')

# Right panel: spread of the raw numeric weather measurements
boxplot_columns = [
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Wind Speed (km/h)',
    'Visibility (km)',
    'Pressure (millibars)',
]
train[boxplot_columns].boxplot(ax=ax_features)
# Tilt the long column names so they do not overlap
plt.setp(ax_features.get_xticklabels(), rotation=45)
ax_features.set_title('Feature Distributions')

fig.tight_layout()
plt.show()

In [None]:
# Correlate every numeric column with the target and list strongest first
numeric_cols = train.select_dtypes(include=[np.number]).columns
correlation_matrix = train[numeric_cols].corr()
correlations = correlation_matrix['Humidity'].sort_values(ascending=False)

print("Correlation with Humidity:")
print(correlations)

## Step 4: Feature Engineering

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` extended with derived weather features.

    The input frame is not modified. The following columns are added, in
    this order: ``Temp_Diff``, ``Wind_Speed_Sq``, ``Wind_Sin``, ``Wind_Cos``,
    ``Pressure_Dev`` and ``Visibility_Log``.

    Args:
        df: DataFrame providing the columns ``Temperature (C)``,
            ``Apparent Temperature (C)``, ``Wind Speed (km/h)``,
            ``Wind Bearing (degrees)``, ``Pressure (millibars)`` and
            ``Visibility (km)``.

    Returns:
        A new DataFrame holding the original columns plus the derived ones.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    enriched = df.copy()

    # (new column, builder) pairs; builders run top to bottom on the copy
    recipes = (
        # Gap between measured and apparent temperature
        ('Temp_Diff',
         lambda d: d['Temperature (C)'] - d['Apparent Temperature (C)']),
        # Squared wind speed
        ('Wind_Speed_Sq', lambda d: d['Wind Speed (km/h)'] ** 2),
        # Sine/cosine pair: encodes the bearing so 0 and 360 degrees coincide
        ('Wind_Sin',
         lambda d: np.sin(np.radians(d['Wind Bearing (degrees)']))),
        ('Wind_Cos',
         lambda d: np.cos(np.radians(d['Wind Bearing (degrees)']))),
        # Offset from 1013.25 millibars (the reference pressure used here)
        ('Pressure_Dev', lambda d: d['Pressure (millibars)'] - 1013.25),
        # log(1 + x) transform of visibility
        ('Visibility_Log', lambda d: np.log1p(d['Visibility (km)'])),
    )
    for column, build in recipes:
        enriched[column] = build(enriched)

    return enriched

# Derive the same engineered columns for the training and the test split
train = engineer_features(train)
test = engineer_features(test)

print(" Features engineered!")
print("New features: Temp_Diff, Wind_Speed_Sq, Wind_Sin, Wind_Cos, Pressure_Dev, Visibility_Log")

## Step 5: Prepare Features

In [None]:
# Columns that must not become model inputs: the row id, the target and the
# text columns. Every other column of `train` is used as a feature.
# NOTE(review): nothing here checks dtypes, so this assumes all remaining
# columns are numeric.
exclude = ['Idx', 'Humidity', 'Summary', 'Precip Type', 'Daily Summary']
feature_cols = train.columns[~train.columns.isin(exclude)].tolist()

print(f"Selected {len(feature_cols)} features:")
for col in feature_cols:
    print(f"  - {col}")

# Independent copies so later cleaning does not write back into train/test
X = train.loc[:, feature_cols].copy()
y = train['Humidity'].copy()
X_test = test.loc[:, feature_cols].copy()

# Missing-value counts before any imputation is applied
print("\nBefore cleaning:")
print("X shape:", X.shape)
print("NaN in X:", X.isna().sum().sum())
print("NaN in y:", y.isna().sum())
print("NaN in X_test:", X_test.isna().sum().sum())

In [None]:
# Handle NaN values
# Column medians are learned from the training features only and reused for
# the test features, so both sets receive the identical imputation. Filling
# the test set from its own medians would impute values the model never saw
# paired with "missing" during training.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# A column that is entirely NaN in training has a NaN median and is left
# unfilled above; fall back to 0 for anything still missing.
X = X.fillna(0)
X_test = X_test.fillna(0)

# Handle target - rows without a target value cannot be used for training
valid_mask = ~y.isna()
X = X[valid_mask]
y = y[valid_mask]

# Reset indices so X and y are positionally aligned after the row drop
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Post-cleaning sanity report: every NaN count below should be 0
print(f"\nAfter cleaning:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"NaN in X: {X.isna().sum().sum()}")
print(f"NaN in y: {y.isna().sum()}")
print(f"NaN in X_test: {X_test.isna().sum().sum()}")
print(f"\n Data ready for training!")

## Step 6: Train-Validation Split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)

# Scale features
scaler = StandardScaler()

# Model selection: scaling statistics come from the training split only
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Final fit / inference: the same scaler object is refit on all rows, and
# those full-data statistics are then applied to the test features
X_full_scaled = scaler.fit(X).transform(X)
X_test_scaled = scaler.transform(X_test)

print(" Data scaled!")

## Step 7: Train Multiple Models

In [None]:
# Define models
# Candidate regressors, keyed by the display name used in every report below
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01, max_iter=5000),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
}

# Display name -> fitted estimator plus its train/validation metrics
results = {}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split only; the validation split stays unseen
    model.fit(X_train_scaled, y_train)

    # Predict on both splits so the train/validation gap is visible
    y_train_pred = model.predict(X_train_scaled)
    y_val_pred = model.predict(X_val_scaled)

    # Evaluate: R² on both splits, RMSE on the validation split
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    results[name] = {
        'model': model,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'val_rmse': val_rmse
    }

    # Labels previously contained a mis-decoded "²"; print the real character
    print(f"  Train R²: {train_r2:.4f}")
    print(f"  Val R²:   {val_r2:.4f}")
    print(f"  Val RMSE: {val_rmse:.4f}")

print("\n All models trained!")

## Step 8: Compare Models

In [None]:
# Create comparison DataFrame
comparison = pd.DataFrame([
    {
        'Model': name,
        'Train R¬≤': results[name]['train_r2'],
        'Val R¬≤': results[name]['val_r2'],
        'Val RMSE': results[name]['val_rmse']
    }
    for name in results.keys()
]).sort_values('Val R¬≤', ascending=False)


print(comparison.to_string(index=False))

# Select best model
best_name = comparison.iloc[0]['Model']
best_model = results[best_name]['model']

print(f"\nüèÜ Best Model: {best_name}")
print(f"   Validation R¬≤: {results[best_name]['val_r2']:.4f}")

In [None]:
# Visualize comparison
plt.figure(figsize=(10, 6))
x_pos = np.arange(len(comparison))
plt.bar(x_pos - 0.2, comparison['Train R¬≤'], 0.4, label='Train R¬≤', alpha=0.8)
plt.bar(x_pos + 0.2, comparison['Val R¬≤'], 0.4, label='Val R¬≤', alpha=0.8)
plt.xlabel('Model')
plt.ylabel('R¬≤ Score')
plt.title('Model Performance Comparison')
plt.xticks(x_pos, comparison['Model'], rotation=45, ha='right')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## Step 9: Final Training on Full Dataset

In [None]:
# Retrain best model on full training data
# The selected estimator is refit on all labelled rows (training and
# validation together); X_full_scaled was produced by the scaler after it was
# refit on the complete feature matrix X.
print(f"Retraining {best_name} on full dataset...")
best_model.fit(X_full_scaled, y)
print(" Training complete!")

## Step 10: Make Predictions

In [None]:
# Score the scaled test features, then clamp the output to the valid
# range [0, 1]
predictions = np.clip(best_model.predict(X_test_scaled), 0, 1)

# Quick sanity summary of the clipped predictions
print("Predictions statistics:")
prediction_summary = (
    ("Min:", predictions.min()),
    ("Max:", predictions.max()),
    ("Mean:", predictions.mean()),
    ("Std:", predictions.std()),
)
for stat_label, stat_value in prediction_summary:
    # Labels are padded to a common width so the numbers line up
    print(f"  {stat_label:<5} {stat_value:.4f}")

In [None]:
# Compare the predicted test distribution with the training target
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: overlaid histograms of predictions and training target
ax_hist.hist(predictions, bins=50, edgecolor='black', alpha=0.7, label='Test Predictions')
ax_hist.hist(y, bins=50, alpha=0.5, label='Train Target')
ax_hist.set_xlabel('Humidity')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Prediction Distribution')
ax_hist.legend()

# Right panel: box plots of the same two samples
ax_box.boxplot([y, predictions], labels=['Train', 'Test Pred'])
ax_box.set_ylabel('Humidity')
ax_box.set_title('Distribution Comparison')

fig.tight_layout()
plt.show()

## Step 11: Create Submission File

In [None]:
# Pair every test row id with its predicted humidity
submission = test[['Idx']].assign(Humidity=predictions)

print("Submission preview:")
print(submission.head(10))
print("\nShape:", submission.shape)

In [None]:
# Save to file
import os

output_path = '/mnt/user-data/outputs/submission.csv'
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)

print(f"Submission saved to: {output_path}")

# Verify: read the file back to confirm it was written and has the expected
# row count and columns
verify = pd.read_csv(output_path)
print(f"\nVerification:")
print(f"  Rows: {len(verify)}")
print(f"  Columns: {list(verify.columns)}")
print(f"  First few rows:")
print(verify.head())