### Feature Selection - Using Mutual Information
**Description**: Score each feature by its mutual information with the target, then keep the highest-scoring features.

In [1]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import unittest

# ---------------------- Generate Sample Data ----------------------
# Synthetic classification dataset: 200 rows, 6 features, of which 3 are
# informative and 1 is redundant. shuffle=False keeps the columns in the
# order make_classification generates them (per the scikit-learn docs:
# informative first, then redundant, then the remaining noise columns).
X, y = make_classification(
    n_samples=200,
    n_features=6,
    n_informative=3,
    n_redundant=1,
    shuffle=False,
    random_state=42,
)

# Column labels feature_1 .. feature_6, one per generated feature.
feature_names = [f'feature_{idx}' for idx in range(1, 7)]

# Features plus the class label in a single frame.
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# ---------------------- Error Handling ----------------------
def validate_dataframe(df, target_col):
    """Check that *df* is usable for supervised feature selection.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the column that holds the class labels.

    Returns:
        True when every check passes.

    Raises:
        ValueError: If the frame is empty, or if the target column has
            at most one distinct value.
        KeyError: If ``target_col`` is not one of the frame's columns.
    """
    if df.empty:
        raise ValueError("DataFrame is empty.")

    has_target = target_col in df.columns
    if not has_target:
        raise KeyError(f"Target column '{target_col}' not found.")

    # A target with fewer than two distinct values gives nothing to classify.
    distinct_classes = df[target_col].nunique()
    if distinct_classes < 2:
        raise ValueError("Target column must have more than one class.")

    return True

# Validate the input before any scoring happens. Only the two exception
# types validate_dataframe raises are handled; the error is reported and
# then re-raised so the rest of the script does not run on data that
# failed validation.
try:
    validate_dataframe(df, 'target')
except (KeyError, ValueError) as e:
    print(f"Validation Error: {e}")
    raise

# ---------------------- Mutual Information ----------------------
# Separate the class labels from the feature columns.
y = df['target']
X = df.drop(columns=['target'])

# Score every feature against the target. The fixed random_state is passed
# so repeated runs give the same scores.
mi_scores = mutual_info_classif(X, y, random_state=42)

# Label each score with its feature name, then rank from highest to lowest.
mi_series = pd.Series(data=mi_scores, index=X.columns)
mi_series = mi_series.sort_values(ascending=False)

# Keep the three best-ranked features and the matching columns of X.
top_features = list(mi_series.index[:3])
X_selected = X.loc[:, top_features]

print("Top Features by Mutual Information:\n", mi_series)
print("\nSelected Top 3 Features:\n", top_features)
print("\nReduced Feature Set:\n", X_selected.head())

# ---------------------- Unit Tests ----------------------
class TestMutualInformationSelection(unittest.TestCase):
    """Sanity checks on the module-level mutual-information selection results."""

    def test_non_empty_selection(self):
        """The reduced feature set has at least one column."""
        selected_count = X_selected.shape[1]
        self.assertGreater(selected_count, 0, "No features selected.")

    def test_expected_column_presence(self):
        """Every name in top_features is a column of the reduced set."""
        selected_columns = X_selected.columns
        for feature in top_features:
            self.assertIn(feature, selected_columns)

    def test_mutual_information_values(self):
        """No mutual-information score is negative."""
        scores_non_negative = bool((mi_series >= 0).all())
        self.assertTrue(scores_non_negative,
                        "All mutual information values should be non-negative.")

# Collect the test case's methods into a suite and run it in-process with a
# text runner, rather than calling unittest.main().
_selection_suite = unittest.TestLoader().loadTestsFromTestCase(
    TestMutualInformationSelection
)
unittest.TextTestRunner().run(_selection_suite)


...
----------------------------------------------------------------------
Ran 3 tests in 0.002s

OK


Top Features by Mutual Information:
 feature_1    0.309414
feature_2    0.195494
feature_4    0.073346
feature_3    0.063785
feature_5    0.000000
feature_6    0.000000
dtype: float64

Selected Top 3 Features:
 ['feature_1', 'feature_2', 'feature_4']

Reduced Feature Set:
    feature_1  feature_2  feature_4
0  -0.755184  -1.184062  -0.615488
1  -0.466941  -0.743048  -0.108170
2  -0.995675  -0.791000  -0.762148
3  -1.303612   0.046448  -0.766858
4  -1.641297  -1.106601  -1.704311


<unittest.runner.TextTestResult run=3 errors=0 failures=0>