In [None]:
# Importing libraries
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Method 1: Using Chi Square Test
The chi-square test is used when the feature is categorical and the target variable is categorical too (or can reasonably be treated as categorical). It measures the degree of association between two categorical variables.

The chi-squared statistic is calculated using the following formula, where “O” stands for the observed (actual) count and “E” stands for the count expected if the two variables were independent: $\chi^2 = \sum \frac{(O - E)^2}{E}$. If the variables are independent, the O and E values will be close and the statistic will be small; if they are associated, the chi-squared value will be high.

![image.png](attachment:image.png)

In [None]:
# Load the wine dataset from the mounted Drive folder and display it.
# NOTE(review): later cells rebind `df` to small toy frames, so the wine data
# is only reachable under this name until the next assignment to `df`.
df=pd.read_csv('/content/drive/MyDrive/CONVAI/MY_LAB/My trails/wine.csv')
df

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


Implementation from Scratch

In [None]:
def chi_square(df, attribute, target, normalize=True):
    """Pearson chi-square association between two categorical columns.

    For every (attribute value, target value) pair the observed joint count is
    compared with the count expected under independence,
    ``row_total * column_total / N``, and ``(O - E)**2 / E`` is accumulated.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    attribute : str
        Name of the categorical feature column.
    target : str
        Name of the categorical target column.
    normalize : bool, default True
        If True, return the statistic divided by the number of rows (the
        historical behaviour of this function). Pass False to get the raw
        chi-square statistic, which is the quantity that must be compared
        with a chi-square critical value.

    Returns
    -------
    float
        The (optionally N-normalized) chi-square statistic; 0.0 for an empty
        frame.
    """
    n_obs = len(df)
    if n_obs == 0:
        # No observations: nothing to compare, and dividing by N would fail.
        return 0.0

    attr_col = df[attribute]
    target_col = df[target]

    # Build every category mask once instead of re-filtering the whole frame
    # for each (attribute value, target value) pair.
    attr_masks = [attr_col == value for value in attr_col.unique()]
    target_masks = [target_col == value for value in target_col.unique()]
    target_totals = [int(mask.sum()) for mask in target_masks]

    statistic = 0.0
    for attr_mask in attr_masks:
        attr_total = int(attr_mask.sum())
        for target_mask, target_total in zip(target_masks, target_totals):
            observed_freq = int((attr_mask & target_mask).sum())
            expected_freq = (attr_total * target_total) / n_obs

            # A zero expected count (e.g. a NaN category, which never equals
            # itself) contributes nothing and must not be divided by.
            if expected_freq > 0:
                statistic += ((observed_freq - expected_freq) ** 2) / expected_freq

    return statistic / n_obs if normalize else statistic

In [None]:
import pandas as pd

# Small hand-made sample: one categorical feature ('Age Group') and one
# categorical target ('Drink'), seven observations.
data = {
    'Age Group': ['Youth', 'Youth', 'Adult', 'Adult', 'Senior', 'Senior', 'Senior'],
    'Drink': ['Tea', 'Coffee', 'Tea', 'Tea', 'Coffee', 'Coffee', 'Tea']
}

# NOTE(review): this rebinds `df`, replacing the wine DataFrame loaded earlier.
df = pd.DataFrame(data)



# Apply the from-scratch chi_square function to the sample.
# The value returned is the statistic divided by the number of rows.
chi_square_value = chi_square(df, 'Age Group', 'Drink')
print(f"Chi-square Statistic: {chi_square_value:.4f}")


Chi-square Statistic: 0.3194


In [None]:
# Chi-square test of independence between 'Age Group' and 'Drink'.
# The distribution is imported under an alias so it does not share the name
# `chi2` with sklearn.feature_selection.chi2, which a later cell imports.
from scipy.stats import chi2 as chi2_dist

# Degrees of freedom: (rows - 1) * (columns - 1) of the contingency table
num_attr_categories = len(df['Age Group'].unique())  # Number of unique age groups
num_target_categories = len(df['Drink'].unique())    # Number of unique drinks
df_degrees_of_freedom = (num_attr_categories - 1) * (num_target_categories - 1)

# chi_square_value is the statistic divided by the number of rows, but the
# critical value below is on the scale of the raw statistic, so undo that
# normalization before comparing.
chi_square_statistic = chi_square_value * len(df)

# Determine the critical value
alpha = 0.05  # Significance level
critical_value = chi2_dist.ppf(1 - alpha, df_degrees_of_freedom)
print(f"Critical Value at alpha={alpha}: {critical_value:.4f}")

# Compare the raw chi-square statistic with the critical value
if chi_square_statistic > critical_value:
    print("Reject the null hypothesis: There is a significant association between Age Group and Drink.")
else:
    print("Fail to reject the null hypothesis: There is no significant association between Age Group and Drink.")

Critical Value at alpha=0.05: 5.9915
Fail to reject the null hypothesis: There is no significant association between Age Group and Drink.


Using the above function, we can compute the chi-square score between each input feature and choose the K-best features

Implementation using In-built function

In [None]:
# Feature matrix and class labels for the wine data.
# `df` was rebound to the 7-row Age Group/Drink sample above, so the wine data
# is reloaded under its own name instead of slicing `df`.
wine_df = pd.read_csv('/content/drive/MyDrive/CONVAI/MY_LAB/My trails/wine.csv')

# The class label is the 'Wine' column, not the last column (Proline).
# 'Unnamed: 0' is dropped only if the CSV carries a saved index column.
X = wine_df.drop(columns=['Wine', 'Unnamed: 0'], errors='ignore')
Y = wine_df['Wine']

In [None]:
from sklearn import datasets
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# k = 4 keeps the four highest-scoring features.
# score_func=chi2 ranks each feature by its chi-square score against the target Y.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X,Y)
# One chi-square score per column of X.
fit.scores_

In [None]:
# Refit the selector on (X, Y) and keep only the columns it selected.
X_new=test.fit_transform(X, Y)
X_new

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

# Self-contained toy example: four non-negative integer features and a binary target.
# NOTE(review): this cell rebinds `data`, `df`, `X`, `Y`, `test` and `fit`.
data = {
    'Feature1': [1, 0, 1, 0, 1],
    'Feature2': [2, 3, 1, 4, 3],
    'Feature3': [0, 1, 0, 1, 0],
    'Feature4': [5, 5, 5, 6, 6],
    'Target': [1, 0, 1, 0, 1]
}

df = pd.DataFrame(data)

# Features and target variable
X = df.drop('Target', axis=1)  # Feature matrix
Y = df['Target']  # Target variable

# Select the top 2 features based on the Chi-Squared score
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)

# Scores are returned in the same order as the columns of X
scores = fit.scores_
features = X.columns

# Combine scores with feature names
feature_scores = pd.DataFrame({'Feature': features, 'Score': scores})
print(feature_scores)

# Rank the score table directly to list the best features
top_features = feature_scores.nlargest(2, 'Score')  # Change to 4 for top 4
print("\nTop Features:\n", top_features)


    Feature     Score
0  Feature1  2.000000
1  Feature2  1.038462
2  Feature3  3.000000
3  Feature4  0.006173

Top Features:
     Feature  Score
2  Feature3    3.0
0  Feature1    2.0


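The values differ from the from-scratch implementation for two reasons. First, the `chi_square` function above divides the statistic by N. Second, scikit-learn's `chi2` does not build a contingency table of category values: it treats each non-negative feature value as a count (frequency) and compares the per-class sums of each feature with the sums expected if the feature were independent of the class.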

# Method 2: Using Information Gain

In [None]:
def find_entropy(df):
    """Shannon entropy (in bits) of the class column of ``df``.

    The class column is taken to be the last column of the frame.
    Returns 0 when the frame has no rows.
    """
    label_col = df.keys()[-1]
    labels = df[label_col]
    total = len(labels)
    # Fraction of rows carrying each distinct label, in order of first appearance.
    fractions = (len(df[labels == label]) / total for label in labels.unique())
    return sum(-(p * np.log2(p)) for p in fractions)

In [None]:
def find_information_entropy(df,attribute):
    """Weighted average class entropy after splitting ``df`` on ``attribute``.

    The class column is taken to be the last column of ``df``. For every
    distinct value of ``attribute`` the entropy (in bits) of the class labels
    within that subset is computed, and the subset entropies are averaged with
    weights equal to each subset's share of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column holds the class labels.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    float
        The conditional entropy of the class given ``attribute`` (0 for an
        empty frame).
    """
    class_col = df.keys()[-1]
    labels = df[class_col].unique()
    n_rows = len(df)
    avg_entropy = 0
    for attr_value in df[attribute].unique():
        in_group = df[attribute] == attr_value
        group_size = int(in_group.sum())
        if group_size == 0:
            # A NaN attribute value never equals itself, so its subset is
            # empty; skip it instead of dividing by zero.
            continue
        entropy_subsample = 0
        for label in labels:
            # One combined, index-aligned mask (rather than chained boolean
            # indexing) so frames with duplicate index labels work too.
            joint = int((in_group & (df[class_col] == label)).sum())
            prob = joint / group_size
            # The small epsilon keeps log2 finite when a label is absent
            # from the subset (prob == 0).
            entropy_subsample += -(prob * np.log2(prob + 1e-7))
        avg_entropy += (group_size / n_rows) * entropy_subsample
    return avg_entropy

In [None]:
def information_gain(df,attribute):
    """Reduction in class entropy obtained by splitting ``df`` on ``attribute``.

    Computed as the entropy of the class column minus the weighted average
    entropy of the subsets produced by the split.
    """
    entropy_before = find_entropy(df)
    entropy_after = find_information_entropy(df,attribute)
    return entropy_before - entropy_after

In [None]:
# `df` currently holds the Feature1..Target toy frame, which has no 'Alcohol'
# column, so the wine data is reloaded under its own name for this example.
wine_df = pd.read_csv('/content/drive/MyDrive/CONVAI/MY_LAB/My trails/wine.csv')

# find_entropy / find_information_entropy treat the LAST column as the class
# label, so move 'Wine' to the end before scoring.
wine_class_last = wine_df[[col for col in wine_df.columns if col != 'Wine'] + ['Wine']]
information_gain(wine_class_last,'Alcohol')

In [None]:
from sklearn import datasets
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
# k = 2 keeps the two highest-scoring features.
# score_func=mutual_info_classif ranks each feature by its estimated mutual information with the target Y.
test = SelectKBest(score_func=mutual_info_classif, k=2)
fit = test.fit(X,Y)
# One mutual-information score per column of X.
fit.scores_

In [None]:
# Refit the mutual-information selector on (X, Y) and keep only the selected columns.
X_new=test.fit_transform(X, Y)
X_new

# Method 3: Removing features with low variance
Boolean features are Bernoulli random variables, and the variance of such variables is given by

$$\mathrm{Var}(X) = p\,(1 - p)$$


- If $p = 1$ (the feature is always 1), the variance is $1 \times (1 - 1) = 0$ → no variation.
- If $p = 0$ (the feature is always 0), the variance is $0 \times (1 - 0) = 0$ → no variation.
- If $p = 0.5$ (the feature is 1 half of the time and 0 the other half), the variance is maximized: $\mathrm{Var}(X) = 0.5 \times (1 - 0.5) = 0.25$.

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.

As an example, suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples.

In this case, $p = 0.8$, so the variance is $0.8 \times 0.2 = 0.16$.

This means the threshold is set to 0.16. Any feature in the dataset with a variance less than 0.16 will be removed.


In [None]:
from sklearn.feature_selection import VarianceThreshold
# Threshold is p * (1 - p) with p = 0.8, i.e. 0.16.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# Fit on the current feature matrix X and keep only the columns that pass the threshold.
X_new=sel.fit_transform(X)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Sample training dataset
train_data = {
    'Feature_A': [1, 1, 1, 1, 1],  # No variance, constant feature
    'Feature_B': [1, 1, 1, 0, 1],  # Low variance, mostly 1's
    'Feature_C': [3, 4, 5, 2, 1],  # High variance, more variability
    'Feature_D': [0, 0, 0, 1, 1],  # Binary feature
}

# Convert to DataFrame
X_train = pd.DataFrame(train_data)

# Set up VarianceThreshold with threshold = 0.8 * (1 - 0.8), i.e. 0.16
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

# Fit the selector on the training data and drop the columns it rejects
X_train_new = sel.fit_transform(X_train)
# Show the resulting data after removing low variance features
print("\nData After VarianceThreshold:\n", X_train_new)

# Display which features were kept
# get_support() returns a boolean mask over the input columns; for this data
# it is [False, True, True, True] (see the printed selection below).
selected_columns = X_train.columns[sel.get_support()]
print("\nSelected Features:\n", selected_columns)

# A separate test dataset with the same four columns
test_data = {
    'Feature_A': [1, 1, 1],
    'Feature_B': [0, 1, 0],
    'Feature_C': [4, 3, 5],
    'Feature_D': [0, 1, 0],
}

# Convert to DataFrame
X_test = pd.DataFrame(test_data)

# Apply transform() only: the column selection learned from the training data
# is reused, the selector is not refitted on the test data
X_test_new = sel.transform(X_test)

# Display the transformed test data
print("\nTransformed Test Data:\n", X_test_new)



Data After VarianceThreshold:
 [[1 3 0]
 [1 4 0]
 [1 5 0]
 [0 2 1]
 [1 1 1]]

Selected Features:
 Index(['Feature_B', 'Feature_C', 'Feature_D'], dtype='object')

Transformed Test Data:
 [[0 4 0]
 [1 3 1]
 [0 5 0]]


# Method 4: Embedded Approach (using Random Forest Classifiers)
Random forests use an embedded approach: they rank the importance of variables in a regression or classification problem as a natural by-product of training.

1. The first step in measuring the feature importance in a data set is to fit a random forest to the data.
2. During the fitting process the out-of-bag error for each data point is recorded and averaged over the forest.
3. To measure the importance of the j-th feature after training, the values of the j-th feature are permuted among the training data and the out-of-bag error is again computed on this perturbed data set.
4. The importance score for the j-th feature is computed by averaging the difference in out-of-bag error before and after the permutation over all trees.
5. The score is normalized by the standard deviation of these differences.



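Random forest classifier based feature selection can be used in two ways:
1. By training a random forest classifier and reading its feature importances. Note that scikit-learn's `feature_importances_` attribute is impurity-based (mean decrease in impurity), not the out-of-bag permutation score described above; the permutation variant is available as `sklearn.inspection.permutation_importance`.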

In [None]:
# Import scikit-learn's bundled datasets package
from sklearn import datasets

# Load the iris plants dataset (classification)
iris = datasets.load_iris()

In [None]:
# Import the random forest classifier from the ensemble module
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Build a DataFrame of the iris data: the four feature columns plus the
# integer class label as 'species'
data = pd.DataFrame({'sepallength': iris.data[:, 0], 'sepalwidth': iris.data[:, 1],
                     'petallength': iris.data[:, 2], 'petalwidth': iris.data[:, 3],
                     'species': iris.target})

In [None]:
# Divide the iris data into a training set and a test set
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)

# 70 % training data and 30 % test data. random_state pins the shuffle so the
# split, and everything fitted on it, is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
# Create a random forest of 100 trees. random_state fixes the bootstrap samples
# and feature sub-sampling so the fitted forest, and therefore its feature
# importances, are reproducible across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
clf.fit(X_train, y_train)


In [None]:
# Per-feature importance scores of the fitted forest, in the column order of X_train.
clf.feature_importances_

2. Using the `SelectFromModel` class from sklearn's `feature_selection` module

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Wrap a random forest in SelectFromModel, which selects features from the
# importances of the fitted estimator. random_state makes the forest, and
# therefore the selection, reproducible across runs.
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Fit the model
selector.fit(X, y)

# Boolean mask over the input columns: True where the feature is kept
selected_features_mask = selector.get_support()
print("Selected Features mask:", selected_features_mask)

# Map the mask back to feature names directly, without building a DataFrame
selected_features = pd.Index(iris.feature_names)[selected_features_mask]
print("Selected Features:", selected_features.tolist())


Selected Features mask: [False False  True  True]
Selected Features: ['petal length (cm)', 'petal width (cm)']
