## Feature Selection - Univariate Selection

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd

# Univariate feature selection: score every feature against the target with
# the chi-squared statistic and keep the K_BEST highest-scoring features.
K_BEST = 100  # number of top-scoring features to keep

df = pd.read_csv("6.Merge_all_features_5477_with_class.csv")

# Every column except the last is used as a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# NOTE(review): sklearn's chi2 rejects negative feature values -- confirm that
# every feature column in this CSV is non-negative (counts / frequencies).
bestfeatures = SelectKBest(score_func=chi2, k=K_BEST)
fit = bestfeatures.fit(X, y)

# One row per feature: its column name next to its chi-squared score.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # Naming the dataframe columns

# Compute the top-K table once, keep it for later use, and print it.
df_best_100_univariate_selection = featureScores.nlargest(K_BEST, 'Score')

print(df_best_100_univariate_selection)  # Print the K_BEST best features

## Feature Importance Ranking

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import pandas as pd

# Rank features by the importances of an extra-trees ensemble and keep the
# 100 most important ones.

# The ensemble is randomized; a fixed random_state makes the ranking
# reproducible across kernel restarts.
model = ExtraTreesClassifier(random_state=42)

# NOTE(review): this cell reads the "0."-prefixed CSV while the other cells in
# this notebook read the "6."-prefixed one -- confirm which file is intended.
df = pd.read_csv("0.Merge_all_features_5477_with_class.csv")

# Every column except the last is used as a feature; the last column is the target.
# X and y are derived here, before any use, so the cell does not depend on
# values left over from an earlier cell.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

model.fit(X, y)

# Get feature importances, labelled with the feature column names
feat_importances = pd.Series(model.feature_importances_, index=X.columns)

# Sort the features based on importance (most important first)
sorted_feat_importances = feat_importances.sort_values(ascending=False)

# Select top 100 features
top_100_features = sorted_feat_importances.head(100)

# Select these features from the original dataset;
# X_reduced contains the dataset restricted to the top 100 features
X_reduced = X[top_100_features.index]

# Plot the top 20 features for visualization
top_100_features.head(20).plot(kind='barh')
plt.show()

# Importance table for the top 100 features (feature name as index)
df_best_100_ranked_features = top_100_features.to_frame()

df_best_100_ranked_features

## Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Recursive feature elimination: repeatedly fit a logistic regression and drop
# the weakest features until 100 remain.
df = pd.read_csv("6.Merge_all_features_5477_with_class.csv")

# Derive X and y from the frame loaded above so this cell is fit on the file
# it reads, not on whatever X / y an earlier cell left in the kernel.
# Every column except the last is used as a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Initialize the model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed

# Initialize RFE
rfe = RFE(model, n_features_to_select=100)  # Selecting top 100 features

# Fit RFE
fit = rfe.fit(X, y)

# Get the ranking of features (selected features are assigned rank 1)
feature_ranking = fit.ranking_

# Get the boolean mask of selected features
selected_features_mask = fit.support_

# Extract the selected feature names
selected_feature_names = X.columns[selected_features_mask]

# Create a DataFrame restricted to the selected features
df_selected_features = X[selected_feature_names]