<a href="https://colab.research.google.com/github/boazgajja/-Realtime-Ecommerce-Data-Pipeline-Analaytics-Dashboard/blob/main/feature_extraction(diabaties).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSE 303/feature extraction /diabetes.csv')
y = df['Outcome']
df = df.drop('Outcome', axis=1)

In [7]:
features_train, features_test, target_train, target_test = train_test_split(df, y, test_size=0.3, random_state=42)

classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(features_train, target_train)

predictions = classifier.predict(features_test)
score = accuracy_score(target_test, predictions)

print(f'Accuracy Before: {score}')


Accuracy Before: 0.7012987012987013


In [8]:
# 1. Missing Values Ratio (Threshold 30%)

missing_ratio = df.eq(0).sum() / len(df) * 100
print('Missing ratio per feature:\n', missing_ratio)

missing_ratio
features = df.loc[:, missing_ratio <= 30]

features_train, features_test, target_train, target_test = train_test_split(features, y, test_size=0.3, random_state=42)

classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(features_train, target_train)

predictions = classifier.predict(features_test)
score = accuracy_score(target_test, predictions)

print(f'\nAccuracy After removing features with >30% missing values: {score}')


Missing ratio per feature:
 Pregnancies                 14.453125
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
dtype: float64

Accuracy After removing features with >30% missing values: 0.70995670995671


In [9]:
# 3. High Correlation Filter (Correlation > 0.8)

import numpy as np

# Calculate the correlation matrix
corr_matrix = df.corr().abs()

# Identify pairs of highly correlated features (correlation > 0.8)
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
features_to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.8)]

print(f'Highly correlated features to drop: {features_to_drop}')

# Drop highly correlated features
df_reduced = df.drop(columns=features_to_drop)

# Split data and train model using the reduced dataset
features_train, features_test, target_train, target_test = train_test_split(df_reduced, y, test_size=0.2, random_state=42)

# Training the model
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(features_train, target_train)

# Predicting the model and the accuracy on the reduced feature set
predictions = classifier.predict(features_test)
score = accuracy_score(target_test, predictions)

print(f'Accuracy after removing highly correlated features: {score}')


Highly correlated features to drop: []
Accuracy after removing highly correlated features: 0.7467532467532467


In [10]:
# 5. Low Variance Filter

from sklearn.feature_selection import VarianceThreshold

# low variance filter
selector = VarianceThreshold(threshold=0.01)
features_reduced = selector.fit_transform(df)
target = y

# Train model
features_train, features_test, target_train, target_test = train_test_split(features_reduced, target, test_size=0.2, random_state=42)

classifier.fit(features_train, target_train)
predictions = classifier.predict(features_test)
score = accuracy_score(target_test, predictions)
print(f'Accuracy after applying low variance filter: {score}')


Accuracy after applying low variance filter: 0.7467532467532467


In [11]:
# 7. Forward Feature Selection

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Forward feature selection using logistic regression
model = LogisticRegression(max_iter=1000)
sfs = SequentialFeatureSelector(model, n_features_to_select="auto", direction='forward')

features = df
target = y

# Apply forward selection
sfs.fit(features, target)
features_selected = sfs.transform(features)

# Train model on selected features
features_train, features_test, target_train, target_test = train_test_split(features_selected, target, test_size=0.2, random_state=42)
model.fit(features_train, target_train)
predictions = model.predict(features_test)
score = accuracy_score(target_test, predictions)
print(f'Accuracy with forward feature selection: {score}')


Accuracy with forward feature selection: 0.7532467532467533


In [12]:
# 9. Backward Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Recursive feature elimination using decision tree classifier
model = DecisionTreeClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)  # Select top 5 features

features = df
target = y

# Fit RFE
rfe.fit(features, target)
features_reduced = rfe.transform(features)

# Train model on selected features
features_train, features_test, target_train, target_test = train_test_split(features_reduced, target, test_size=0.2, random_state=42)
model.fit(features_train, target_train)
predictions = model.predict(features_test)
score = accuracy_score(target_test, predictions)
print(f'Accuracy after backward feature elimination: {score}')


Accuracy after backward feature elimination: 0.7077922077922078


In [13]:
# 11. Random Forest Feature Importance

from sklearn.ensemble import RandomForestClassifier

# Train random forest classifier
model = RandomForestClassifier(random_state=42)
features = df
target = y

model.fit(features, target)

# Feature importance
importances = model.feature_importances_
indices = np.argsort(importances)[-5:]  # Keep top 5 features

# Select top 5 important features
features_reduced = features.iloc[:, indices]

# Train model on top 5 features
features_train, features_test, target_train, target_test = train_test_split(features_reduced, target, test_size=0.2, random_state=42)
model.fit(features_train, target_train)
predictions = model.predict(features_test)
score = accuracy_score(target_test, predictions)
print(f'Accuracy after keeping top 5 important features: {score}')


Accuracy after keeping top 5 important features: 0.7532467532467533
