In [4]:
import pandas as pd
import numpy as  np


In [99]:
data = pd.read_csv("DataPreprocessingGraded_dataset.csv")

# Replacing ? in the data with NaN
data.replace('?', np.nan, inplace=True)

In [102]:
y = data[['Target']]

In [None]:
y

In [104]:
data= data.drop(columns=['Target'])

In [76]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [77]:
imputer = SimpleImputer(strategy="mean")
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', imputer, [0, 1])  # Columns 0 and 1
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Fit and transform the data
transformed_data = preprocessor.fit_transform(data)

In [78]:
data = pd.DataFrame(transformed_data, columns = [['V1', 'V2', 'V3', 'V4', 'V5']])

In [79]:
data

Unnamed: 0,V1,V2,V3,V4,V5
0,2.0,50.0,12500.0,98.0,NEGATIVE
1,0.0,13.0,3250.0,28.0,NEGATIVE
2,9.562584,5.464334,4000.0,35.0,NEGATIVE
3,9.562584,20.0,5000.0,45.0,NEGATIVE
4,1.0,24.0,6000.0,77.0,NEGATIVE
...,...,...,...,...,...
743,23.0,2.0,500.0,38.0,NEGATIVE
744,21.0,2.0,500.0,52.0,NEGATIVE
745,23.0,3.0,750.0,62.0,NEGATIVE
746,39.0,1.0,250.0,39.0,NEGATIVE


In [80]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()



In [81]:
# Apply StandardScaler to columns 0, 1, 2, and 3
preprocessor_1 = ColumnTransformer(
    transformers=[
        ('scaler', scaler, [0, 1, 2, 3])  # Columns 0, 1, 2, 3
    ],
    remainder='passthrough'  # Keep other columns unchanged if any exist
)

# Fit and transform the data
scaled_data = preprocessor_1.fit_transform(data)

# Convert the result to a DataFrame (optional, for better readability)
data = pd.DataFrame(scaled_data, columns=['V1', 'V2', 'V3', 'V4', 'V5'])

In [82]:
data

Unnamed: 0,V1,V2,V3,V4,V5
0,-0.938169,7.709867,7.623346,2.615633,NEGATIVE
1,-1.186278,1.304549,1.282738,-0.257881,NEGATIVE
2,0.0,-0.0,1.796842,0.029471,NEGATIVE
3,0.0,2.516366,2.482313,0.439973,NEGATIVE
4,-1.062223,3.208833,3.167784,1.753579,NEGATIVE
...,...,...,...,...,...
743,1.666966,-0.599734,-0.602307,0.152621,NEGATIVE
744,1.418858,-0.599734,-0.602307,0.727324,NEGATIVE
745,1.666966,-0.426617,-0.43094,1.137826,NEGATIVE
746,3.651831,-0.772851,-0.773675,0.193671,NEGATIVE


In [83]:
from sklearn.preprocessing import OrdinalEncoder
# Define the OrdinalEncoder for column 4
ordinal_encoder = OrdinalEncoder()

# Apply OrdinalEncoder to column 4
preprocessor_2 = ColumnTransformer(
    transformers=[
        ('ordinal_encoder', ordinal_encoder, [4])  # Column 4
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Fit and transform the data
encoded_data = preprocessor_2.fit_transform(data)

In [85]:
data = pd.DataFrame(encoded_data, columns=['V1', 'V2', 'V3', 'V4', 'V5'])


In [86]:
encoded_data

array([[0.0, -0.9381693902487627, 7.709866528281595, 7.62334626135984,
        2.6156334449690104],
       [0.0, -1.186277543640765, 1.3045494920474057, 1.2827382634817053,
        -0.257880899643511],
       [0.0, 0.0, -1.53758496295156e-16, 1.7968416146610138,
        0.02947053481774113],
       ...,
       [0.0, 1.6669662203672622, -0.42661727450237547,
        -0.4309395737826555, 1.1378260677397136],
       [0.0, 3.6518314475032807, -0.7728506278123317,
        -0.7736751412355277, 0.1936713545098852],
       [0.0, 7.74561597847132, -0.7728506278123317, -0.7736751412355277,
        1.5483281169700738]], dtype=object)

In [87]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5
0,0.0,-0.938169,7.709867,7.623346,2.615633
1,0.0,-1.186278,1.304549,1.282738,-0.257881
2,0.0,0.0,-0.0,1.796842,0.029471
3,0.0,0.0,2.516366,2.482313,0.439973
4,0.0,-1.062223,3.208833,3.167784,1.753579


In [54]:
# Feature Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [105]:
# Imputer for missing values (for numerical columns)
imputer = SimpleImputer(strategy='mean')

# StandardScaler for numerical columns (0, 1, 2, 3)
scaler = StandardScaler()

# OrdinalEncoder for categorical column (4)
ordinal_encoder = OrdinalEncoder()

# Pipeline for numerical features (columns 0, 1, 2, 3)
numerical_pipeline = Pipeline(steps=[
    ('imputer', imputer),  # First impute missing values
    ('scaler', scaler)     # Then scale the numerical features
])

# Pipeline for categorical features (column 4)
categorical_pipeline = Pipeline(steps=[
    ('ordinal_encoder', ordinal_encoder)  # Ordinal encoding for the categorical feature
])

# Use FeatureUnion to combine numerical and categorical pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, [0, 1, 2, 3]),  # Apply numerical pipeline to columns 0, 1, 2, 3
    ('cat', categorical_pipeline, [4])          # Apply categorical pipeline to column 4
])

# Apply the preprocessor to the data
transformed_data = preprocessor.fit_transform(data)

# Convert the result to a DataFrame (optional, for better readability)

In [106]:
data = pd.DataFrame(transformed_data, columns=['V1', 'V2', 'V3', 'V4', 'V5'])

In [107]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5
0,-0.938169,7.709867,7.623346,2.615633,0.0
1,-1.186278,1.304549,1.282738,-0.257881,0.0
2,0.0,-1.537585e-16,1.796842,0.029471,0.0
3,0.0,2.516366,2.482313,0.439973,0.0
4,-1.062223,3.208833,3.167784,1.753579,0.0


In [108]:
from sklearn.feature_selection import VarianceThreshold

In [110]:
# Apply VarianceThreshold with threshold=0.1
selector = VarianceThreshold(threshold=0.1)

# Fit and transform the data
reduced_data = selector.fit_transform(data)

# Get the features that were kept
features_kept = selector.get_support(indices=True)

# Convert the result to a DataFrame (optional, for better readability)
reduced_df = pd.DataFrame(reduced_data, columns=[f"V{idx+1}" for idx in features_kept])

In [111]:
reduced_df.head()

Unnamed: 0,V1,V2,V3,V4
0,-0.938169,7.709867,7.623346,2.615633
1,-1.186278,1.304549,1.282738,-0.257881
2,0.0,-1.537585e-16,1.796842,0.029471
3,0.0,2.516366,2.482313,0.439973
4,-1.062223,3.208833,3.167784,1.753579


In [117]:
X= data

In [114]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [115]:
encoder = OrdinalEncoder()
y_encoded = encoder.fit_transform(y)

In [118]:
# Initialize the LogisticRegression model
logreg = LogisticRegression()

# Use RFE to select the 2 most important features
rfe = RFE(estimator=logreg, n_features_to_select=2)
rfe.fit(X, y_encoded.ravel())  # .ravel() converts y to a 1D array

# Get the selected feature rankings (1 means selected)
selected_features = rfe.support_

# Get feature ranking (1 means selected)
ranking = rfe.ranking_

# Show results
print(f"Selected Features: {X.columns[selected_features]}")
print(f"Feature Ranking: {ranking}")

Selected Features: Index(['V1', 'V3'], dtype='object')
Feature Ranking: [1 3 1 2 4]


In [120]:
from sklearn.feature_selection import SequentialFeatureSelector

In [124]:
# Initialize the LogisticRegression model
logreg = LogisticRegression()

# Sequential Feature Selection (SFS) with LogisticRegression as the estimator
sfs = SequentialFeatureSelector(logreg,
                                n_features_to_select=2,   # Number of features to select
                                direction='backward',     # Forward selection
                                scoring='accuracy',      # Scoring metric (accuracy)
                                cv=2)                    # No cross-validation for simplicity

# Perform SFS and fit the model
sfs.fit(X, y_encoded.ravel())  # .ravel() converts y to a 1D array

# Get the indices of the selected features
selected_features = sfs.get_support(indices=True)

# Print the indices of the two most important features
print(f"Indices of the two most important features: {selected_features}")

Indices of the two most important features: [2 3]


In [125]:
# Initialize the LogisticRegression model
logreg = LogisticRegression()

# Sequential Feature Selection (SFS) with LogisticRegression as the estimator
sfs = SequentialFeatureSelector(logreg,
                                n_features_to_select=2,   # Number of features to select
                                direction='forward',     # Forward selection
                                scoring='accuracy',      # Scoring metric (accuracy)
                                cv=2)                    # No cross-validation for simplicity

# Perform SFS and fit the model
sfs.fit(X, y_encoded.ravel())  # .ravel() converts y to a 1D array

# Get the indices of the selected features
selected_features = sfs.get_support(indices=True)

# Print the indices of the two most important features
print(f"Indices of the two most important features: {selected_features}")

Indices of the two most important features: [1 2]
