In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn
data = load_breast_cancer()

# Build a DataFrame from the feature matrix, using the dataset's feature names as column headers
df = pd.DataFrame(data.data, columns=data.feature_names)

# Append the class labels as a 'target' column
df['target'] = data.target

# Preview the first few rows
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so call it directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Per-column count of missing values. sep="\n" places the Series on its own line
# so its first row is not shifted right by print's default space separator.
print("Missing values:", df.isnull().sum(), sep="\n")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [5]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Split the frame into the label vector (y) and the feature matrix (x)
y = df['target']
x = df.drop('target', axis=1)

# Keep the feature names so they can be re-attached to the scaled data later
x_columns = x.columns

# Fit the scaler on the features and transform them in one step.
# NOTE(review): the scaler (and the RFE selector below) are fit on every row,
# before the train/test split done in the later evaluation cells, so rows that
# end up in the test split influence both scaling and feature selection.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Logistic regression is the estimator RFE uses to rank features
estimator = LogisticRegression(max_iter=100)

# Recursive Feature Elimination, keeping 10 features; fit() returns the fitted selector
selector = RFE(estimator=estimator, n_features_to_select=10).fit(x_scaled, y)

# Translate the boolean support mask back into column names
selected_features = x_columns[selector.get_support()]

# Show which features were kept
print("Selected features:", selected_features)


Selected features: Index(['mean concave points', 'radius error', 'area error',
       'compactness error', 'worst radius', 'worst texture', 'worst perimeter',
       'worst area', 'worst concavity', 'worst concave points'],
      dtype='object')


In [8]:
# Keep only the RFE-selected columns of the scaled data and label them with their feature names
x_selected = pd.DataFrame(selector.transform(x_scaled), columns=selected_features)

# Put the selected features and the target side by side (column-wise concat)
df_selected = pd.concat([x_selected, df['target']], axis=1)

# Preview the result. The label is printed on its own line: passing the frame as a
# second print() argument after "\n" inserts a separator space that shifts the
# first header row out of alignment with the rest of the table.
print("RFE DF Head:")
print(df_selected.head())


RFE DF Head:
    mean concave points  radius error  area error  compactness error  \
0             2.532475      2.489734    2.487578           1.316862   
1             0.548144      0.499255    0.742402          -0.692926   
2             2.037231      1.228676    1.181336           0.814974   
3             1.451707      0.326373   -0.288378           2.744280   
4             1.428493      1.270543    1.190357          -0.048520   

   worst radius  worst texture  worst perimeter  worst area  worst concavity  \
0      1.886690      -1.359293         2.303601    2.001237         2.109526   
1      1.805927      -0.369203         1.535126    1.890489        -0.146749   
2      1.511870      -0.023974         1.347475    1.456285         0.854974   
3     -0.281464       0.133984        -0.249939   -0.550021         1.989588   
4      1.298575      -1.466770         1.338539    1.220724         0.613179   

   worst concave points  target  
0              2.296076       0  
1         

In [10]:
# Baseline: logistic regression on all scaled features (before RFE)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.2, random_state=42
)

# Train on the training split and predict the held-out rows
model = LogisticRegression(max_iter=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print("Model performance before RFE")
print("Accuracy:", accuracy_score(y_test, y_pred))
# The report is a multi-line table; print its label on a separate line so the
# table's header row stays aligned with its columns.
print("Classification report:")
print(classification_report(y_test, y_pred))
      

Model performance before RFE
Accuary: 0.9736842105263158
Classification report:               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [11]:
# Same evaluation as the baseline, but using only the RFE-selected features.
# NOTE(review): x_selected comes from a selector that was fit on all rows, including
# those that land in this test split, so this score may be optimistic -- confirm
# whether selection should be fit on the training split only.
x_train, x_test, y_train, y_test = train_test_split(
    x_selected, y, test_size=0.2, random_state=42
)

# Train on the training split and predict the held-out rows
model = LogisticRegression(max_iter=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print("Model performance after RFE")
print("Accuracy:", accuracy_score(y_test, y_pred))
# The report is a multi-line table; print its label on a separate line so the
# table's header row stays aligned with its columns.
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model performace after RFE
Accuracy: 0.9736842105263158
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

