# Classification Problem End-to-End using Scikit-Learn

In [1]:
## Step 1: Import Necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
## Step 2: Create a Synthetic Dataset

# Generate a reproducible (random_state=42) two-class problem: 1000 samples and
# 20 features, of which 10 are informative and 5 are redundant.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, n_classes=2, random_state=42)

# Convert to DataFrame. The column names are derived from the array's actual
# width so they cannot drift out of sync with n_features above.
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y

In [3]:
## Step 3: Data Cleaning

# Tally the missing entries in every column and show the per-column counts
missing_per_column = df.isnull().sum()
print(missing_per_column)

# No missing values in the synthetic dataset

feature_0     0

feature_1     0

feature_2     0

feature_3     0

feature_4     0

feature_5     0

feature_6     0

feature_7     0

feature_8     0

feature_9     0

feature_10    0

feature_11    0

feature_12    0

feature_13    0

feature_14    0

feature_15    0

feature_16    0

feature_17    0

feature_18    0

feature_19    0

target        0

dtype: int64


In [4]:
## Step 4: Exploratory Data Analysis (EDA)

# Per-column summary statistics (count, mean, std, min, quartiles, max)
summary_stats = df.describe()
print(summary_stats)

         feature_0   feature_1   feature_2   feature_3   feature_4  feature_5  \

count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000     1000.0   

mean      0.007694    -0.028220    -0.025372    -0.022693     -0.031288        0.0   

std        0.977071     0.973869     1.018242     0.988278      1.003315        1.0   

min       -3.506630    -3.238711    -3.471189    -3.352754     -3.542096       -1.0   

25%       -0.606487    -0.686074    -0.705453    -0.701672     -0.705882       -1.0   

50%        0.004755    -0.013539    -0.041241    -0.010865     -0.027982        0.0   

75%        0.671467     0.615209     0.648697     0.634989      0.643979        1.0   

max        3.210818     3.102296     3.842128     3.291114      3.795230        1.0   



        feature_6   feature_7   feature_8   feature_9  ...  feature_12  \

count  1000.000000  1000.000000  1000.000000  1000.000000  ...  1000.000000   

mean      0.000292     0.026907     0.005692     0.040354  .

In [5]:
# Pairwise plot grid for the first four features, coloured by the class label
pairplot_columns = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'target']
sns.pairplot(df[pairplot_columns], hue='target')
plt.show()

In [6]:
# Correlation matrix
# df holds 20 feature columns plus 'target', so this is a 21x21 annotated grid:
# use a larger canvas, two-decimal labels and a small font so the numbers stay
# legible instead of overlapping.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 7},
    cmap='coolwarm',
    # Pin the colour scale to the full correlation range so the diverging
    # colormap is symmetric and zero correlation maps to its neutral midpoint.
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation matrix (features and target)')
plt.show()

In [7]:
## Step 5: Data Preprocessing

# Separate the label column from the predictor columns
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: learn the scaling statistics from the training
# split only, then apply that same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
## Step 6: Model Building

# Baseline random forest with default hyperparameters and a fixed seed
rfc = RandomForestClassifier(random_state=42)

# Train on the scaled training split (kept as the cell's final expression so
# the fitted estimator is still displayed)
rfc.fit(X=X_train_scaled, y=y_train)

In [9]:
## Step 7: Model Prediction

# Predict class labels for the scaled held-out test features
y_pred = rfc.predict(X=X_test_scaled)

In [10]:
## Step 8: Model Evaluation

# Confusion matrix of the baseline model (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

[[86 10]

 [16 88]]


In [11]:
# Per-class precision / recall / F1 report for the baseline model
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(class_report)

              precision    recall  f1-score   support



           0       0.84      0.90      0.87        96

           1       0.90      0.85      0.87       104



    accuracy                           0.87       200

   macro avg       0.87      0.87      0.87       200

weighted avg       0.87      0.87      0.87       200


In [12]:
# Overall test-set accuracy of the baseline model
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.87


In [13]:
## Step 9: Hyperparameter Tuning

# Candidate values per hyperparameter: 3 * 4 * 3 * 3 = 108 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 3-fold cross-validation on the
# training split, using all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning combination and its mean cross-validated score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Best Score: 0.8987499999999999


In [15]:
## Step 10: Final Model Evaluation

# Take the best estimator found by the grid search and predict on the scaled
# test features
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X=X_test_scaled)

# Confusion matrix of the tuned model (rows: true class, columns: predicted class)
conf_matrix_best = confusion_matrix(y_true=y_test, y_pred=y_pred_best)
print(conf_matrix_best)

[[86 10]

 [15 89]]


In [17]:
# Per-class precision / recall / F1 report for the tuned model
class_report_best = classification_report(y_true=y_test, y_pred=y_pred_best)
print(class_report_best)

              precision    recall  f1-score   support



           0       0.85      0.90      0.87        96

           1       0.90      0.86      0.88       104



    accuracy                           0.87       200

   macro avg       0.87      0.87      0.87       200

weighted avg       0.87      0.87      0.87       200


In [18]:
# Overall test-set accuracy of the tuned model
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f'Accuracy: {accuracy_best}')

Accuracy: 0.875
