<a href="https://colab.research.google.com/github/bdi2357/StatisticalRebalancing/blob/main/Rebalancing_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!ls

sample_data


In [2]:
!mv archive\ \(13\).zip archive.zip

In [3]:
%%time
import os
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler


# Load the dataset into a pandas DataFrame
df = pd.read_csv('creditcard.csv')
print(df.head())

# Check for NaN values in the target variable
print(df['Class'].isna().sum())

# Drop rows where target variable is NaN
df = df.dropna(subset=['Class'])

# Prepare the dataset for comparison
X = df.drop('Class', axis=1)
y = df['Class']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [7]:
import os
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler


# Load the dataset into a pandas DataFrame
df = pd.read_csv('diabetes.csv')
print(df.shape)
print(df.head())

# Check for NaN values in the target variable
print(df['Outcome'].isna().sum())

# Drop rows where target variable is NaN
df = df.dropna(subset=['Outcome'])

# Prepare the dataset for comparison
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


(768, 9)
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
0
             Dataset          Classifier  f1_score  precision_score  \
7  RandomOverSampler  LogisticRegression  0.666667         0.716049   
5             ADASYN  LogisticRegression  0.637363         0.716049   
3              SMOTE  LogisticRegression  0.636364         0.691358   
2       

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from urllib.request import urlopen
from zipfile import ZipFile
import os

# Define the URL for the Breast Cancer Wisconsin (Original) Dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Download and load the dataset into a pandas DataFrame
column_names = ['ID', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses', 'Class']
df = pd.read_csv(url, names=column_names)

# Replace missing values denoted by '?' with NaN and then handle them
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)
print(df.shape)
df['Bare Nuclei'] = df['Bare Nuclei'].astype(int)

# Drop the 'ID' column as it is not needed for the analysis
df.drop('ID', axis=1, inplace=True)

# Prepare the dataset for comparison
X = df.drop('Class', axis=1)
y = df['Class']

# Convert the class labels to binary (2 -> 1 for benign, 4 -> 0 for malignant)
y = y.map({2: 1, 4: 0})

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame using pd.concat
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


(683, 11)
             Dataset          Classifier  f1_score  precision_score  \
6  RandomOverSampler        RandomForest  1.000000         0.962406   
4             ADASYN        RandomForest  1.000000         0.962406   
0           Original        RandomForest  0.992248         0.962406   
2              SMOTE        RandomForest  1.000000         0.954887   
3              SMOTE  LogisticRegression  0.984615         0.962406   
7  RandomOverSampler  LogisticRegression  0.984615         0.962406   
1           Original  LogisticRegression  0.969925         0.969925   
5             ADASYN  LogisticRegression  0.984496         0.954887   

   recall_score  roc_auc_score  Overall Score  
6      0.980843       0.992690       0.983985  
4      0.980843       0.990393       0.983410  
0      0.977099       0.992899       0.981163  
2      0.976923       0.992011       0.980955  
3      0.973384       0.994256       0.978665  
7      0.973384       0.993839       0.978561  
1      0.96992

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

# Define the URL for the Census Income Dataset
url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Define the column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']

# Load the datasets into pandas DataFrames
df_train = pd.read_csv(url_train, names=column_names, sep=',\s', na_values='?', engine='python')
df_test = pd.read_csv(url_test, names=column_names, sep=',\s', na_values='?', skiprows=1, engine='python')

# Concatenate train and test datasets
df = pd.concat([df_train, df_test], ignore_index=True)
print(df.shape)
# Handle missing values by dropping rows with NaNs
df.dropna(inplace=True)

# Convert categorical features to numerical using one-hot encoding
df = pd.get_dummies(df, drop_first=True)

# Prepare the dataset for comparison
X = df.drop('income_>50K', axis=1)
y = df['income_>50K']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame using pd.concat
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


(48842, 15)
             Dataset          Classifier  f1_score  precision_score  \
6  RandomOverSampler        RandomForest  0.684783         0.699378   
2              SMOTE        RandomForest  0.699198         0.658526   
0           Original        RandomForest  0.748359         0.607460   
4             ADASYN        RandomForest  0.694234         0.652309   
5             ADASYN  LogisticRegression  0.211982         0.714920   
3              SMOTE  LogisticRegression  0.234683         0.602131   
7  RandomOverSampler  LogisticRegression  0.403438         0.312611   
1           Original  LogisticRegression  0.574074         0.027531   

   recall_score  roc_auc_score  Overall Score  
6      0.692004       0.936194       0.753090  
2      0.678253       0.933637       0.742404  
0      0.670588       0.936322       0.740682  
4      0.672619       0.930858       0.737505  
5      0.327003       0.654950       0.477214  
3      0.337733       0.652642       0.456798  
7      0.352

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
import requests
from io import StringIO

# Define the URLs for the Wine Quality Dataset
url_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Download the datasets
response_red = requests.get(url_red)
response_white = requests.get(url_white)

# Load the datasets into pandas DataFrames
df_red = pd.read_csv(StringIO(response_red.text), sep=';')
df_white = pd.read_csv(StringIO(response_white.text), sep=';')

# Add a column to distinguish between red and white wine
df_red['wine_type'] = 'red'
df_white['wine_type'] = 'white'

# Concatenate the datasets
df = pd.concat([df_red, df_white], ignore_index=True)
print(df.shape)
# Check for NaN values and handle them if any
print(df.isna().sum())

# Assuming no NaN values for this dataset, if there were any, you would handle them like this:
# df.dropna(inplace=True) or df.fillna(method='ffill', inplace=True)

# Convert the wine quality into a binary classification problem
# For simplicity, let's classify quality > 6 as "good" (1) and <= 6 as "not good" (0)
df['quality'] = np.where(df['quality'] > 6, 1, 0)

# Convert categorical feature 'wine_type' using one-hot encoding
df = pd.get_dummies(df, columns=['wine_type'], drop_first=True)

# Prepare the dataset for comparison
X = df.drop('quality', axis=1)
y = df['quality']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame using pd.concat
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


(6497, 13)
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


             Dataset          Classifier  f1_score  precision_score  \
6  RandomOverSampler        RandomForest  0.751553         0.631854   
2              SMOTE        RandomForest  0.648325         0.707572   
4             ADASYN        RandomForest  0.639344         0.712794   
0           Original        RandomForest  0.793358         0.561358   
5             ADASYN  LogisticRegression  0.405063         0.751958   
7  RandomOverSampler  LogisticRegression  0.391892         0.757180   
3              SMOTE  LogisticRegression  0.395028         0.746736   
1           Original  LogisticRegression  0.614865         0.237598   

   recall_score  roc_auc_score  Overall Score  
6      0.686525       0.915868       0.746450  
2      0.676654       0.911709       0.736065  
4      0.674074       0.911425       0.734409  
0      0.657492       0.916557       0.732191  
5      0.526508       0.806064       0.622398  
7      0.516474       0.804049       0.617399  
3      0.516712       0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
%%time
# Install necessary libraries
!pip install pandas scikit-learn imbalanced-learn

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

# Load the dataset into a pandas DataFrame
df = pd.read_csv('train.csv')
print(df.shape)
print(df.head())

# Check for NaN values in the target variable
print(df['target'].isna().sum())

# Drop rows where target variable is NaN
df = df.dropna(subset=['target'])

# Prepare the dataset for comparison
X = df.drop('target', axis=1)
y = df['target']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test)
        probabilities = clf.predict_proba(X_test)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


(595212, 59)
   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  ...  \
0              0              0              1              0  ...   
1              0              0              0              1  ...   
2              0              0              0              1  ...   
3              0              1              0              0  ...   
4              0              1              0              0  ...   

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9           1           5           8               0   
1           3

In [11]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('train.csv')

# Display the first few rows
print(df.head())

# Check for NaN values in the target variable
print(df['target'].isna().sum())

# Drop rows where target variable is NaN
df = df.dropna(subset=['target'])

# Prepare the dataset for comparison
X = df.drop(['id', 'target'], axis=1)
y = df['target']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the resampling techniques
resampling_techniques = {
    'Original': (X_train, y_train),
    'SMOTE': SMOTE(random_state=42).fit_resample(X_train, y_train),
    'ADASYN': ADASYN(random_state=42).fit_resample(X_train, y_train),
    'RandomOverSampler': RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
}

# Define classifiers to test
classifiers = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# Define metrics for evaluation
metrics = [f1_score, precision_score, recall_score, roc_auc_score]

# Store results
results = pd.DataFrame(columns=['Dataset', 'Classifier'] + [metric.__name__ for metric in metrics])

# Evaluate each resampled dataset with each classifier
for resampling_name, (X_res, y_res) in resampling_techniques.items():
    # Standardize features
    scaler = StandardScaler()
    X_res = scaler.fit_transform(X_res)
    X_test_scaled = scaler.transform(X_test)

    for clf_name, clf in classifiers.items():
        clf.fit(X_res, y_res)
        predictions = clf.predict(X_test_scaled)
        probabilities = clf.predict_proba(X_test_scaled)[:, 1]  # For ROC AUC

        # Evaluate with each metric
        precision = precision_score(y_test, predictions, zero_division=1)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, zero_division=1)
        roc_auc = roc_auc_score(y_test, probabilities)

        # Append to results DataFrame
        row = pd.DataFrame([[resampling_name, clf_name, precision, recall, f1, roc_auc]], columns=results.columns)
        results = pd.concat([results, row], ignore_index=True)

# Calculate an overall score for ranking (optional)
results['Overall Score'] = results[[metric.__name__ for metric in metrics]].mean(axis=1)

# Sort results for ranking
ranked_results = results.sort_values(by='Overall Score', ascending=False)

# Display the ranked table
print(ranked_results)


   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  ...  \
0              0              0              1              0  ...   
1              0              0              0              1  ...   
2              0              0              0              1  ...   
3              0              1              0              0  ...   
4              0              1              0              0  ...   

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9           1           5           8               0   
1           3           1 