<a href="https://colab.research.google.com/github/cagBRT/Data/blob/main/Imbalanced_Dataset_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the entire repo into ./cloned-repo and make that directory the working
# directory for the rest of the session.
# NOTE(review): re-running this cell in the same session will presumably fail the
# clone (the target directory already exists) and then run `%cd` relative to the
# already-changed directory — confirm, and consider guarding it.
# NOTE(review): `-s` is git's --shared option, which is documented for clones of
# repositories on the local machine; it likely has no effect for this HTTPS URL — confirm.
!git clone -s https://github.com/cagBRT/Data.git cloned-repo
%cd cloned-repo

In [None]:
import pandas as pd
from pandas import read_csv
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from matplotlib.ticker import FormatStrFormatter
from sklearn import model_selection
from sklearn import metrics

In [None]:
from numpy import mean
from numpy import isnan
from numpy import asarray
from numpy import polyfit
from scipy.stats import pearsonr

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Absolute path of the survival dataset CSV inside the cloned repository.
# NOTE(review): this assumes the clone was made under /content (the Colab default
# working directory); the path will not resolve if the notebook runs elsewhere.
filename = '/content/cloned-repo/survivalData.csv'

**Loading the data**<br>

Relevant Information: The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.<br>

Attribute Information:<br>

Age of patient at time of operation (numerical)<br>

Patient's year of operation (year - 1900, numerical)<br>

Number of positive axillary nodes detected (numerical)<br>

Survival status (class attribute)<br>

>1 = the patient survived 5 years or longer<br>

>2 = the patient died within 5 years<br>

In [None]:
# The CSV is read with header=None, so its first row is treated as data and the
# column labels are supplied explicitly: three features followed by the class label.
columns = ['age', 'year', 'nodes', 'class']
dataframe = pd.read_csv(filename, header=None, names=columns)

In [None]:
# Display the DataFrame's column labels as a quick check of the load.
dataframe.columns

In [None]:
#dataframe.plot()

In [None]:
# How often each distinct value occurs in the 'age' column.
dataframe['age'].value_counts()

In [None]:
# How often each distinct value occurs in the 'year' column.
dataframe['year'].value_counts()

In [None]:
# How often each distinct value occurs in the 'nodes' column.
dataframe['nodes'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column, printed as plain text.
report = dataframe.describe()
print(report)

In [None]:
# One histogram panel per numeric column, to eyeball each distribution.
dataframe.hist()
plt.show()

**Count the labels for each class**

In [None]:
# Tally the rows carrying each class label and report every label's count and
# share of the dataset, which exposes how imbalanced the classes are.
target = dataframe['class'].values
counter = Counter(target)
n_rows = len(target)
for label, count in counter.items():
    per = count / n_rows * 100
    print('Class=%d, Count=%d, Percentage=%.3f%%' % (label, count, per))

In [None]:
# Split the raw values into features and target by column position.
array = dataframe.values
X = array[:,:3]  # first three columns: age, year, nodes
Y = array[:,3]   # fourth column: class label
# A hold-out split is left disabled below; the full X/Y arrays are used instead.
#validation_size = 0.30
#seed = 10
#X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y,
#test_size=validation_size, random_state=seed)

In [None]:
# Test options and evaluation metric
# NOTE(review): none of these names is read by the cross-validation cells visible
# in this part of the notebook — those set their own fold count (10),
# random_state (1) and 'accuracy' scoring. Confirm whether they are used further
# down or are leftovers.
num_folds = 20
#num_instances = len(X_train)
seed = 10
scoring = 'accuracy'

In [None]:
#create a list of models to evaluate
def get_models():
    """Build the list of classifiers to compare.

    Every estimator is created with its default constructor arguments.

    Returns:
        list: a new list of new, unfitted estimator instances, always in the
        same order.
    """
    return [
        # linear models
        LogisticRegression(),
        RidgeClassifier(),
        SGDClassifier(),
        PassiveAggressiveClassifier(),
        # neighbours and single trees
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier(),
        # support vector machines
        LinearSVC(),
        SVC(),
        # naive Bayes
        GaussianNB(),
        # ensembles and Gaussian process
        AdaBoostClassifier(),
        BaggingClassifier(),
        RandomForestClassifier(),
        ExtraTreesClassifier(),
        GaussianProcessClassifier(),
        GradientBoostingClassifier(),
        # discriminant analysis
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
    ]

In [None]:
# evaluate the model using a given test condition
def evaluate_model(cv, model, X, y, scoring='accuracy'):
    """Return the mean cross-validated score of ``model`` on ``(X, y)``.

    Args:
        cv: cross-validation strategy, forwarded to ``cross_val_score`` as ``cv``.
        model: estimator to evaluate.
        X: feature matrix.
        y: target labels.
        scoring: metric forwarded to ``cross_val_score``. Defaults to
            ``'accuracy'``, so existing four-argument calls behave as before.
            Being a parameter, a metric better suited to imbalanced classes
            can be passed without editing this function.

    Returns:
        The mean of the per-split scores. It is NaN when any split's score is
        NaN, which callers can detect with ``isnan``.
    """
    # n_jobs=-1 lets scikit-learn run the splits on all available CPU cores
    scores = cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    return mean(scores)

In [None]:
# define test conditions
# Two test conditions: leave-one-out as the reference estimate, and shuffled
# 10-fold cross-validation with a fixed random_state.
ideal_cv = LeaveOneOut()
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# candidate classifiers to compare
models = get_models()
# mean score of each retained model under each condition, in matching order
ideal_results = []
cv_results = []
for model in models:
    # k-fold estimate first, then the leave-one-out estimate
    cv_mean = evaluate_model(cv, model,X,Y)
    ideal_mean = evaluate_model(ideal_cv, model,X,Y)
    # keep and report the model only when both means are valid numbers;
    # a model with a NaN mean is dropped from both result lists
    if not (isnan(cv_mean) or isnan(ideal_mean)):
        cv_results.append(cv_mean)
        ideal_results.append(ideal_mean)
        print('>%s: ideal=%.3f, cv=%.3f' % (type(model).__name__, ideal_mean, cv_mean))