# Algorithmic Bias- Core Code

## Q1. 

In [None]:
# Numeric / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session. NOTE: this also hides
# any warnings emitted by the estimators fitted in later cells.
warnings.filterwarnings('ignore')
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the breast-cancer dataset object through scikit-learn's loader.
bcDB = datasets.load_breast_cancer()

In [None]:
# Put the feature matrix into a DataFrame, attach the class label as a
# 'target' column, then order the rows by class and renumber the index.
feature_columns = list(bcDB['feature_names'])
bcDF = pd.DataFrame(bcDB.data, columns=feature_columns)
bcDF['target'] = pd.Series(bcDB.target)
bcDF = bcDF.sort_values(by=['target']).reset_index(drop=True)
bcDF.head(5)

In [None]:
# Print how many rows belong to each class, next to the class name.
vc = bcDF['target'].value_counts()
for class_code, class_name in enumerate(bcDB.target_names):
    print(vc[class_code], class_name)

In [None]:
# Detach the label column (pop removes it from bcDF in place); whatever
# remains in the frame is the feature matrix.
target_column = bcDF.pop('target')
y = target_column.values
X = bcDF.values
X.shape, y.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix 
# Hold-out split with a fixed seed so the partition is repeatable.
holdout_parts = train_test_split(X, y, random_state=2)
X_train, X_test, y_train, y_test = holdout_parts
print(X_train.shape, X_test.shape)

### Method for creating Confusion Matrix and Printing False Positive(FP) Rate

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
%matplotlib inline

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Oranges):
    """
    Print and plot the confusion matrix, and print the false-positive rate.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    classes : sequence of str
        Tick labels, given in the same order as the matrix rows/columns.
    normalize : bool, default False
        When True, every row is divided by its row sum, so each cell is the
        fraction of that true class.
    title : str, optional
        Plot title; a default is chosen from ``normalize`` when omitted.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the heat map.

    Notes
    -----
    The FP rate treats row 0 as the negative class:
    ``cm[0, 1] / (cm[0, 0] + cm[0, 1])``. It is reported as NaN when the
    matrix has fewer than two classes or row 0 is empty.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    # Guard the indexing and the division: a single-class matrix has no
    # [0, 1] cell, and an empty negative row would otherwise divide 0 by 0.
    if cm.shape[0] > 1 and (cm[0, 0] + cm[0, 1]) > 0:
        fp_rate = cm[0, 1] / (cm[0, 1] + cm[0, 0])
    else:
        fp_rate = float('nan')
    print('FP Rate is:', fp_rate)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show every tick and label it with the matching class name.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate each cell with its value. Cells above half the maximum are
    # drawn dark by the colormap, so they get white text for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


# Show NumPy arrays (e.g. the printed confusion matrices) with 2 decimals.
np.set_printoptions(precision=2)


## Hold-Out Method

### In the hold-out strategy, we keep some of the training data back (the hold-out set) and use it to evaluate the model produced by the classifier.

### kNN

In [None]:
# Hold-out evaluation of a 3-nearest-neighbour classifier.
kNN = KNeighborsClassifier(n_neighbors=3)
kNN.fit(X_train, y_train)
y_pred = kNN.predict(X_test)
# With 0/1 labels, one minus the mean label is the share of class 0,
# which this notebook reports as "malignant".
print("Malignant in test set : {:.2f}".format(1 - y_test.sum() / len(y_test)))
print("Predicted malignant : {:.2f}".format(1 - y_pred.sum() / len(y_pred)))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# Row-normalised confusion matrix (the helper also prints the FP rate).
plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Hold-out evaluation of a decision tree that splits on entropy.
DT = DecisionTreeClassifier(criterion='entropy')
dt_tree = DT.fit(X_train, y_train)
y_pred = dt_tree.predict(X_test)

# Share of class 0 (reported as "malignant") among true and predicted labels.
actual_share = 1 - y_test.sum() / len(y_test)
predicted_share = 1 - y_pred.sum() / len(y_pred)
print("Malignant in test set : {:.2f}".format(actual_share))
print("Predicted malignant : {:.2f}".format(predicted_share))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# Row-normalised confusion matrix (the helper also prints the FP rate).
plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Hold-out evaluation of Gaussian Naive Bayes.
gnb = GaussianNB()
bc_NB = gnb.fit(X_train, y_train)
# Store the predictions in y_pred. Writing them to y_test would destroy the
# true labels and leave y_pred holding the previous model's output, so every
# metric below (and in later cells) would be computed on the wrong arrays.
y_pred = bc_NB.predict(X_test)
print("Malignant in test set : %0.2f" % (1 - (y_test.sum() / len(y_test))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Hold-out evaluation of logistic regression with default settings.
# NOTE(review): warnings are silenced at the top of this file, so a solver
# convergence warning on these unscaled features would go unseen - confirm.
logReg = LogisticRegression()
y_pred = logReg.fit(X_train, y_train).predict(X_test)
# (A bare `y_test.sum()/len(y_test)` expression used to sit here; it was not
# the last line of the cell, so its value was never shown or used.)
print("Malignant in test set : %0.2f" % (1 - (y_test.sum() / len(y_test))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')

## Cross Validation Method

### Cross-validation, or ‘k-fold cross-validation’, is when the dataset is randomly split up into ‘k’ groups. One of the groups is used as the test set and the rest are used as the training set. The model is trained on the training set and scored on the test set. The process is then repeated until each unique group has been used as the test set.

### kNN

In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict
import matplotlib.pyplot as plt

In [None]:
# 10-fold cross-validation of kNN. The scorer is F1, so the summary line is
# labelled F1 rather than accuracy.
kNN_scores = cross_val_score(kNN, X, y, cv=10, scoring='f1')
print("10x CV F1 kNNs: {0:.2f}".format(kNN_scores.mean()))
# Out-of-fold class predictions for every sample.
y_pred = cross_val_predict(kNN, X, y, cv=10)
print("Malignant in test set : %0.2f" % (1 - (y.sum() / len(y))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))

# Plot normalized confusion matrix
plot_confusion_matrix(y, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')
# A ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Use the out-of-fold probability
# of class 1 instead.
y_score = cross_val_predict(kNN, X, y, cv=10, method='predict_proba')[:, 1]
fpr_knn, tpr_knn, t_knn = roc_curve(y, y_score)
roc_auc_knn = auc(fpr_knn, tpr_knn)

### Decision Tree

In [None]:
# 10-fold cross-validation of the decision tree. The scorer is F1, so the
# summary line is labelled F1 rather than accuracy.
tree_scores = cross_val_score(DT, X, y, cv=10, scoring='f1')
print("10x CV F1 Trees: {0:.2f}".format(tree_scores.mean()))
# Out-of-fold class predictions for every sample.
y_pred = cross_val_predict(DT, X, y, cv=10)
print("Malignant in test set : %0.2f" % (1 - (y.sum() / len(y))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))

# Plot normalized confusion matrix
plot_confusion_matrix(y, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')
# A ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Use the out-of-fold probability
# of class 1 instead.
y_score = cross_val_predict(DT, X, y, cv=10, method='predict_proba')[:, 1]
fpr_dt, tpr_dt, t_dt = roc_curve(y, y_score)
roc_auc_dt = auc(fpr_dt, tpr_dt)

### Naive Bayes

In [None]:
# 10-fold cross-validation of Gaussian Naive Bayes. The scorer is F1, so the
# summary line is labelled F1 rather than accuracy.
gnb_scores = cross_val_score(gnb, X, y, cv=10, scoring='f1')
print("10x CV F1 Naive: {0:.2f}".format(gnb_scores.mean()))
# Out-of-fold class predictions for every sample.
y_pred = cross_val_predict(gnb, X, y, cv=10)

print("Malignant in test set : %0.2f" % (1 - (y.sum() / len(y))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))

# Plot normalized confusion matrix
plot_confusion_matrix(y, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')
# A ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Use the out-of-fold probability
# of class 1 instead.
y_score = cross_val_predict(gnb, X, y, cv=10, method='predict_proba')[:, 1]
fpr_gnb, tpr_gnb, t_gnb = roc_curve(y, y_score)
roc_auc_gnb = auc(fpr_gnb, tpr_gnb)

### Logistic Regression

In [None]:
# 10-fold cross-validation of logistic regression. The scorer is F1, so the
# summary line is labelled F1 rather than accuracy.
logReg_scores = cross_val_score(logReg, X, y, cv=10, scoring='f1')
print("10x CV F1 Logistic Regression: {0:.2f}".format(logReg_scores.mean()))
# Out-of-fold class predictions for every sample.
y_pred = cross_val_predict(logReg, X, y, cv=10)

print("Malignant in test set : %0.2f" % (1 - (y.sum() / len(y))))
print("Predicted malignant : %0.2f" % (1 - (y_pred.sum() / len(y_pred))))

# Plot normalized confusion matrix
plot_confusion_matrix(y, y_pred, classes=['Malignant', 'Benign'], normalize=True,
                      title='Normalized confusion matrix')
# A ROC curve needs a continuous score per sample; hard 0/1 predictions
# collapse it to a single operating point. Use the out-of-fold probability
# of class 1 instead.
y_score = cross_val_predict(logReg, X, y, cv=10, method='predict_proba')[:, 1]
fpr_lr, tpr_lr, t_lr = roc_curve(y, y_score)
roc_auc_lr = auc(fpr_lr, tpr_lr)

# FP & TP Rate Calculation

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
# Scorer helpers: each returns one cell of the binary confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 so the indexing below cannot fail
# when a fold happens to contain (or predict) only one class.
def tp(y_true, y_pred):
    """Count samples of class 1 predicted as class 1."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]


def tn(y_true, y_pred):
    """Count samples of class 0 predicted as class 0."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]


def fp(y_true, y_pred):
    """Count samples of class 0 predicted as class 1."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]


def fn(y_true, y_pred):
    """Count samples of class 1 predicted as class 0."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]


scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn),
           'fp': make_scorer(fp), 'fn': make_scorer(fn)}

models = [kNN, DT, gnb, logReg]

folds = 4
v = 0  # use 1 or 0

for m in models:
    cv_results = cross_validate(m, X, y, cv=folds, scoring=scoring,
                                return_train_score=False,
                                verbose=v, n_jobs=-1)
    # Pool the per-fold counts before forming the rates.
    fp_total = cv_results['test_fp'].sum()
    tn_total = cv_results['test_tn'].sum()
    tp_total = cv_results['test_tp'].sum()
    fn_total = cv_results['test_fn'].sum()
    fp_rate = fp_total / (fp_total + tn_total)
    tp_rate = tp_total / (tp_total + fn_total)

    print("{} x CV {:22} FP: {:.2f}  TP: {:.2f}".format(folds, type(m).__name__, fp_rate, tp_rate))


## Conclusion
### Looking at the results of both methods for all algorithms, it can be seen that cross-validation performs better, with higher accuracy rates. However, although malignant is the minority class, our predicted values are still less than the actual test values, showing that these algorithms are biased towards the majority class.

### Also, across all the algorithms, the FP rate for kNN is the highest. Hence kNN is the most biased among these.

# ROC curve plotting

In [None]:
%matplotlib inline
# Overlay the cross-validated ROC curves of the four classifiers.
plt.figure(figsize=(18, 9), dpi=250)
lw = 7
# (legend name, line colour, FPR array, TPR array, area under the curve)
roc_curves = [
    ('kNN', 'red', fpr_knn, tpr_knn, roc_auc_knn),
    ('DecisionTree', 'green', fpr_dt, tpr_dt, roc_auc_dt),
    ('GaussianNB', 'blue', fpr_gnb, tpr_gnb, roc_auc_gnb),
    ('LogisticRegression', 'orange', fpr_lr, tpr_lr, roc_auc_lr),
]
for curve_name, curve_color, curve_fpr, curve_tpr, curve_auc in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=lw,
             label='ROC %s (area = %0.2f)' % (curve_name, curve_auc))

# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
# The curves come from the breast-cancer data, so the title names that data.
plt.title('ROC Analysis for Breast Cancer data', fontsize=25)
plt.legend(loc="lower right")
plt.show()

# Q2. Oversampling strategy for Imbalanced Dataset

### Class-Imbalanced Dataset: When the distribution of classes in the data is not uniform, such that the number of instances of one class significantly outnumbers the instances of another class, the dataset is said to have class imbalance.

### To rectify the bias there are several methods available, but I have used Random Oversampling.

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Split first, then oversample ONLY the training portion. Oversampling before
# the split copies minority rows into both train and test, so the models get
# scored on rows they were fitted on (data leakage).
ran_ovr_samp = RandomOverSampler(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)
# X_newsample / y_newsample keep their names and now hold the balanced
# training set; the test set keeps the original class distribution.
X_newsample, y_newsample = ran_ovr_samp.fit_resample(X_train, y_train)
X_train, y_train = X_newsample, y_newsample

### kNN

In [None]:
# Refit kNN on the current training split and predict the held-out rows.
kNN.fit(X_train, y_train)
y_pred = kNN.predict(X_test)

# One minus the mean label is the share reported as "malignant".
for caption, labels in (("Malignant in test set", y_test),
                        ("Predicted malignant", y_pred)):
    print("{} : {:.2f}".format(caption, 1 - labels.sum() / len(labels)))

plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### Decision Trees

In [None]:
# Refit the decision tree on the current training split, then predict.
fitted_tree = DT.fit(X_train, y_train)
y_pred = fitted_tree.predict(X_test)

# One minus the mean label is the share reported as "malignant".
test_share = 1 - y_test.sum() / len(y_test)
pred_share = 1 - y_pred.sum() / len(y_pred)
print("Malignant in test set : {:.2f}".format(test_share))
print("Predicted malignant : {:.2f}".format(pred_share))

plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### Gaussian Naive Bayes

In [None]:
# Refit Gaussian Naive Bayes on the current training split, then predict.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# One minus the mean label is the share reported as "malignant".
print("Malignant in test set : {0:.2f}".format(1 - y_test.sum() / len(y_test)))
print("Predicted malignant : {0:.2f}".format(1 - y_pred.sum() / len(y_pred)))

plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### Logistic Regression

In [None]:
# Refit logistic regression on the current training split, then predict.
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)

# One minus the mean label is the share reported as "malignant".
share_of_zeros = lambda labels: 1 - labels.sum() / len(labels)
print("Malignant in test set : {:.2f}".format(share_of_zeros(y_test)))
print("Predicted malignant : {:.2f}".format(share_of_zeros(y_pred)))

plot_confusion_matrix(y_test, y_pred, classes=['Malignant', 'Benign'],
                      normalize=True, title='Normalized confusion matrix')

### FP rates of each Algorithms:
##### kNN : 0.0888
##### Decision Tree: 0.0333
##### Gaussian Naive Bayes: 0.0777
##### Logistic Regression: 0.0222

#### Random Oversampling: This method works on the minority class. It randomly replicates observations from the minority class to balance the data. An advantage of this method is that it leads to no information loss.
#### After applying random oversampling, the FP rate decreases significantly for each algorithm, implying that the algorithms are now less biased. The disadvantage of this method is that, since oversampling simply adds replicated observations to the original data set, it ends up adding multiple copies of the same observations, which can lead to overfitting. Although the training accuracy on such a data set will be high, the accuracy on unseen data may be worse.

#### Based on the results, the FP rate of Logistic Regression is the lowest, hence it performs best in my case.

# Q3.

## Classification Algorithm on Cryotherapy Dataset

#### Cryotherapy Dataset: This dataset is used to classify whether the person was treated successfully or not, based on six features: age, sex, time, Number_of_Warts, type and area.

In [None]:
import pandas as pd
# NOTE(review): absolute, machine-specific Windows path - the notebook only
# runs where this exact file exists; point it at your local copy.
path = r"D:\Study\ML_Python\Assignment\divorce\Cryotherapy.xlsx"
# Read the spreadsheet into a DataFrame and show its (rows, columns) shape.
cryo_df= pd.read_excel(path)
cryo_df.shape

In [None]:
# Count the outcomes: the column sum is the number of "yes" rows, and the
# remainder of the rows are "no".
outcome = cryo_df['Result_of_Treatment']
cryo_yes = outcome.sum()
cryo_no = len(outcome) - outcome.sum()
print('Result Yes: ', cryo_yes)
print('Result No: ', cryo_no)

In [None]:
# Detach the outcome column (pop removes it from cryo_df in place); the
# columns left behind form the feature matrix.
outcome_column = cryo_df.pop('Result_of_Treatment')
y = outcome_column.values
X = cryo_df.values
X.shape, y.shape

### I have applied both the hold-out and oversampling methods on this dataset to see which works better, because the dataset is imbalanced: it has more patients that were not treated correctly than patients that were treated correctly.

## Holdout method

In [None]:
# Hold-out split of the cryotherapy data; the fixed seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### kNN

In [None]:
# Hold-out evaluation of a 3-nearest-neighbour classifier.
kNN = KNeighborsClassifier(n_neighbors=3)
y_pred = kNN.fit(X_train, y_train).predict(X_test)
# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# Plot the raw-count confusion matrix. Row/column 0 is label 0 (not treated),
# and the title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Decision Tree

In [None]:
# Hold-out evaluation of a decision tree that splits on entropy.
DT = DecisionTreeClassifier(criterion='entropy')
y_pred = DT.fit(X_train, y_train).predict(X_test)
# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# Plot the raw-count confusion matrix. Row/column 0 is label 0 (not treated),
# and the title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Gaussian Naive Bayes

In [None]:
# Hold-out evaluation of Gaussian Naive Bayes.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# Plot the raw-count confusion matrix. Row/column 0 is label 0 (not treated),
# and the title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Logistic Regression

In [None]:
# Hold-out evaluation of logistic regression with default settings.
logReg = LogisticRegression()
y_pred = logReg.fit(X_train, y_train).predict(X_test)

# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))

# Plot the raw-count confusion matrix. Row/column 0 is label 0 (not treated),
# and the title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

## OverSampling

### kNN

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Split first, then oversample ONLY the training portion. Oversampling before
# the split copies minority rows into both train and test (data leakage).
ran_ovr_samp = RandomOverSampler(random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
# X_newsample / y_newsample now hold the balanced training set.
X_newsample, y_newsample = ran_ovr_samp.fit_resample(X_train, y_train)
X_train, y_train = X_newsample, y_newsample
y_pred = kNN.fit(X_train, y_train).predict(X_test)

# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))

# Raw-count confusion matrix. Row/column 0 is label 0 (not treated), and the
# title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Decision Tree

In [None]:
# Split first, then oversample ONLY the training portion. Oversampling before
# the split copies minority rows into both train and test (data leakage).
ran_ovr_samp = RandomOverSampler(random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# X_newsample / y_newsample now hold the balanced training set.
X_newsample, y_newsample = ran_ovr_samp.fit_resample(X_train, y_train)
X_train, y_train = X_newsample, y_newsample
y_pred = DT.fit(X_train, y_train).predict(X_test)

# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))

# Raw-count confusion matrix. Row/column 0 is label 0 (not treated), and the
# title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Gaussian Naive Bayes

In [None]:
# Split first, then oversample ONLY the training portion. Oversampling before
# the split copies minority rows into both train and test (data leakage).
ran_ovr_samp = RandomOverSampler(random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)
# X_newsample / y_newsample now hold the balanced training set.
X_newsample, y_newsample = ran_ovr_samp.fit_resample(X_train, y_train)
X_train, y_train = X_newsample, y_newsample
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# These captions used to say "Malignant" (copied from the breast-cancer
# cells); this is the cryotherapy data, so they use the same wording as the
# other cryotherapy cells.
# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))

# Raw-count confusion matrix. Row/column 0 is label 0 (not treated), and the
# title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### Logistic Regression

In [None]:
# Split first, then oversample ONLY the training portion. Oversampling before
# the split copies minority rows into both train and test (data leakage).
ran_ovr_samp = RandomOverSampler(random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# X_newsample / y_newsample now hold the balanced training set.
X_newsample, y_newsample = ran_ovr_samp.fit_resample(X_train, y_train)
X_train, y_train = X_newsample, y_newsample
logReg = LogisticRegression()
y_pred = logReg.fit(X_train, y_train).predict(X_test)

# Result_of_Treatment == 1 is tallied as "Result Yes" where the data is
# loaded, so the mean label (not one minus it) is the treated share.
print("Disease treated successfully in test set : %0.2f" % (y_test.sum() / len(y_test)))
print("Predicted treated disease : %0.2f" % (y_pred.sum() / len(y_pred)))

# Raw-count confusion matrix. Row/column 0 is label 0 (not treated), and the
# title matches normalize=False.
plot_confusion_matrix(y_test, y_pred, classes=['Disease Not Treated', 'Disease Treated'], normalize=False,
                      title='Confusion matrix, without normalization')

### FP-Rate for each algorithms

#### kNN, Decision Tree, Gaussian Naive Bayes, Logistic Regression
#### 0.3243, 0.324, 0.486, 0.21 : Hold-Out
#### 0.28, 0.24, 0.36, 0.16 : Oversampling

	


#### Conclusion

#### As the dataset was imbalanced, the initial hold-out algorithms were biased. After applying oversampling, it can be seen above that the FP rate decreases for each classification algorithm.