# Algorithmic Bias - Assignment 01
### Student name - Atul Kumar Singh (20200619)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import KMeansSMOTE
from collections import Counter


# Load the survival data and derive a two-valued string label from `Class`:
# rows whose Class equals 2 are tagged 'L5', every other row 'GE5'.
surv = pd.read_csv('survival.csv')
surv['Survived'] = ['L5' if cls == 2 else 'GE5' for cls in surv['Class']]

In [None]:
# Display how many rows carry each 'Survived' label ('GE5' vs 'L5').
surv['Survived'].value_counts() 

In [None]:
# Per-label row counts; value_counts() lists the most frequent label first.
vc = surv['Survived'].value_counts()

# Build the label vector and the feature matrix WITHOUT mutating `surv`.
# Removing the columns in place would make this cell fail with KeyError on a
# second run and would break any earlier cell that reads surv['Survived'].
y = surv['Survived'].values
# Features are every column except the derived label and the raw `Class`
# column it was derived from.
X = surv.drop(columns=['Survived', 'Class']).values
X.shape, y.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

### Model creation

In [None]:
# Classifiers compared throughout the notebook, keyed by a short display name
# (the keys double as x-axis tick labels in the bar charts).
#
# Every estimator that accepts a seed is given one. The decision tree needs it
# too: scikit-learn permutes the candidate features at each split, so without
# a fixed random_state equally good splits may be resolved differently from
# one run to the next.
model_all = {
    'kNN': KNeighborsClassifier(n_neighbors=3),
    'dtree': DecisionTreeClassifier(max_depth=2, criterion='entropy',
                                    random_state=42),
    'logistic': LogisticRegression(random_state=42, max_iter=10000),
    'gradient': GradientBoostingClassifier(random_state=42),
}

### Hold out testing

In [None]:
# 50/50 hold-out split; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

# Per-model results, keyed by the same names as model_all.
acc_bal = {}
predictedMinority = {}

print("shape of training and test samples:")
print(X_train.shape, X_test.shape)
# The minority count is taken as "everything that is not 'GE5'".
print("Total count of Minority class L5 in test set: {}".format(
    len(y_test) - Counter(y_test)['GE5']))

for name, clf in model_all.items():
    # Train on the training half, predict the held-out half.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc_bal[name] = accuracy_score(y_test, y_pred)
    # Number of test rows the model assigned to the minority label.
    predictedMinority[name] = len(y_pred) - Counter(y_pred)['GE5']

    print("Result of {:22}, predicted minority {:d}, accuracy {:.2f}".format(
        type(clf).__name__, predictedMinority[name], acc_bal[name]))


%matplotlib inline 

# Add the prior figures to the data for plotting
# Bar chart comparing the true minority count in the test set ('Prior') with
# the number of minority predictions made by each model.
n_minority_test = len(y_test) - Counter(y_test)['GE5']
objects = ['Prior'] + list(predictedMinority.keys())
positive = [n_minority_test] + list(predictedMinority.values())
y_pos = np.arange(len(objects))

# One red bar for the prior followed by one blue bar per model. Deriving the
# list from the number of models keeps the colouring right if a model is
# added or removed.
bar_colors = ['red'] + ['blue'] * len(predictedMinority)

fig, ax = plt.subplots()
ax.bar(y_pos, positive, align='center', color=bar_colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
ax.set_ylabel('Minority Count')
ax.set_title('ML Algorithm Bias')

plt.show()




* In hold-out testing we observed that all algorithms are biased towards the majority class.
- KNN has a bias of about 60%.
- Decision tree has a bias of about 36%.
- Logistic Regression and decision tree have very high bias.
- Gradient Boosting has a bias of 19%.

### Cross Validation to check bias in the models

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
# Confusion-matrix cell extractors used as cross-validation scorers.
# Rows index the true label and columns the predicted label; index 1 is the
# label that sorts last (for the 'GE5'/'L5' labels used here, that is 'L5').
def tp(y_true, y_pred):
    """Return cell [1, 1]: true label and prediction are both class 1."""
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1]


def tn(y_true, y_pred):
    """Return cell [0, 0]: true label and prediction are both class 0."""
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0]


def fp(y_true, y_pred):
    """Return cell [0, 1]: true class 0 predicted as class 1."""
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 1]


def fn(y_true, y_pred):
    """Return cell [1, 0]: true class 1 predicted as class 0."""
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 0]


# Scorer dictionary for cross_validate; keys are 'tp', 'tn', 'fp', 'fn'.
scoring = {counter.__name__: make_scorer(counter)
           for counter in (tp, tn, fp, fn)}


folds = 10   # number of cross-validation folds
v = 0        # verbosity level forwarded to cross_validate

# Total number of minority ('L5') predictions per model, summed over folds.
bias_cv = {}

# Actual minority count in the full dataset: value_counts() lists the most
# frequent label first, so the second entry is the minority class. The index
# of `vc` holds label strings, so the lookup has to be explicitly positional
# (.iloc); a bare vc[1] relies on a deprecated integer-as-position fallback.
n_minority = vc.iloc[1]

for m, clf in model_all.items():
    cv_results = cross_validate(clf, X, y, cv=folds, scoring=scoring,
                                return_train_score=False,
                                verbose=v, n_jobs=-1)
    # tp + fp = every test sample that was predicted as class 1, right or wrong.
    n_total = cv_results['test_tp'].sum() + cv_results['test_fp'].sum()
    # Correct predictions of either class over all samples scored across folds.
    accuracy = (cv_results['test_tp'].sum() + cv_results['test_tn'].sum())/len(y)
    bias_cv[m] = n_total

    print("{} x CV {:22} N: {:d} Pred N: {:d} Acc: {:.2f}".format(
        folds, type(clf).__name__, n_minority, n_total, accuracy))

%matplotlib inline 

# Bar chart comparing the true minority count in the whole dataset ('Prior')
# with the number of minority predictions each model made across all folds.
objects = ['Prior'] + list(bias_cv.keys())
# `vc` is indexed by label strings, so the second (minority) entry is read
# positionally with .iloc instead of the deprecated vc[1].
positive = [vc.iloc[1]] + list(bias_cv.values())

y_pos = np.arange(len(objects))

# One red bar for the prior followed by one blue bar per model, so the colour
# list always has exactly as many entries as there are bars.
bar_colors = ['red'] + ['blue'] * len(bias_cv)

fig, ax = plt.subplots()
ax.bar(y_pos, positive, align='center', color=bar_colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
ax.set_ylabel('Minority Count')
ax.set_title('ML Algorithm Bias - Cross Validation')

plt.show()


* On cross-validating the models with k=10 folds, almost all models still show a bias towards the majority class; the exception is gradient boosting, where the bias is very low.
* In CV, the entire dataset, split across the different folds, is used to verify the efficacy of the model. Taking a higher value of k reduces the probability of bias.

### Let's do up sampling using KMeans SMOTE

In [None]:

# Same 50/50 split (same seed) as the hold-out experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

print("Before upsampling training set {}".format(Counter(y_train)))

# Oversample the training half only; the test half is left untouched.
# sampling_strategy=0.7 requests a minority/majority ratio of 0.7.
smk = KMeansSMOTE(sampling_strategy=0.7, random_state=42)
X_trainUP, y_trainUP = smk.fit_resample(X_train, y_train)
print("After sampling training set {}".format(Counter(y_trainUP)))

# Per-model results, keyed by the same names as model_all.
acc_bal = {}
predictedMinority = {}

print("Total count of Minority class L5 in test set: {}".format(
    len(y_test) - Counter(y_test)['GE5']))

for name, clf in model_all.items():
    # Train on the upsampled data, evaluate on the original test half.
    clf.fit(X_trainUP, y_trainUP)
    y_pred = clf.predict(X_test)

    acc_bal[name] = accuracy_score(y_test, y_pred)
    # Number of test rows the model assigned to the minority label.
    predictedMinority[name] = len(y_pred) - Counter(y_pred)['GE5']

    print("Result of {:22}, predicted minority {:d}, accuracy {:.2f}".format(
        type(clf).__name__, predictedMinority[name], acc_bal[name]))


%matplotlib inline 

# Add the prior figures to the data for plotting
# Bar chart comparing the true minority count in the test set ('Prior') with
# the number of minority predictions made by each model after upsampling.
n_minority_test = len(y_test) - Counter(y_test)['GE5']
objects = ['Prior'] + list(predictedMinority.keys())
positive = [n_minority_test] + list(predictedMinority.values())
y_pos = np.arange(len(objects))

# One red bar for the prior followed by one blue bar per model. Deriving the
# list from the number of models keeps the colouring right if a model is
# added or removed.
bar_colors = ['red'] + ['blue'] * len(predictedMinority)

fig, ax = plt.subplots()
ax.bar(y_pos, positive, align='center', color=bar_colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
ax.set_ylabel('Minority Count')
ax.set_title('Upsampling result')

plt.show()
    
    

### Discussion on upsampling
* To reduce the bias, I have used the KMeansSMOTE() method with sampling_strategy=0.7, i.e. after resampling the minority class is about 70 percent of the majority count. KMeansSMOTE avoids the generation of noise during upsampling and effectively overcomes imbalances between and within classes.
* The solution works pretty well in all the models.
* Bias is reduced in all the models. The accuracy of KNN and Logistic Regression is minimally affected, and the accuracy of the Decision Tree increases by ~3%. The accuracy of Gradient Boosting remains unchanged.
* KNN has the best balance between bias and accuracy.

In [None]:
# Second dataset (presumably hotel reviews, going by the file name).
hotel = pd.read_csv('HotelRev.csv')
# Preview the first rows; head() shows five rows by default.
hotel.head()

In [None]:
# Display how many rows carry each value of the target column.
hotel['reviewHelpfulness'].value_counts()

In [None]:
# Target vector and feature matrix for the second dataset, built WITHOUT
# mutating `hotel`. Removing the column in place would make this cell fail
# with KeyError on a second run and would break the value_counts cell above.
# NOTE: this rebinds the notebook-wide X and y, which held the survival data.
y = hotel['reviewHelpfulness'].values
X = hotel.drop(columns=['reviewHelpfulness']).values

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the feature columns with a min-max scaler (default settings).
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split performed in later cells, so the test rows influence the scaling of
# the training features - consider fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Class balance of the full dataset.
# Assumes y is a 0/1 array in which 1 is the majority label, so the sum
# counts the majority rows and the remainder is the minority - TODO confirm.
n_rows = len(y)
n_majority = y.sum()
n_minority = n_rows - n_majority

print("Original Dataset STATS")
print("Minority class:", n_minority)
print("Majority class:", n_majority)
print("Minority class: {:.2f}%".format(n_minority / n_rows * 100))

In [None]:
# 50/50 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=42)

# Per-model results, keyed by the same names as model_all.
predictedMinority = {}
acc_bal = {}

# Rows that are not counted by the sum are taken as the minority
# (assumes 0/1 labels with 0 as the minority - TODO confirm).
print("Minority class in test set : %d" % (len(y_test) - y_test.sum()))

for name, clf in model_all.items():
    # Train on the training half, predict the held-out half.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Number of test rows predicted as the minority label.
    pred_count = len(y_pred) - y_pred.sum()
    acc = accuracy_score(y_test, y_pred)
    predictedMinority[name] = pred_count
    acc_bal[name] = acc

    print("{:22} Pred. Unhelpful: {:d} Accuracy: {:.2f}".format(
        type(clf).__name__, pred_count, acc))
    
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline 

# Add the prior figures to the data for plotting
# Bar chart comparing the true minority count in the test set ('Prior') with
# the number of minority predictions made by each model.
objects = ['Prior'] + list(predictedMinority.keys())
positive = [len(y_test) - y_test.sum()] + list(predictedMinority.values())

y_pos = np.arange(len(objects))

# One red bar for the prior followed by one blue bar per model. Deriving the
# list from the number of models keeps the colouring right if a model is
# added or removed.
bar_colors = ['red'] + ['blue'] * len(predictedMinority)

fig, ax = plt.subplots()
ax.bar(y_pos, positive, align='center', color=bar_colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
ax.set_ylabel('Minority Count')
ax.set_title('ML Algorithm Bias')

plt.show()

In [None]:

# Same 50/50 split (same seed) as the hold-out experiment on this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

print("Before upsampling training set {}".format(Counter(y_train)))

# Oversample the training half only with KMeansSMOTE; the test half is left
# untouched. sampling_strategy=0.7 requests a minority/majority ratio of 0.7.
smk = KMeansSMOTE(sampling_strategy=0.7, random_state=42)
X_trainUP, y_trainUP = smk.fit_resample(X_train, y_train)
print("After sampling training set {}".format(Counter(y_trainUP)))

# Per-model results, keyed by the same names as model_all.
acc_bal = {}
predictedMinority = {}

print("Total count of Minority class in test set: {}".format(
    len(y_test) - y_test.sum()))

for name, clf in model_all.items():
    # Train on the upsampled data, evaluate on the original test half.
    clf.fit(X_trainUP, y_trainUP)
    y_pred = clf.predict(X_test)

    acc_bal[name] = accuracy_score(y_test, y_pred)
    # Number of test rows predicted as the minority label.
    predictedMinority[name] = len(y_pred) - y_pred.sum()

    print("Result of {:22}, predicted minority {:d}, accuracy {:.2f}".format(
        type(clf).__name__, predictedMinority[name], acc_bal[name]))


%matplotlib inline 

# Add the prior figures to the data for plotting
# Bar chart comparing the true minority count in the test set ('Prior') with
# the number of minority predictions made by each model after upsampling.
objects = ['Prior'] + list(predictedMinority.keys())
positive = [len(y_test) - y_test.sum()] + list(predictedMinority.values())

y_pos = np.arange(len(objects))

# One red bar for the prior followed by one blue bar per model. Deriving the
# list from the number of models keeps the colouring right if a model is
# added or removed.
bar_colors = ['red'] + ['blue'] * len(predictedMinority)

fig, ax = plt.subplots()
ax.bar(y_pos, positive, align='center', color=bar_colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
ax.set_ylabel('Minority Count')
ax.set_title('Up-sampling result')

plt.show()

### Discussion on the output of 2nd dataset
* Applying KMeans SMOTE here also gives positive results.
* Bias is reduced in all models.
* Also, the accuracy of Decision Tree, Logistic Regression and Gradient Boosting increases by a good percentage.
* The accuracy impact on KNN is minimal.
* Gradient boosting has the best balance between accuracy and bias.