First Part

## Logistic Classifier

In [525]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import math
def sigmoid(scores): 
    return 1 / (1 + np.exp(-scores))
def logistic_regression(features, target, num_steps, learning_rate, add_one=False):
    if add_one: 
        ones = np.ones((features.shape[0], 1))  
        features = np.hstack((ones, features))     
    weights = np.zeros(features.shape[1])
  
    for step in range(num_steps):  
        scores = np.dot(features, weights) 
        predictions = sigmoid(scores)  
        output_error_signal = target - predictions      
        gradient = np.dot(features.T, output_error_signal)    
        weights += learning_rate * gradient  
    return weights

def metric_evaluation(predicted, real):
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    for p,g in zip(predicted, real):
        if p==1 and g==1:
            true_pos+=1
        if p==0 and g==0:
            true_neg+=1
        if p==1 and g==0:
            false_pos+=1
        if p==0 and g==1:
            false_neg+=1
    try:
        accuracy = (true_pos+true_neg)/(true_pos+true_neg+false_pos+false_neg)
    except:
        accuracy = 0
    try:
        recall = (true_pos)/(true_pos+false_neg)
    except:
        recall = 0
    try:
        precision = (true_pos)/(true_pos+false_pos)
    except:
        precision = 0
    try:
        f1score = 2*(precision*recall)/(precision+recall)
    except:
        f1score = 0
    return accuracy, recall, f1score

def predict_test_data(w, X):
    m = X.shape[0]
    ones = np.ones((X.shape[0], 1)) 
    features = np.hstack((ones, X)) 
    A = sigmoid(np.dot(features, w))
    return A

In [526]:
dataset = pd.read_csv("transfusion.csv").dropna()
dataset = (dataset-dataset.min())/(dataset.max()-dataset.min())
x = pd.get_dummies(dataset.drop('whether he/she donated blood in March 2007', axis=1))
xx = x.values
y = np.asarray(dataset['whether he/she donated blood in March 2007'])
train_y = y[0:640]
train_x = xx[0:640]
test_y = y[640:]
test_x = xx[640:]
# print(test_y)
weights = logistic_regression(train_x, train_y, 100, 0.001, True)
print(weights)
predictedA = predict_test_data(weights, test_x)
Y_prediction = np.zeros((test_x.shape[0]))
for i in range(predictedA.shape[0]):
    #مقدار ترشلد را برابر با 0.25 در نظر گرفتم.
    Y_prediction[i] = 1 if predictedA[i] >= 0.25 else 0
accuracy, recall, f1score = metric_evaluation(Y_prediction, test_y)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

[-0.92720711 -0.76860971  0.61457069  0.61457069 -0.42064347]
Accuracy: 0.7962962962962963
Recall: 0.2222222222222222
F1score: 0.15384615384615383


## Decision Tree

In [501]:
sag = 0
count = 0
class Node:
    def __init__(self):
        self.right = None
        self.left = None
        self.column = None
        self.threshold = None
        self.probas = None
        self.depth = None
        self.is_terminal = False
class DecisionTreeClassifier:
    def __init__(self, max_depth = 5, min_samples_leaf = 1, min_samples_split = 2):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.classes = None
        self.Tree = None
    def nodeProbas(self, y):
        probas = []
        for one_class in self.classes:
            proba = y[y == one_class].shape[0] / y.shape[0]
            probas.append(proba)
        return np.asarray(probas)
    def gini(self, probas):
        return 1 - np.sum(probas**2)
    def calcImpurity(self, y):
        return self.gini(self.nodeProbas(y))
    def calcBestSplit(self, X, y):
        bestSplitCol = None
        bestThresh = None
        bestInfoGain = -999
        impurityBefore = self.calcImpurity(y)
        for col in range(X.shape[1]):
            x_col = X[:, col]
            for x_i in x_col:
                threshold = x_i
                y_right = y[x_col > threshold]
                y_left = y[x_col <= threshold]
                if y_right.shape[0] == 0 or y_left.shape[0] == 0:
                    continue
                impurityRight = self.calcImpurity(y_right)
                impurityLeft = self.calcImpurity(y_left)
                infoGain = impurityBefore
                infoGain -= (impurityLeft * y_left.shape[0] / y.shape[0]) + (impurityRight * y_right.shape[0] / y.shape[0])
                if infoGain > bestInfoGain:
                    bestSplitCol = col
                    bestThresh = threshold
                    bestInfoGain = infoGain
        if bestInfoGain == -999:
            return None, None, None, None, None, None
        x_col = X[:, bestSplitCol]
        x_left, x_right = X[x_col <= bestThresh, :], X[x_col > bestThresh, :]
        y_left, y_right = y[x_col <= bestThresh], y[x_col > bestThresh]
        return bestSplitCol, bestThresh, x_left, y_left, x_right, y_right
    def buildDT(self, X, y, node):
        if node.depth >= self.max_depth:
            node.is_terminal = True
            return
        if X.shape[0] < self.min_samples_split:
            node.is_terminal = True
            return
        if np.unique(y).shape[0] == 1:
            node.is_terminal = True
            return
        splitCol, thresh, x_left, y_left, x_right, y_right = self.calcBestSplit(X, y)
        if (splitCol is None):
            node.is_terminal = True
        if (x_left is None) or x_left.shape[0] < self.min_samples_leaf or x_right.shape[0] < self.min_samples_leaf:
            node.is_terminal = True
            return
        node.column = splitCol
        node.threshold = thresh
        node.left = Node()
        node.left.depth = node.depth + 1
        node.left.probas = self.nodeProbas(y_left)
        node.right = Node()
        node.right.depth = node.depth + 1
        node.right.probas = self.nodeProbas(y_right)
        self.buildDT(x_right, y_right, node.right)
        self.buildDT(x_left, y_left, node.left)
    def fit(self, X, y):
        if type(X) == pd.DataFrame:
            X = np.asarray(X)
        self.classes = np.unique(y)
        self.Tree = Node()
        self.Tree.depth = 1
        self.Tree.probas = self.nodeProbas(y)
        self.buildDT(X, y, self.Tree)
    def predictSample(self, x, node):
        if node.is_terminal:
            return node.probas
        if x[node.column] > node.threshold:
            probas = self.predictSample(x, node.right)
        else:
            probas = self.predictSample(x, node.left)
        return probas
    def predict(self, X):
        if type(X) == pd.DataFrame:
            X = np.asarray(X)
        predictions = []
        for x in X:
            pred = np.argmax(self.predictSample(x, self.Tree))
            predictions.append(pred)
        return np.asarray(predictions)

In [502]:
dt = DecisionTreeClassifier(max_depth=10)
dt.fit(train_x, train_y)
prediction = dt.predict(test_x)
accuracy, recall, f1score = metric_evaluation(prediction, test_y)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

Accuracy: 0.8425925925925926
Recall: 0.1111111111111111
F1score: 0.10526315789473685


Second Part




مشکل اصلی این است که داده های ما بالانس نیستند. درواقع تعداد داده ها با لیبل 1 خیلی کم تر از تعداد داده ها با لیبل 0 است. به همین دلیل ویژگیهای داده های کلاس 1 به خوبی توسط مدل یادگرفته نمیشوند و نیاز است که به گونه این مشکل برطرف شود.
میتوان تعداد داده های دو کلاس را با هم تقریبا یکسان کرد.
تعداد داده های کلاس صفر حدودا 3 برابر تعداد داده های کلاس یک است.
اگر داده های کلاس یک را سه بار در دیتاست تکرار کنیم تعداد داده های دو کلاس تقریبا مشابه میشود و دیگر دیتاست ما غیربالانس نخواهد بود. در این صورت مدل یاد میگیرد که اهمیت هر دوی این کلاس ها یکسان هستند.
_______________
برای لجیستیک به طور خاص میشود از اسکن کردن ترشلد استفاده کرد و سعی کرد با حرکت در یک بازه ای، ترشلد را جایی انتخاب کرد که اف اسکور بیشینه شود.

Third Part

In [462]:
one_indexes = []
dataset = dataset.rename(columns = {"whether he/she donated blood in March 2007":"label"})
for i in range(0, len(dataset["label"])):
    if dataset["label"][i]==1:
        one_indexes.append(i)
proportion = dataset.query('label == 0').label.count()/dataset.query('label == 1').label.count()
datasett = dataset.iloc[one_indexes]
for i in range(int(proportion) - 1):
    datasett = pd.concat([datasett, dataset])
test_index = datasett.index.isin(np.random.randint(datasett.shape[0], size=int((len(test_x)/len(dataset))*len(datasett))))
test_set = datasett.iloc[test_index]
train_set = datasett.iloc[~test_index]
x_test = test_set.drop(['label'], axis=1)
y_test = test_set.iloc[:, -1]
x_train = train_set.drop(['label'], axis=1)
y_train = train_set.iloc[:, -1]
dt1 = DecisionTreeClassifier(max_depth=10)
dt1.fit(x_train, y_train)
#نتایج برای دیتاست جدید
prediction = dt.predict(x_test)
accuracy, recall, f1score = metric_evaluation(prediction, y_test)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

54 126 16 39
Accuracy: 0.7659574468085106
Recall: 0.5806451612903226
F1score: 0.6625766871165644


In [479]:
#نتایج برای دیتاست قبلی
prediction = dt1.predict(test_x)
accuracy, recall, f1score = metric_evaluation(prediction, test_y)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

Accuracy: 0.9074074074074074
Recall: 0.4444444444444444
F1score: 0.4444444444444444


نتیجه ی اجرای لجیستیک برا روی دیتاست بالانس.
به همراه استفاده از اسکن ترشلد.

In [527]:
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [528]:
#دیتاست جدید
weights = logistic_regression(x_train, y_train, 100, 0.001, True)
print(weights)
predictedA = predict_test_data(weights, test_x)
Y_prediction = np.zeros((x_test.shape[0]))
for threshold in np.arange(0,1,0.01):
    for i in range(predictedA.shape[0]):
        Y_prediction[i] = 1 if predictedA[i] >= threshold else 0
    accuracy, recall, f1score = metric_evaluation(Y_prediction, test_y)
    if f1score > best_f1score:
        best_f1score = f1score
        best_threshold = threshold
        best_recall = recall
        best_accuracy = accuracy
print("Accuracy: " + str(best_accuracy))
print("Recall: " + str(best_recall))
print("F1score: " + str(best_f1score))

[-56.93499756 -11.40554724  -1.69906551  -1.69906551 -19.40729811]
Accuracy: 0.6978723404255319
Recall: 0.967741935483871
F1score: 0.7171314741035858


Forth Part

In [503]:
from __future__ import division, print_function
from sklearn import datasets
import matplotlib.pyplot as plt

class DecisionStump():
    def __init__(self):
        self.polarity = 1
        self.feature_index = None
        self.threshold = None
        self.alpha = None

class Adaboost():
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
    def fit(self, X, y):
        n_samples, n_features = np.shape(X)
        w = np.full(n_samples, (1 / n_samples))
        self.clfs = []
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float('inf')
            for feature_i in range(n_features):
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)
                for threshold in unique_values:
                    p = 1
                    prediction = np.ones(np.shape(y))
                    prediction[X[:, feature_i] < threshold] = -1
                    error = sum(w[y != prediction])
                    if error > 0.5:
                        error = 1 - error
                        p = -1
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_index = feature_i
                        min_error = error
            clf.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
            predictions = np.ones(np.shape(y))
            negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
            predictions[negative_idx] = -1
            w *= np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w)
            self.clfs.append(clf)
    def predict(self, X):
        n_samples = np.shape(X)[0]
        y_pred = np.zeros((n_samples, 1))
        for clf in self.clfs:
            predictions = np.ones(np.shape(y_pred))
            negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
            predictions[negative_idx] = -1
            y_pred += clf.alpha * predictions
        y_pred = np.sign(y_pred).flatten()
        return y_pred

In [513]:
ab = AdaBoost()
for i in range(len(train_y)):
    if(train_y[i] == 0):
        train_y[i] = -1
# train_y
clf = Adaboost(n_clf=5)
clf.fit(train_x, train_y)
prediction = clf.predict(test_x)
for i in range(len(prediction)):
    if(prediction[i] == -1):
        prediction[i] = 0
# y_pred
accuracy, recall, f1score = metric_evaluation(prediction, test_y)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

Accuracy: 0.9074074074074074
Recall: 0.0
F1score: 0


using our solution from part two

In [516]:
ab1 = AdaBoost()
for i in range(len(y_train)):
    if(y_train[i] == 0):
        y_train[i] = -1
# y_train
clf1 = Adaboost(n_clf=5)
clf1.fit(x_train, y_train)
prediction = clf.predict(x_test)
for i in range(len(prediction)):
    if(prediction[i] == -1):
        prediction[i] = 0
# y_pred
accuracy, recall, f1score = metric_evaluation(prediction, y_test)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))
print("F1score: " + str(f1score))

Accuracy: 0.676595744680851
Recall: 0.22580645161290322
F1score: 0.3559322033898305
