In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
  
# Read the CSV file.
data = pd.read_csv("CTG.csv", skiprows=1)

# Select the relevant numerical columns.
selected_cols = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
                 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
                 'Median', 'Variance', 'Tendency', 'NSP']
data = data[selected_cols].dropna()

# Shuffle the dataset.
data_shuffled = data.sample(frac=1.0, random_state=0)

# Split into input part X and output part Y.
X = data_shuffled.drop('NSP', axis=1)

# Map the diagnosis code to a human-readable label.
def to_label(y):
    return [None, 'normal', 'suspect', 'pathologic'][(int(y))]

Y = data_shuffled['NSP'].apply(to_label)

# Partition the data into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=69)


# look at the data 
X.head()

In [12]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
#lets create a dummy classifier
clf = DummyClassifier(strategy='most_frequent')
# test the classifier
cross_val_score(clf, Xtrain, Ytrain)

In [20]:
#import a bunch of other classifiers
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
MAX_ITER = 100
# create a dictionary of classifiers
clfs = {
    'Decision Tree at 10': DecisionTreeClassifier(max_depth=10),
     'Decision Tree at 5': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(random_state=69),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Neural Network at (100,50)': MLPClassifier(hidden_layer_sizes=(100, 50)),
    'Neural Network at (10,500)': MLPClassifier(hidden_layer_sizes=(10, 500)),

}
# loop through the classifiers and test them
for name, clf in clfs.items():
    score= cross_val_score(clf, Xtrain, Ytrain)
    print("----------------------------------")
    print(f'{name}: {score.mean()}')
    print("----------------------------------")

print("done")
print("picked gradient boosting")
from sklearn.metrics import accuracy_score
clf = GradientBoostingClassifier()
clf.fit(Xtrain, Ytrain)
Yguess = clf.predict(Xtest)
print("accuracy score")
print(accuracy_score(Ytest, Yguess))



----------------------------------
Decision Tree at 10: 0.9252941176470589
----------------------------------
----------------------------------
Decision Tree at 5: 0.9282352941176472
----------------------------------
----------------------------------
Random Forest: 0.9394117647058824
----------------------------------
----------------------------------
Gradient Boosting: 0.9523529411764706
----------------------------------
----------------------------------
Neural Network at (100,50): 0.8647058823529413
----------------------------------
----------------------------------
Neural Network at (10,500): 0.8452941176470589
----------------------------------
done
picked gradient boosting
accuracy score
0.9553990610328639


# Choice of Classifier
After testing all of the classifiers we landed on using the Gradient Boosting classifier since that one remained on top even though we tried tuning the hyperparameters of the desision tree and neural network (here we ran out of memory). 

- Descision tree at a max depth of 5: 0.9288235294117648
- Descision tree at a max depth of 10:  0.9252941176470589
- Gradient Boosting: 0.9523529411764706
- Random Forest: 0.9394117647058824
- Neural Network at (100,50):  0.8694117647058824
- Neural Network at (10,500): 0.8552941176470588

## Gradient Boosting Description
The gradient boositng algorithm uses a number of descision trees or other similar algorithms which correct the mistakes (or at least makes an attempt to) of the previous model. Each model is trained to minimize the gradient (a measure of the error) of the loss function. The final model is a combination of all the models.
 



In [16]:
## import the tree calssifier
from collections import Counter
from graphviz import Digraph
from sklearn.base import BaseEstimator, ClassifierMixin
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


def majority_sum_scorer(n_low, low_distr, n_high, high_distr):
    maj_sum_low = low_distr.most_common(1)[0][1]
    maj_sum_high = high_distr.most_common(1)[0][1]
    return maj_sum_low + maj_sum_high
    
def entropy(distr):
    n = sum(distr.values())
    ps = [n_i/n for n_i in distr.values()]
    return -sum(p*np.log2(p) if p > 0 else 0 for p in ps)

def info_gain_scorer(n_low, low_distr, n_high, high_distr):
    return -(n_low*entropy(low_distr)+n_high*entropy(high_distr))/(n_low+n_high)

def gini_impurity(distr):
    n = sum(distr.values())
    ps = [n_i/n for n_i in distr.values()]
    return 1-sum(p**2 for p in ps)
    
def gini_scorer(n_low, low_distr, n_high, high_distr):
    return -(n_low*gini_impurity(low_distr)+n_high*gini_impurity(high_distr))/(n_low+n_high)


class DecisionTreeLeaf:

    def __init__(self, value):
        self.value = value

    # This method computes the prediction for this leaf node. This will just return a constant value.
    def predict(self, x):
        return self.value

    # Utility function to draw a tree visually using graphviz.
    def draw_tree(self, graph, node_counter, names):
        node_id = str(node_counter)
        val_str = f'{self.value:.4g}' if isinstance(self.value, float) else str(self.value)
        graph.node(node_id, val_str, style='filled')
        return node_counter+1, node_id
        
    def __eq__(self, other):
        if isinstance(other, DecisionTreeLeaf):
            return self.value == other.value
        else:
            return False
class DecisionTreeBranch:

    def __init__(self, feature, threshold, low_subtree, high_subtree):
        self.feature = feature
        self.threshold = threshold
        self.low_subtree = low_subtree
        self.high_subtree = high_subtree

    # For a branch node, we compute the prediction by first considering the feature, and then 
    # calling the upper or lower subtree, depending on whether the feature is or isn't greater
    # than the threshold.
    def predict(self, x):
        if x[self.feature] <= self.threshold:
            return self.low_subtree.predict(x)
        else:
            return self.high_subtree.predict(x)

    # Utility function to draw a tree visually using graphviz.
    def draw_tree(self, graph, node_counter, names):
        node_counter, low_id = self.low_subtree.draw_tree(graph, node_counter, names)
        node_counter, high_id = self.high_subtree.draw_tree(graph, node_counter, names)
        node_id = str(node_counter)
        fname = f'F{self.feature}' if names is None else names[self.feature]
        lbl = f'{fname} > {self.threshold:.4g}?'
        graph.node(node_id, lbl, shape='box', fillcolor='yellow', style='filled, rounded')
        graph.edge(node_id, low_id, 'False')
        graph.edge(node_id, high_id, 'True')
        return node_counter+1, node_id
        
class DecisionTree(ABC, BaseEstimator):

    def __init__(self, max_depth):
        super().__init__()
        self.max_depth = max_depth
        
    # As usual in scikit-learn, the training method is called *fit*. We first process the dataset so that
    # we're sure that it's represented as a NumPy matrix. Then we call the recursive tree-building method
    # called make_tree (see below).
    def fit(self, X, Y):
        if isinstance(X, pd.DataFrame):
            self.names = X.columns
            X = X.to_numpy()
        elif isinstance(X, list):
            self.names = None
            X = np.array(X)
        else:
            self.names = None
        Y = np.array(Y)        
        self.root = self.make_tree(X, Y, self.max_depth)
        
    def draw_tree(self):
        graph = Digraph()
        self.root.draw_tree(graph, 0, self.names)
        return graph
    
    # By scikit-learn convention, the method *predict* computes the classification or regression output
    # for a set of instances.
    # To implement it, we call a separate method that carries out the prediction for one instance.
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        return [self.predict_one(x) for x in X]

    # Predicting the output for one instance.
    def predict_one(self, x):
        return self.root.predict(x)        

    # This is the recursive training 
    def make_tree(self, X, Y, max_depth):

        # We start by computing the default value that will be used if we'll return a leaf node.
        # For classifiers, this will be the most common value in Y.
        default_value = self.get_default_value(Y)

        # First the two base cases in the recursion: is the training set completely
        # homogeneous, or have we reached the maximum depth? Then we need to return a leaf.

        # If we have reached the maximum depth, return a leaf with the majority value.
        if max_depth == 0:
            return DecisionTreeLeaf(default_value)

        # If all the instances in the remaining training set have the same output value,
        # return a leaf with this value.
        if self.is_homogeneous(Y):
            return DecisionTreeLeaf(default_value)

        # Select the "most useful" feature and split threshold. To rank the "usefulness" of features,
        # we use one of the classification or regression criteria.
        # For each feature, we call best_split (defined in a subclass). We then maximize over the features.
        n_features = X.shape[1]
        _, best_feature, best_threshold = max(self.best_split(X, Y, feature) for feature in range(n_features))
        
        if best_feature is None:
            return DecisionTreeLeaf(default_value)

        # Split the training set into subgroups, based on whether the selected feature is greater than
        # the threshold or not
        X_low, X_high, Y_low, Y_high = self.split_by_feature(X, Y, best_feature, best_threshold)

        # Build the subtrees using a recursive call. Each subtree is associated
        # with a value of the feature.
        low_subtree = self.make_tree(X_low, Y_low, max_depth-1)
        high_subtree = self.make_tree(X_high, Y_high, max_depth-1)

        if low_subtree == high_subtree:
            return low_subtree

        # Return a decision tree branch containing the result.
        return DecisionTreeBranch(best_feature, best_threshold, low_subtree, high_subtree)
    
    # Utility method that splits the data into the "upper" and "lower" part, based on a feature
    # and a threshold.
    def split_by_feature(self, X, Y, feature, threshold):
        low = X[:,feature] <= threshold
        high = ~low
        return X[low], X[high], Y[low], Y[high]
    
    # The following three methods need to be implemented by the classification and regression subclasses.
    
    @abstractmethod
    def get_default_value(self, Y):
        pass

    @abstractmethod
    def is_homogeneous(self, Y):
        pass

    @abstractmethod
    def best_split(self, X, Y, feature):
        pass


class TreeClassifier(DecisionTree, ClassifierMixin):

    def __init__(self, max_depth=10, criterion='maj_sum'):
        super().__init__(max_depth)
        self.criterion = criterion
        
    def fit(self, X, Y):
        # For decision tree classifiers, there are some different ways to measure
        # the homogeneity of subsets.
        if self.criterion == 'maj_sum':
            self.criterion_function = majority_sum_scorer
        elif self.criterion == 'info_gain':
            self.criterion_function = info_gain_scorer
        elif self.criterion == 'gini':
            self.criterion_function = gini_scorer
        else:
            raise Exception(f'Unknown criterion: {self.criterion}')
        super().fit(X, Y)
        self.classes_ = sorted(set(Y))

    # Select a default value that is going to be used if we decide to make a leaf.
    # We will select the most common value.
    def get_default_value(self, Y):
        self.class_distribution = Counter(Y)
        return self.class_distribution.most_common(1)[0][0]
    
    # Checks whether a set of output values is homogeneous. In the classification case, 
    # this means that all output values are identical.
    # We assume that we called get_default_value just before, so that we can access
    # the class_distribution attribute. If the class distribution contains just one item,
    # this means that the set is homogeneous.
    def is_homogeneous(self, Y):
        return len(self.class_distribution) == 1
        
    # Finds the best splitting point for a given feature. We'll keep frequency tables (Counters)
    # for the upper and lower parts, and then compute the impurity criterion using these tables.
    # In the end, we return a triple consisting of
    # - the best score we found, according to the criterion we're using
    # - the id of the feature
    # - the threshold for the best split
    def best_split(self, X, Y, feature):

        # Create a list of input-output pairs, where we have sorted
        # in ascending order by the input feature we're considering.
        sorted_indices = np.argsort(X[:, feature])        
        X_sorted = list(X[sorted_indices, feature])
        Y_sorted = list(Y[sorted_indices])

        n = len(Y)

        # The frequency tables corresponding to the parts *before and including*
        # and *after* the current element.
        low_distr = Counter()
        high_distr = Counter(Y)

        # Keep track of the best result we've seen so far.
        max_score = -np.inf
        max_i = None

        # Go through all the positions (excluding the last position).
        for i in range(0, n-1):

            # Input and output at the current position.
            x_i = X_sorted[i]
            y_i = Y_sorted[i]
            
            # Update the frequency tables.
            low_distr[y_i] += 1
            high_distr[y_i] -= 1

            # If the input is equal to the input at the next position, we will
            # not consider a split here.
            #x_next = XY[i+1][0]
            x_next = X_sorted[i+1]
            if x_i == x_next:
                continue

            # Compute the homogeneity criterion for a split at this position.
            score = self.criterion_function(i+1, low_distr, n-i-1, high_distr)

            # If this is the best split, remember it.
            if score > max_score:
                max_score = score
                max_i = i

        # If we didn't find any split (meaning that all inputs are identical), return
        # a dummy value.
        if max_i is None:
            return -np.inf, None, None

        # Otherwise, return the best split we found and its score.
        split_point = 0.5*(X_sorted[max_i] + X_sorted[max_i+1])
        return max_score, feature, split_point

In [None]:
## Tune the hyperparameter max_depth:)

optimal_depth = 0
otimal_score = 0
clf = TreeClassifier(max_depth=0)
for depth in range(1, 10):
    clf.max_depth = depth
    score = cross_val_score(clf, Xtrain, Ytrain)
    if score.mean() > otimal_score:
        otimal_score = score.mean()
        optimal_depth = depth

print(f'Optimal depth: {optimal_depth}')
print(f'Optimal score: {otimal_score}')


Optimal depth: 5
Optimal score: 0.9064705882352943


In [21]:
from sklearn.metrics import accuracy_score
clf = TreeClassifier(max_depth=5)
clf.fit(Xtrain, Ytrain)
Yguess = clf.predict(Xtest)
print("accuracy score at 5")
print(accuracy_score(Ytest, Yguess))

accuracy score at 5
0.9225352112676056


# Optimal Hyperparameters
- depth = 5
- score = 0.9064705882352943
- accuracy = 0.9225352112676056

# Assignment 3


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the Excel file using Pandas.
alldata = pd.read_excel('Hemnet_data.xlsx')

# # Convert the timestamp string to an integer representing the year.
alldata['year'] = pd.DatetimeIndex(alldata['Sold Date']).year

# Convert 'yes' to 1 and 'no' to 0
alldata['Balcony'] = alldata['Balcony'].map({'Yes': 1, 'No': 0})
alldata['Patio'] = alldata['Patio'].map({'Yes': 1, 'No': 0})
alldata['Lift'] = alldata['Lift'].map({'Yes': 1, 'No': 0})

# Select the 12 input columns and the output column.
selected_columns = ['Final Price (kr)', 'year',  'Num of Room', 'Living Area (m²)', 'Balcony', 'Patio','Current Floor', 'Total Floor', 'Lift', 'Built Year', 'Fee (kr/month)', 'Operating Fee (kr/year)']
alldata = alldata[selected_columns]
alldata = alldata.dropna()

# Shuffle.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)

# Separate the input and output columns.
X = alldata_shuffled.drop('Final Price (kr)', axis=1)
# For the output, we'll use the log of the sales price.
Y = alldata_shuffled['Final Price (kr)'].apply(np.log)

# Split into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

In [28]:
## testing numeric values for cross validation
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate
m1 = DummyRegressor()
cross_validate(m1, Xtrain, Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.00251579, 0.00199914, 0.00201559, 0.00200081, 0.00150084]),
 'score_time': array([0.       , 0.0012784, 0.       , 0.       , 0.       ]),
 'test_score': array([-0.35548711, -0.35827597, -0.31759722, -0.34236524, -0.35596055])}

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
import math


## fix some error
if 'Fee (kr/month)' in Xtrain.columns:
                    Xtrain['Fee (kr/month)'] = Xtrain['Fee (kr/month)'].replace('[^0-9]', '', regex=True).astype(float)
                    Xtest['Fee (kr/month)'] = Xtest['Fee (kr/month)'].replace('[^0-9]', '', regex=True).astype(float)

optimal_depth = 1
optimal_estimator = 0
optimal_layer_size = 0
optimal_alpha = 0
optimal_score = 100
regs = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=69),
    'Random Forest': RandomForestRegressor(random_state=69),
    'Gradient Boosting': GradientBoostingRegressor(random_state=69),
    'Neural Network': MLPRegressor(random_state=69), 
    'Lasso': Lasso(random_state=69),
    'Ridge': Ridge(random_state=69)
}

## find the optimal hyperparams for the regressors
print("Finding optimal hyperparameters")
# Decision Tree
print("Decision Tree")
def update_optimal_depth(optimal_score, depth, score):
    if abs(score['test_score'].mean()) < abs(optimal_score):
        optimal_score = score['test_score'].mean()
        optimal_depth = depth
    return optimal_depth

def update_optimal_estimator(optimal_score, regs, estimator):
    score = cross_validate(regs['Random Forest'], Xtrain, Ytrain, scoring='neg_mean_squared_error')
  
    if abs(score['test_score'].mean()) < abs(optimal_score):
        optimal_score = score['test_score'].mean()
        optimal_estimator = estimator
    return optimal_estimator


for depth in range(1, 10):
    regs['Decision Tree'].max_depth = depth
    score = cross_validate(regs['Decision Tree'], Xtrain, Ytrain, scoring='neg_mean_squared_error')
    optimal_depth = update_optimal_depth(optimal_score, depth, score)
regs['Decision Tree'].max_depth = optimal_depth
print(f'Optimal depth: {optimal_depth}')



print("Random Forest")
optimal_score = 100
optimal_depth = 0
optimal_estimator = 0
# Random Forest
for depth in range(1,10):
    regs['Random Forest'].max_depth = depth
    optimal_depth = update_optimal_depth(optimal_score, depth, score)
print(f'Optimal depth: {optimal_depth}')


optimal_score = 100
for estimator in range(10, 110, 10):
    regs['Random Forest'].n_estimators = estimator
    optimal_estimator = update_optimal_estimator(optimal_score, regs, estimator)

print(f'Optimal estimator: {optimal_estimator}')

regs['Random Forest'].n_estimators = optimal_estimator
regs['Random Forest'].max_depth = optimal_depth



print("Gradient Boosting")
# Gradient Boosting
optimal_score = 100
optimal_depth = 0
optimal_estimator = 0
for estimator in range(10, 100, 10):
    regs['Gradient Boosting'].n_estimators = estimator
    optimal_estimator = update_optimal_estimator(optimal_score, regs, estimator)

optimal_score = 100
for depth in range(1,10):
    regs['Gradient Boosting'].max_depth = depth
    optimal_depth = update_optimal_depth(optimal_score, depth, score)

print(f'Optimal estimator: {optimal_estimator}')
print(f'Optimal depth: {optimal_depth}')
regs['Gradient Boosting'].n_estimators = optimal_estimator
regs['Gradient Boosting'].max_depth = optimal_depth


print ("Lasso and Ridge")
## lasso and ridge
optimal_score = 100
for alpha in np.arange(0.1, 1, 0.1):
    regs['Lasso'].alpha = alpha
    score = cross_validate(regs['Lasso'], Xtrain, Ytrain, scoring='neg_mean_squared_error')
    if abs(score['test_score'].mean()) < abs(optimal_score):
        optimal_score = score['test_score'].mean()
        optimal_alpha = alpha
regs['Lasso'].alpha = optimal_alpha
print (f'Optimal alpha Lasso: {optimal_alpha}')
optimal_score = 100
for alpha in np.arange(0.1, 1, 0.1):
    regs['Ridge'].alpha = alpha
    score = cross_validate(regs['Ridge'], Xtrain, Ytrain, scoring='neg_mean_squared_error')
    if abs(score['test_score'].mean()) < abs(optimal_score):
        optimal_score = score['test_score'].mean()
        optimal_alpha = alpha

regs['Ridge'].alpha = optimal_alpha
print (f'Optimal alpha Ridge: {optimal_alpha}')

print("comparing regressors")
## now lets compare regressors
optimal_score = 100
optimal_reg = None
for name, reg in regs.items():
    score = cross_validate(reg, Xtrain, Ytrain, scoring='neg_mean_squared_error')
    print(f'{name}: {score["test_score"].mean()}')
    if abs(score['test_score'].mean()) < abs(optimal_score):
        optimal_score = score['test_score'].mean()
        optimal_reg = name

print(f'Optimal regressor: {optimal_reg}')
print (f'Optimal score: {optimal_score}')

Finding optimal hyperparameters
Decision Tree
Optimal depth: 9
Random Forest
Optimal depth: 9
Optimal estimator: 100
Gradient Boosting
Optimal estimator: 90
Optimal depth: 9
Lasso and Ridge
Optimal alpha Lasso: 0.1
Optimal alpha Ridge: 0.9
comparing regressors
Linear Regression: -0.22044055517560937
Decision Tree: -0.19552683244067154
Random Forest: -0.15960733607981253
Gradient Boosting: -0.1522798203566916
Neural Network: -20.303110724171958
Lasso: -0.2579509437138855
Ridge: -0.22043976532548956
Optimal regressor: Gradient Boosting
Optimal score: -0.1522798203566916


The optimal Regressor was Gradient Boosting with a max depth of 9 and 90 estimatorss. The score was -0.1522798203566916
We tried:
- Linear Regression: -0.22044055517560937
- Decision Tree: -0.19552683244067154
- Random Forest: -0.15960733607981253
- Gradient Boosting: -0.1522798203566916
- Neural Network: -20.303110724171958
- Lasso: -0.2579509437138855
- Ridge: -0.22043976532548956
- Optimal regressor: Gradient Boosting
- Optimal score: -0.1522798203566916

- Decision Tree  
    - Optimal depth: 9  
- Random Forest  
    - Optimal depth: 9  
    - Optimal estimator: 100  
- Gradient Boosting  
    - Optimal estimator: 90  
    - Optimal depth: 9  
- Lasso and Ridge  
    - Optimal alpha Lasso: 0.1  
    - Optimal alpha Ridge: 0.9  


In [27]:
from sklearn.metrics import mean_squared_error
regr = GradientBoostingRegressor(n_estimators=90, max_depth=9, random_state=69)
regr.fit(Xtrain, Ytrain)
mean_squared_error(Ytest, regr.predict(Xtest))

0.13786071377982884

The mse was 0.13786071377982884