In [497]:
import time
import datetime
import random
from sys import maxsize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn import preprocessing, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from IPython.display import display, HTML

# Utilities

In [498]:
"""
Sign of learningRate determines whether to use gradient ascent/descent:
Negative learningRates = descent, positive = ascent
"""
def SGO(x, y, seed, gradientFunc, learningRate, error, maxIter):
    x = np.array(x)
    y = np.array(y)

    # Reshape 1-D arrays to column format
    # if len(x.shape) == 1:
    #     x = x.reshape(-1, 1)

    N = len(x)
    currentError = maxsize
    lastError = 0
    beta = np.array(seed)
#     print('SGO weight shape:', beta.shape)
    i = 0

    while i < maxIter:
        gradient = gradientFunc(beta, x, y)
#         yPredicted = x.dot(beta)
#         sqErrGradient = np.array(np.dot(x.T, (yPredicted - y)) / N)
        beta += learningRate * gradient
#         currentError = np.sum(np.square(y - yPredicted)) / N
        currentError = np.sum(gradient)
#         print(currentError)

        if abs(lastError - currentError) < error:
            break
        lastError = currentError
        i += 1

    return beta

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dSigmoid(y):
    return y * (1.0 - y)

def tanh(x):
    return np.tanh(x)

def dTanh(y):
    return 1 - y * y

# Neural Network

In [592]:
class NeuralNetwork:
    """
    inputNeurons: number of input neurons
    hiddenLayerShape: (number of layers, number of neurons per layer) tuple
    outputNeurons: number of output neurons
    """
    def __init__(self, inputNeurons, hiddenLayerShape, outputNeurons,
                 learnRate=1e-1, rateDecay=1e-2, momentum=0, sigmoid=sigmoid, dSigmoid=dSigmoid):
        if hiddenLayerShape[0] > 1:
            raise ValueError('Not supporting more than 1 hidden layer yet')
            
        self.inputNeurons = inputNeurons + 1
        self.hiddenLayerShape = hiddenLayerShape
        self.outputNeurons = outputNeurons
        self.rateDecay = rateDecay
        self.momentum = momentum
        self.sigmoid = sigmoid
        self.dSigmoid = dSigmoid
        hiddenLayers, hiddenNeurons = hiddenLayerShape
        
        input_range = 1.0 / self.inputNeurons ** (1/2)
        output_range = 1.0 / hiddenNeurons ** (1/2)
        self.w_input = np.random.normal(loc=0, scale=input_range,
                                   size=(self.inputNeurons, hiddenNeurons))
        self.w_hidden = np.random.normal(loc=0, scale=output_range,
                                        size=(hiddenLayers, hiddenNeurons, hiddenNeurons))
        self.w_output = np.random.normal(loc=0, scale=output_range,
                                   size=(hiddenNeurons, self.outputNeurons))
        
        self.a_input = np.ones(self.inputNeurons, dtype=float)
        self.a_hidden = np.ones(shape=(hiddenLayers, hiddenNeurons), dtype=float)
        self.a_output = np.ones(self.outputNeurons, dtype=float)
        
#         print('Input weights:', self.w_input)
#         print('Hidden weights:', self.w_hidden)
#         print('Output weights:', self.w_output)
        
        self.c_input = np.zeros(shape=(self.inputNeurons, hiddenNeurons), dtype=float)
        self.c_hidden = np.zeros(shape=(hiddenLayers, hiddenNeurons, hiddenNeurons), dtype=float)
        self.c_output = np.zeros(shape=(hiddenNeurons, self.outputNeurons), dtype=float)
        
        self.learnRate = learnRate
        self._currentLearnRate = learnRate
    
    def fit(self, x, y, iterations=100):
        stepSize = step = iterations // 10
        xy = list(zip(x, y))
        self._currentLearnRate = self.learnRate
        
        for i in range(iterations):
            random.shuffle(xy)
            error = 0.0
            for xVal, yVal in xy:
                self._feedForward(xVal)
                error = self._backPropagate(yVal)
            if i == step:
                print('Error={}, current learning rate={}'.format(
                        error, self._currentLearnRate))
                step += stepSize
            
#             self._currentLearnRate = self.learnRate / (1 + (i / self.T))
            self._currentLearnRate = self._currentLearnRate * (self._currentLearnRate / \
                                                               (self._currentLearnRate + \
                                                                (self._currentLearnRate * self.rateDecay)))
    
    def predict(self, x):
        predictions = []
        for xVal in x:
            predict = self._feedForward(xVal)
            predict[np.argmax(predict)] = 1
            predict[predict < 1.0] = 0
#             print(predict)
            predictions.append(predict)
        return np.array(predictions, dtype=int)
    
    def score(self, x, y):
        if type(x) != np.ndarray:
            x = np.array(x)
        if type(y) != np.ndarray:
            y = np.array(y)
            
        correct = 0
        predictions = self.predict(x)
        precision, recall, f1, support = precision_recall_fscore_support(
            y, predictions, average='macro')
        
        for prediction, yTruth in zip(predictions, y):
            if np.array_equal(prediction, yTruth):
                correct += 1
                
        accuracy = correct / len(y)
#         print('Predictions:', predictions)
#         print('Truth:', y)
        
        return accuracy, precision, recall, f1
    
    def _feedForward(self, x):
        if type(x) != np.ndarray:
            x = np.array(x)
        
        self.a_input[:-1] = x
        hiddenLayers, hiddenNeurons = self.hiddenLayerShape
        
        if hiddenLayers > 0:
#             self.a_hidden[0] = sigmoid(self.a_input * self.w_input)
            for j in range(hiddenNeurons):
                activation = 0.0
                
                for i in range(self.inputNeurons):
                    activation += self.a_input[i] * self.w_input[i][j]
                self.a_hidden[0][j] = tanh(activation)
        
            for layer in range(1, hiddenLayers):
                for j in range(hiddenNeurons):
                    activation = 0.0

                    for i in range(hiddenNeurons):
                        activation += self.a_hidden[layer][i] * self.w_hidden[layer][i][j]
                    self.a_hidden[layer][j] = tanh(activation)
        
        for k in range(self.outputNeurons):
            activation = 0.0
            
#             self.a_output = sigmoid(np.sum(self.a_hidden[-1] * self.w_output[:][k], axis=0))
            for j in range(hiddenNeurons):
                activation += self.a_hidden[-1][j] * self.w_output[j][k]
            
            self.a_output[k] = sigmoid(activation)
        return self.a_output[:]
    
    """
    Update weights and return the current error
    @param y must be one-hot encoded representation of class
    """
    def _backPropagate(self, y):
        if type(y) != np.ndarray:
            y = np.array(y)
        
        hiddenLayers, hiddenNeurons = self.hiddenLayerShape
        outputDeltas = self.dSigmoid(self.a_output) * (self.a_output - y)
        hiddenDeltas = np.zeros(shape=self.a_hidden.shape, dtype=float)
        
        # Calculate last hidden layer error
        error = np.dot(self.w_output, outputDeltas)
        hiddenDeltas[-1] = dTanh(self.a_hidden[-1]) * error
        
        # Adjust final hidden layer weights
        change = outputDeltas * np.reshape(self.a_hidden[-1], (self.a_hidden[-1].shape[0], 1))
        self.w_output -= self._currentLearnRate * change + self.c_output * self.momentum
        self.c_output = change
        
#         print('C_Output:', self.c_output)

        # TODO: Finish hidden layer error propagation for multiple hidden layers
        for layer in reversed(range(hiddenLayers - 1)):
            print('Backpropping through layer', layer)
            error = np.dot(self.w_hidden[layer + 1], hiddenDeltas[layer + 1])
            hiddenDeltas[layer] = dTanh(self.a_hidden[layer + 1]) * error
            change = hiddenDeltas[layer + 1] * np.reshape(self.a_hidden[layer], (self.a_hidden[layer].shape[0], 1))
            self.w_hidden[layer] -= self._currentLearnRate * change + self.c_hidden[layer] * self.momentum
            self.c_hidden[layer] = change
#         print('Hidden deltas:', hiddenDeltas)
        
        # Update input weights
        change = hiddenDeltas[0] * np.reshape(self.a_input, (self.a_input.shape[0], 1))
        self.w_input -= self._currentLearnRate * change + self.c_input * self.momentum
        self.c_input = change
            
#         print('C_Input:', self.c_input)
        error = np.sum(0.5 * np.square(y - self.a_output))
#         print('Overall error:', error)
#         print('new learning rate:', self._currentLearnRate)
        return error

# Predicting Malignancy of Breast Cancer Cases
## Source: [UCI ML Repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)

In [593]:
breastCancerDf = pd.read_csv('data/breast-cancer-wisconsin.csv')
breastCancerDf['Class_2'] = breastCancerDf.Class.apply(lambda x: 1 if x == 2 else 0)
breastCancerDf['Class_4'] = breastCancerDf.Class.apply(lambda x: 1 if x == 4 else 0)
breastCancerDf.drop(breastCancerDf[breastCancerDf['BareNuclei'] == '?'].index, inplace=True)


yColumn = ['Class_2', 'Class_4']
xColumns = [col for col in breastCancerDf.columns
            if col != 'ID' and col not in yColumn
           and col != 'BareNuclei' and col != 'Class']
display(breastCancerDf[xColumns + yColumn].head())

benignDf = breastCancerDf.loc[breastCancerDf.Class_2 == 1]
malignantDf = breastCancerDf.loc[breastCancerDf.Class_4 == 1]
percentMalignant = len(breastCancerDf['Class'].loc[breastCancerDf['Class_4'] == 1]) / len(breastCancerDf['Class_4'])

malignantTraining = malignantDf.sample(frac=2/3)
malignantDf.drop(malignantTraining.index, inplace=True)
benignTraining = benignDf.sample(n=len(malignantTraining))
benignDf.drop(benignTraining.index, inplace=True)

benignTest = benignDf.sample(n=len(malignantDf) // percentMalignant)
print('Benign training examples: {}, malignant training examples: {}'.format(
        len(benignTraining), len(malignantTraining)))
print('Benign test examples: {}, malignant test examples: {}'.format(
    len(malignantDf), len(benignTest)))

trainSet = pd.concat([malignantTraining, benignTraining])
display(trainSet.head())
xTrain, yTrain = trainSet[xColumns].values, trainSet[yColumn].values
testSet = pd.concat([malignantDf, benignTest])
xTest, yTest = testSet[xColumns].values, testSet[yColumn].values

print('Number of samples:', len(breastCancerDf))
print('% malignant', percentMalignant)

# TODO: 50/50 split by label (Class)
# xTrain, xTest, yTrain, yTest = train_test_split(breastCancerDf[xColumns].as_matrix(),
#                                                breastCancerDf[yColumn].as_matrix(),
#                                                test_size=1/3, random_state=int(time.time()))

Unnamed: 0,ClumpThickness,CellSizeUniformity,CellShapeUniformity,MarginalAdhesion,SingleEpithelialCellSize,BlandChromatin,NormalNucleoli,Mitoses,Class_2,Class_4
0,5,1,1,1,2,3,1,1,1,0
1,5,4,4,5,7,3,2,1,1,0
2,3,1,1,1,2,3,1,1,1,0
3,6,8,8,1,3,3,7,1,1,0
4,4,1,1,3,2,3,1,1,1,0


Benign training examples: 159, malignant training examples: 159
Benign test examples: 80, malignant test examples: 228


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)


Unnamed: 0,ID,ClumpThickness,CellSizeUniformity,CellShapeUniformity,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class,Class_2,Class_4
166,1197993,5,6,7,8,8,10,3,10,3,4,0,1
391,1151734,10,8,7,4,3,10,7,9,1,4,0,1
366,95719,6,10,10,10,8,10,7,10,7,4,0,1
465,1296572,10,9,8,7,6,4,7,10,3,4,0,1
205,1218105,5,10,10,9,6,10,7,10,5,4,0,1


Number of samples: 683
% malignant 0.34992679355783307


In [594]:
nn = NeuralNetwork(8, (1, 8), 2, learnRate=0.01,
                   rateDecay=0.01, momentum=0.5)
nn.fit(xTrain, yTrain)
accuracy, precision, recall, f1 = nn.score(xTest, yTest)
print('Accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(
        accuracy, precision, recall, f1))

Error=0.05681236873750618, current learning rate=0.009052869546929835
Error=0.23455783495984606, current learning rate=0.008195444703372955
Error=0.4374534743876687, current learning rate=0.007419229177871238
Error=0.1764810021008076, current learning rate=0.006716531388604378
Error=0.30506249743890373, current learning rate=0.006080388246889491
Error=0.4606449738128672, current learning rate=0.005504496159377595
Error=0.7671415417441885, current learning rate=0.004983148565242165
Error=0.12308269714871753, current learning rate=0.00451117938941079
Error=0.21361014980745496, current learning rate=0.004083911851513446
Accuracy: 0.7403, precision: 0.3701, recall: 0.5000, F1: 0.4254


  'precision', 'predicted', average, warn_for)


# Credit Card Fraud Detection
## Source: https://www.kaggle.com/dalpozz/creditcardfraud

In [588]:
ccDf = pd.read_csv('data/creditcard.csv')
ccDf.drop('Unnamed: 0', axis=1, inplace=True)

yCols = []
for val in ccDf.Class.unique():
    print(val)
    yCols.append('Class_' + str(val))
    ccDf['Class_' + str(val)] = ccDf.Class.apply(lambda x: 1 if x == val else 0)
    
ccDf.drop('Class', axis=1, inplace=True)
display(ccDf.head())

xCols = [col for col in ccDf.columns if col not in yCols]
print(xCols)

normalCCDf = ccDf.loc[ccDf.Class_0 == 1]
fraudCCDf = ccDf.loc[ccDf.Class_1 == 1]
percentFraud = len(ccDf.loc[ccDf.Class_1 == 1]) / len(ccDf)
print('Percent fraud samples:', percentFraud)

fraudTraining = fraudCCDf.sample(frac=2/3)
fraudCCDf.drop(fraudTraining.index, inplace=True)
normalTraining = normalCCDf.sample(n=len(fraudTraining))
normalCCDf.drop(normalTraining.index, inplace=True)

normalTest = normalCCDf.sample(n=len(fraudCCDf) // percentFraud)
print('Normal training examples: {}, fraud training examples: {}'.format(
        len(normalTraining), len(fraudTraining)))
print('Normal test examples: {}, fraud test examples: {}'.format(
    len(normalTest), len(fraudCCDf)))

trainSet = pd.concat([fraudTraining, normalTraining])
display(trainSet.head())
xTrain, yTrain = trainSet[xCols].values, trainSet[yCols].values
testSet = pd.concat([fraudCCDf, normalTest])
xTest, yTest = testSet[xCols].values, testSet[yCols].values

# xTrain, xTest, yTrain, yTest = train_test_split(ccDf[xCols].as_matrix(),
#                                                ccDf[yCols].as_matrix(),
#                                                test_size=1/3, random_state=int(time.time()))

0
1


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class_0,Class_1
0,83885.0,0.075672,0.525646,1.182409,-0.63447,-0.503638,-1.103206,0.274049,-1.085006,-0.216463,...,-0.496937,-0.04131,0.777897,0.675924,0.837559,0.040744,0.201063,35.0,1,0
1,162128.0,-0.909553,0.27307,-0.401854,-2.200167,3.732417,3.236887,0.692304,0.420759,0.121275,...,-0.779348,-0.282044,0.707524,-0.144915,0.070718,-0.215493,-0.303012,9.35,1,0
2,153397.0,2.00239,-1.125016,0.376397,-0.162932,-1.730734,-0.195328,-1.474401,0.100887,0.706373,...,0.37478,0.268808,-0.042875,-0.49146,-0.267349,0.099177,-0.014347,30.0,1,0
3,169316.0,1.709112,-0.424756,-0.374813,1.670607,-0.434444,-0.226464,-0.026986,-0.099511,1.006332,...,-1.158619,0.333637,-0.063798,-0.340083,-1.117586,0.043548,-0.008942,119.95,1,0
4,151224.0,1.695085,-0.751412,-1.380952,0.256114,-0.033279,-0.400314,0.281706,-0.26091,0.892732,...,-0.275143,0.06504,0.657225,-0.072854,-0.135481,-0.053881,-0.01585,190.39,1,0


['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
Percent fraud samples: 0.0014922316177546303
Normal training examples: 34, fraud training examples: 34
Normal test examples: 11392, fraud test examples: 17


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  locs = rs.choice(axis_length, size=n, replace=replace, p=weights)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class_0,Class_1
240,109298.0,-1.000611,3.34685,-5.534491,6.835802,-0.299803,0.095951,-2.440419,1.286301,-2.766438,...,-0.694099,0.29966,-0.657601,0.101648,0.430457,0.824685,0.326952,186.13,0,1
812,93904.0,-11.320633,7.19195,-13.179083,9.099552,-10.094749,-2.440115,-14.184337,4.452503,-6.24196,...,-0.350563,0.483044,0.661133,-0.396522,-0.413315,-0.997548,-0.235036,37.93,0,1
23542,35771.0,-3.218952,2.708535,-3.263042,1.361866,-1.645776,-1.852982,-3.069958,-1.796876,-0.213356,...,-0.890421,-0.325814,0.12304,-0.093014,0.232106,-0.310519,-0.745295,60.6,0,1
4873,27187.0,-24.590245,14.044567,-26.278701,6.320089,-18.224513,-4.609968,-17.681003,16.213627,-3.794093,...,-1.804874,-1.140605,0.152234,1.715997,-0.220471,1.434951,0.422492,99.99,0,1
30650,26833.0,-20.532751,12.373989,-23.009003,6.144821,-15.587296,-4.384491,-15.939003,13.696416,-3.948455,...,-1.466115,-0.856779,0.125777,1.402587,-0.223755,1.574249,0.469201,99.99,0,1


In [589]:
creditCardNN = NeuralNetwork(len(xCols), (1, len(xCols)), 2, learnRate=1/np.sqrt(xTrain.shape[0]))
creditCardNN.fit(xTrain, yTrain, 1000)
accuracy, precision, recall, f1 = creditCardNN.score(xTest, yTest)
print('Accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(
        accuracy, precision, recall, f1))

Error=0.43292551867245227, current learning rate=0.04483406998259172
Error=0.3168139172842443, current learning rate=0.016575658366912594
Error=0.24337240391365633, current learning rate=0.006128206749984567
Error=0.24880853233267383, current learning rate=0.002265666746940287
Error=0.25009858535212454, current learning rate=0.0008376423997450648
Error=0.25041525137488563, current learning rate=0.00030968578710802093
Error=0.25015580473710103, current learning rate=0.00011449430779280405
Error=0.25004083009690686, current learning rate=4.232982933886098e-05
Error=0.25001698213556367, current learning rate=1.5649812522555025e-05
Accuracy: 0.9985, precision: 0.4993, recall: 0.5000, F1: 0.4996


  'precision', 'predicted', average, warn_for)


# Alphabet Letter Recognition
## Source: https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/

In [590]:
letterDf = pd.read_csv('data/letter-recognition.csv').sample(n=10000)
display(letterDf.head())

yCols = []
for val in letterDf.Letter.unique():
    yCols.append('Letter_' + str(val))
    letterDf['Letter_' + str(val)] = letterDf.Letter.apply(lambda x: 1 if x == val else 0)
    
letterDf.drop('Letter', axis=1, inplace=True)

xCols = [col for col in letterDf.columns if col not in yCols]
print(xCols)
print(yCols)

xTrain, xTest, yTrain, yTest = train_test_split(letterDf[xCols].as_matrix(),
                                               letterDf[yCols].as_matrix(),
                                               test_size=1/3, random_state=int(time.time()))

Unnamed: 0,Letter,XBoxPos,YBoxPos,Width,Height,PixelCount,XBar,YBar,X2Bar,Y2Bar,XYBar,X2YBar,XY2Bar,XEdge,XEdgeVY,YEdge,YEdgeVX
1511,I,2,8,2,6,2,7,7,0,7,7,6,8,0,8,3,8
11815,D,4,8,6,6,8,9,8,5,4,7,6,6,4,8,8,6
10269,H,2,1,2,1,2,8,8,5,5,7,6,8,3,8,2,7
4330,K,3,9,4,7,2,3,7,7,3,7,7,11,3,8,3,10
4687,T,2,3,3,2,1,5,12,3,6,11,9,5,1,10,1,5


['XBoxPos', 'YBoxPos', 'Width', 'Height', 'PixelCount', 'XBar', 'YBar', 'X2Bar', 'Y2Bar', 'XYBar', 'X2YBar', 'XY2Bar', 'XEdge', 'XEdgeVY', 'YEdge', 'YEdgeVX']
['Letter_I', 'Letter_D', 'Letter_H', 'Letter_K', 'Letter_T', 'Letter_C', 'Letter_B', 'Letter_P', 'Letter_N', 'Letter_F', 'Letter_A', 'Letter_E', 'Letter_Q', 'Letter_L', 'Letter_O', 'Letter_S', 'Letter_X', 'Letter_M', 'Letter_Z', 'Letter_G', 'Letter_U', 'Letter_V', 'Letter_J', 'Letter_W', 'Letter_Y', 'Letter_R']


In [522]:
letterNN = NeuralNetwork(len(xCols), (1, len(xCols)), 26, learnRate=1/np.sqrt(xTrain.shape[0]),
                        rateDecay=0.01, momentum=0.5)
letterNN.fit(xTrain, yTrain)
accuracy, precision, recall, f1 = letterNN.score(xTest, yTest)
print('Accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(
        accuracy, precision, recall, f1))

Error=0.45156698387041694, current learning rate=0.011088009963338282
Error=0.49159799984988567, current learning rate=0.01003783077331597
Error=0.4553765464311084, current learning rate=0.009087117252498727
Error=0.4896388355375738, current learning rate=0.008226448704452643
Error=0.4938778103000324, current learning rate=0.0074472966955919685
Error=0.4826052143881146, current learning rate=0.006741940546247569
Error=0.47999067991045247, current learning rate=0.006103390825833609
Error=0.4863718758250434, current learning rate=0.005525320094019999
Error=0.4717015382907057, current learning rate=0.005002000201619316
Accuracy: 0.0354, precision: 0.0014, recall: 0.0385, F1: 0.0026


  'precision', 'predicted', average, warn_for)


# Predicting University Rank Score Ranges
## Kaggle: https://www.kaggle.com/mylesoneill/world-university-rankings

In [591]:
uniDf = pd.read_csv('data/cwurData.csv')

yCols = ['score_below', 'score_above']
excludedCols = ['score', 'year', 'broad_impact', 'country', 'world_rank', 'institution']
xCols = [col for col in uniDf.columns if col not in yCols
        and col not in excludedCols]
meanScore = uniDf.score.mean()
print('Mean score:', meanScore)
uniDf['score_below'] = uniDf.score.map(lambda x: 1 if x < meanScore else 0)
uniDf['score_above'] = uniDf.score.map(lambda x: 1 if x >= meanScore else 0)
uniDf.drop('score', axis=1, inplace=True)
# uniDf['score_below'] = uniDf.score < meanScore
# uniDf['score_above'] = uniDf.score >= meanScore
# uniDf.loc[uniDf.score_below == False, 'score_below'] = 0
# uniDf.loc[uniDf.score_below == True, 'score_below'] = 1
# uniDf.loc[uniDf.score_above == False, 'score_above'] = 0
# uniDf.loc[uniDf.score_above == True, 'score_above'] = 1

display(uniDf.head())

xTrain, xTest, yTrain, yTest = train_test_split(uniDf[xCols].values,
                                               uniDf[yCols].values,
                                               test_size=1/3, random_state=int(time.time()))

Mean score: 47.79839545454546


Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,year,score_below,score_above
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,2012,0,1
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,2012,0,1
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,2012,0,1
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,2012,0,1
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,2012,0,1


In [557]:
uniNN = NeuralNetwork(len(xCols), (1, len(xCols)), 2, learnRate=1/np.sqrt(xTrain.shape[0]),
                        rateDecay=0.01, momentum=0.5)
uniNN.fit(xTrain, yTrain)
accuracy, precision, recall, f1 = uniNN.score(xTest, yTest)
print('Accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}, F1: {:.4f}'.format(
        accuracy, precision, recall, f1))

Error=0.010189503134013206, current learning rate=0.023643908882466053
Error=0.039086534376192306, current learning rate=0.02140452226924608
Error=0.028360868762501414, current learning rate=0.019377234781783922
Error=0.09065195857558864, current learning rate=0.01754195786597212
Error=0.6074283135533199, current learning rate=0.015880505615838526
Error=0.09256807320736149, current learning rate=0.014376414567947278
Error=0.030564298120408926, current learning rate=0.013014780563620832
Error=0.07825969744507671, current learning rate=0.011782111062437733
Error=0.009402388799017483, current learning rate=0.010666191443568767
Accuracy: 0.7820, precision: 0.3910, recall: 0.5000, F1: 0.4388


  'precision', 'predicted', average, warn_for)
