questions:
1. what is logistic regression used for in this project?
2. what is naive bayes used for in this project?
3. what metrics and attributes are we using with knn?

# Dependencies

In [1]:
#import pandas for table and csv handling, randomizing dataset
import pandas

#import numpy for fast math calculations
import numpy

#import KFold from SKLearn
from sklearn.model_selection import KFold

#import accuracu from SKLearn
from sklearn.metrics import accuracy_score

In [2]:
def crossEvaluation(filename, k=5):

    #import data
    dataset = importData(filename)

    #get split value
    split = round(len(dataset.index) * (1/k))
    
    #shuffle data
    dataset = dataset.sample(frac=1)

    #split dataset
    testing = dataset.loc[:split]
    training = dataset.loc[split:]

    #use kfold
    kfold = KFold(n_splits=k)

    #kfold split
    for train_index, validation_index in kfold.split(training):

        #split into training and validation
        sub_valid = training.iloc[validation_index]
        sub_train = training.iloc[train_index]

        #get spam and not spam probabilities
        spam_dict, not_spam_dict = getSpamWordProbabilities(sub_train)
        spam_prob = getSpamProbability(sub_train)

        #run model evaluation
        modelEvaluation(sub_valid, spam_dict, not_spam_dict, spam_prob)

        #try knn
        knn = KNN(sub_train, k=k)

        knn.predict(testing)

#crossEvaluation("spambase.csv")

In [3]:
def crossEvaluation(filename, k=5):

    #import data
    dataset = importData(filename)

    #get split value
    split = round(len(dataset.index) * (1/k))
    
    #shuffle data
    dataset = dataset.sample(frac=1)

    #split dataset
    testing = dataset.loc[:split]
    training = dataset.loc[split:]

    #use kfold
    kfold = KFold(n_splits=k)

    #kfold split
    for train_index, validation_index in kfold.split(training):

        #split into training and validation
        sub_valid = training.iloc[validation_index]
        sub_train = training.iloc[train_index]

        #get spam and not spam probabilities
        spam_dict, not_spam_dict = getSpamWordProbabilities(sub_train)
        spam_prob = getSpamProbability(sub_train)

        #run model evaluation
        modelEvaluation(sub_valid, spam_dict, not_spam_dict, spam_prob)

        #try knn
        knn = KNN(sub_train, k=k)

        knn.predict(testing)

#crossEvaluation("spambase.csv")

# Dataset

In [4]:
def importData(filename):

  #import file
  return pandas.read_csv(filename)

# Naive Bayes

In [5]:
def removeAttributes(dataset):

    #drop capital run length average, longest, and total
    dataset = dataset.drop("capital_run_length_average", axis='columns')
    dataset = dataset.drop("capital_run_length_longest", axis='columns')
    dataset = dataset.drop("capital_run_length_total", axis='columns')

    return dataset

In [6]:
def roundToOne(dataset):
    return dataset.mask(dataset > 0, 1)

In [7]:
def getSpamOccurrences(dataset):

    #positive = spam
    #negative = not spam

    #get all rows where spam value is set to 1 and 0 respectively
    spam = dataset.loc[dataset['spam'] == 1]
    not_spam = dataset.loc[dataset['spam'] == 0]

    #get total occurrences of spam, not spam, and total
    spam_length = len(spam.index)
    not_spam_length = len(not_spam.index)
    total_length = spam_length + not_spam_length

    #return tuple with the number of spam and not spam emails in the dataset
    return (spam_length, not_spam_length, total_length)

In [8]:
def getSpamProbability(dataset):
    spam, not_spam, total = getSpamOccurrences(dataset)
    return (spam / total)


In [9]:
def getDatasetLength(dataset):
    return len(dataset.index)

In [10]:
def getWordProbabilities(dataset):

    #round all non-zero values to 1
    dataset = roundToOne(dataset)

    #define dictionary for storing word probabilities
    word_probability = dict()

    #get number of unique features (for laplace smoothing)
    unique = len(dataset.columns)

    #iterate through columns
    for column in dataset.columns:

        #sum all values in the current column
        #add 1 for laplace smoothing
        #divide by the dataset length
        #add number of unique values (number of columns)
        #to the dataset length for laplace smoothing
        word_probability[column] = ((dataset[column].sum() + 1) 
        / (getDatasetLength(dataset) + unique))

    #return word probability dictionary
    return word_probability

In [11]:
def getSpamWordProbabilities(dataset):

    spam = dataset.loc[dataset['spam'] == 1]
    not_spam = dataset.loc[dataset['spam'] == 0]

    spam_probabilities = getWordProbabilities(spam)
    not_spam_probabilities = getWordProbabilities(not_spam)

    return (spam_probabilities, not_spam_probabilities)



# KNN
Use parallel programming

In [12]:
def modelEvaluation(dataset, spam_probabilities, not_spam_probabilities, spam_probability):

    #get spam probability

    predicted_list = []

    for index, email in dataset.iterrows():

        #reset spam predicted
        spam_predicted = 1
        not_spam_predicted = 1

        for column in dataset.iloc[:,:-1]:

            if email[column] > 0.:
                spam_predicted *= spam_probabilities[column]
                not_spam_predicted *= not_spam_probabilities[column]
                
        spam_predicted = spam_predicted * spam_probability
        not_spam_predicted = not_spam_predicted * (1 - spam_probability)

        if spam_predicted > not_spam_predicted:
            predicted_list.append(1)
        else:
            predicted_list.append(0)

    print(accuracy_score(predicted_list, dataset['spam']))

                

In [13]:
class KNN:

    def __init__(self, dataset, k=3):
        self.k = k

        #x_train = attributes
        self.x_train = dataset.iloc[:,:-1]

        #y_train = label
        self.y_train = dataset.iloc[:,-1]

    def _cosine_distance(self, vector1, vector2):

        #define subfunction for getting dot product
        #a dot product is the sum of the product of all components of two vectors
        #print(vector1, vector2)
        return (1 - distance.cosine(vector1, vector2))

    def predict(self, testing):
        testing = testing.iloc[:,:-1]
        print("starting...")
        distances = cosine_distances(self.x_train, testing)
        #get k closest
        print("ending...")
        print(distances)


    def getDistances(self, x):
        #compute distance using cosine
        distances = [self._cosine_distance(x, x_train) for index, x_train in self.x_train.iterrows()]

        #get closest k
        k_indices = numpy.argsort(distances)[:self.k]
        k_labels = [self.y_train.to_numpy()[index] for index in k_indices]
        
        return max(set(k_labels), key=k_labels.count)




In [40]:
from multiprocessing import Process, Queue

import dill


def mp_worker(queue):

    while queue.qsize() >0 :
        record = queue.get()
        print(record)

    print("worker closed")

class KNN2:

    def __init__(self, dataset, k=5):

        self.k = k

        self.training = dataset.iloc[:,:-1]
        self.label = dataset.iloc[:,-1]

    def distance(self, vector1, vector2):

        dot = numpy.dot
        norm = numpy.linalg.norm

        cos_similarity = (dot(vector1, vector2) / (norm(vector1) * norm(vector2)))
        return (1 - cos_similarity)
    
    def getDistancesFromPoint(self, vector):

        #the vector is essentially a single row in the dataframe
        #all attributes except those that have been dropped
        #and the label itself are used as part of the cosine distance formula
        distances = []

        #iterate through rows in dataframe
        #for index in self.dataset[0].index:

        for index in self.training.index:

            #get current point from dataset to get distance from
            vector2 = self.training.iloc[index]

            #calculate distance
            distance = self.distance(vector, vector2)

            #append to distance list in dataset
            distances.append((distance, self.label[index]))

        #return distances
        return distances
    
    

    def multiDistances(self, vector, num_processes=10):



        def splitIntoChunks(dataset, num_processes):

            dataset_size = len(dataset.index)
            chunk_size = 1#dataset_size // num_processes
            chunks = []

            #iterate from start to end of dataset by chunk_size
            for index in range(0, dataset_size, chunk_size):
                chunks.append(dataset.iloc[index:index + chunk_size].numpy())

            return chunks

        
        #chunks = splitIntoChunks(self.training, num_processes)
        queue = Queue()
        for index in range(0, len(self.training.index)):
            queue.put(self.training.iloc[index: index+1])
        processes = [Process(target=mp_worker, args=(queue,)) for _ in range(2)]

        for process in processes:
            process.start()
            print('Process started')

        for process in processes:
            process.join()


        #distances = [item for sublist in results for item in sublist]

        #pool.close()
        #pool.join()

        #sort distances list based on 

        """
        distances.sort(key=lambda x: x[0])
        sorted_labels = [label for distance, label in distances]
        """

    def predict(self, vector):

        def findMajority(labels):

            #create occurrences array
            occurrences = []

            #create set of all possible labels
            label_types = set(labels)

            #iterate through label types
            for label in label_types:

                #append label and count of label
                occurrences.append((label, labels.count(label)))

            #find majority label by count using lambda function
            majority = max(occurrences, key=lambda count: count[1])

            #return majority
            return majority

        #labels = self.getDistancesFromPoint(vector)[:self.k]
        self.multiDistances(vector)
        #return findMajority(labels)[0]
            


In [41]:
data = importData("spambase.csv")
knn = KNN2(data)

def testKNN():

    vector1 = data.iloc[:,:-1].iloc[3]
    vector2 = data.iloc[:,:-1].iloc[1]

    majority = knn.predict(vector2)

    print(majority)

    
if __name__ == '__main__':
    testKNN()

Process started
Process started
None


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'mp_worker' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'mp_worker' on <module '__main__' (built-in)>


# Logistic Regression

In [21]:
def linearRegression(dataset):

    #add column to dataset
    dataset = addOneColumn(dataset)

    #create randomized vector m
    vector_m = getVectorM(len(dataset.columns))

    for i in range(1):

        #multiply vector_m by dataset
        dataset.dot(vector_m)

        #apply sigmoid function
        predicted_y = dataset.apply(sigmoid)

        #each line will have a predicted y
        #it will be 1 for each row
        (dataset * (predicted_y - dataset)) * (2 / getDatasetLength(dataset))

        print(predicted_y)

        currentPerformance = performance()



In [81]:
class LogisticRegression:

    def __init__(self, dataset, iterations=100):
        self.training = dataset.iloc[:,:-1]
        self.validation = dataset.iloc[:,:-1]
        self.label = dataset.iloc[:,-1]
        self.iterations = iterations

    def addBias(self):

        #add column of entirely 1's to add bias
        self.dataset["bias"] = 1

        #return modified dataset
        return self.dataset

    #create sigmoid function
    def sigmoid(value):
        
        #return 1 divided by 1 + e to the power of - value
        return 1.0 / ( 1 + numpy.exp( - value ) )
    
    #create decision function
    def decide(value, boundary=0.5):

        #if value is greater than boundary, return true
        #otherwise, return false
        return (value > boundary)
    
    def performance(self, model):

        print(type(model))

        return accuracy_score(self.label, self.decide(model))

    def getBestModel(self):

        #m represents the slope
        #to start, we generate a random m
        #then, we continue to modify m until we reach a convergence point

        def getInitialM(dataset):
            return numpy.random.randn(len(dataset.columns))
        
        def getGradient(dataset, label, predicted_y):
            #print(dataset.shape, len(predicted_y), len(label))
            return numpy.dot(dataset.T, (predicted_y - label)) * (2 / len(dataset.index))
        
        vector_m = getInitialM(self.training)
        best_performance = 0

        #iterate
        for iteration in range(self.iterations):

            #get predicted y
            #print(vector_m.shape, self.training.shape)
            predicted_y = numpy.dot(vector_m, self.training.T)

            gradient = getGradient(self.training, self.label, predicted_y)
            print(gradient)

            print(self.performance(gradient))

    def predict(self, vector_m):

        predicted_y = numpy.dot(vector_m, self.training.T)
        return self.sigmoid(predicted_y)

        #model is a vector of 57 columns, 
        #the result of the dot product of the dataset and the predicted_y

        


            



In [None]:
import numpy as np
class LogisticRegression2:

    def __init__(self, learning_rate=0.001, iterations=2000):
        self.learning_rate = learning_rate
        self.iterartions = iterations
        self.weights = None
        self.bias = None
        self.threshold

    @staticmethod
    def sigmoid(value):
        return 1 / (1 + np.exp( - value))

    def fit(self, data, label):

        sigmoid = LogisticRegression2.sigmoid

        samples, features = data.shape
        self.weights = [0] * features
        self.bias = 0

        for iteration in range(self.iterartions):
            linear_pred = np.dot(data, self.weights) + self.bias
            predictions = sigmoid(linear_pred)

            gradient_weight = (1/samples) * np.dot(data.T, (predictions - label))
            gradient_bias = (1/samples) * np.sum(predictions - label)

            self.weights = self.weights - self.learning_rate * gradient_weight
            self.bias = self.bias - self.learning_rate * gradient_bias


    def predict(self, data, threshold=0.5):

        sigmoid = LogisticRegression2.sigmoid

        linear_pred = np.dot(data, self.weights) + self.bias
        y_predictions = sigmoid(linear_pred)

        predictions = [0 if y <= threshold else 1 for y in y_predictions]

        return predictions


In [None]:
def splitData(dataset):

    x = dataset.iloc[:,:-1]
    y = dataset.iloc[:,-1]

    return x, y

In [None]:
def train():

    data = importData("spambase.csv")
    test_data, test_label = splitData(data)

    def accuracy(y_pred, y_label):
        return np.sum(y_pred==y_label) / len(y_label)


    lr = LogisticRegression2()
    lr.fit(data, label)
    predictions = lr.predict(test_data, test_label)

    acc = accuracy(predictions, y_label) 

In [82]:
def test():
    dataset = importData("spambase.csv")
    lr = LogisticRegression(dataset, iterations=1)
    lr.getBestModel()

test()

[-1.83634502e+02 -1.73199498e+02 -4.10022947e+02 -1.44648132e+02
 -3.65783838e+02 -1.65330845e+02 -1.27529283e+02 -1.59338817e+02
 -2.71578827e+02 -4.14471036e+02 -1.24288533e+02 -6.53912392e+02
 -1.80069790e+02 -1.98790413e+02 -1.59380661e+02 -2.92140424e+02
 -2.32391282e+02 -2.74343505e+02 -1.83959848e+03 -1.97992055e+02
 -1.07687357e+03 -3.82053950e+02 -2.56758801e+02 -1.91357464e+02
 -4.40426333e+02 -1.70526513e+02 -8.23922397e+01 -5.56059263e+01
 -3.09785673e+01 -4.72986865e+01 -3.01083222e+01 -1.75258727e+01
 -1.13578677e+02 -2.02679448e+01 -6.15156609e+01 -6.58572997e+01
 -1.46996441e+02 -7.34157925e+00 -3.68651457e+01 -5.14283358e+01
 -2.54818516e+01 -4.30029079e+01 -3.27430106e+01 -2.72485751e+01
 -1.10425338e+02 -9.82880736e+01 -6.78272259e+00 -2.80072174e+01
 -7.63819720e+01 -2.54599659e+02 -2.01749278e+01 -3.88280340e+02
 -2.09338280e+02 -9.82011261e+01 -2.16292465e+04 -3.22543097e+05
 -1.75938952e+06]
<class 'numpy.ndarray'>


TypeError: '<' not supported between instances of 'float' and 'LogisticRegression'

# Testing

In [23]:
def testDataset(training, test):
  pass

In [24]:
def main():

  #split the dataset

  #use optimize k to find a k value

  #build knn graph

  #use the validation dataset to test the accuracy of the program

  pass