questions:
1. what is logistic regression used for in this project?
2. what is naive bayes used for in this project?
3. what metrics and attributes are we using with knn?

# Dependencies

In [3]:
#import pandas for table and csv handling, randomizing dataset
import pandas

#import numpy for fast math calculations
import numpy

#import KFold from SKLearn
from sklearn.model_selection import KFold

#import accuracu from SKLearn
from sklearn.metrics import accuracy_score

In [4]:
def crossEvaluation(filename, k=5):

    #import data
    dataset = importData(filename)

    #get split value
    split = round(len(dataset.index) * (1/k))
    
    #shuffle data
    dataset = dataset.sample(frac=1)

    #split dataset
    testing = dataset.loc[:split]
    training = dataset.loc[split:]

    #use kfold
    kfold = KFold(n_splits=k)

    #kfold split
    for train_index, validation_index in kfold.split(training):

        #split into training and validation
        sub_valid = training.iloc[validation_index]
        sub_train = training.iloc[train_index]

        #get spam and not spam probabilities
        spam_dict, not_spam_dict = getSpamWordProbabilities(sub_train)
        spam_prob = getSpamProbability(sub_train)

        #run model evaluation
        modelEvaluation(sub_valid, spam_dict, not_spam_dict, spam_prob)

        #try knn
        knn = KNN(sub_train, k=k)

        knn.predict(testing)

#crossEvaluation("spambase.csv")

In [5]:
def crossEvaluation(filename, k=5):

    #import data
    dataset = importData(filename)

    #get split value
    split = round(len(dataset.index) * (1/k))
    
    #shuffle data
    dataset = dataset.sample(frac=1)

    #split dataset
    testing = dataset.loc[:split]
    training = dataset.loc[split:]

    #use kfold
    kfold = KFold(n_splits=k)

    #kfold split
    for train_index, validation_index in kfold.split(training):

        #split into training and validation
        sub_valid = training.iloc[validation_index]
        sub_train = training.iloc[train_index]

        #get spam and not spam probabilities
        spam_dict, not_spam_dict = getSpamWordProbabilities(sub_train)
        spam_prob = getSpamProbability(sub_train)

        #run model evaluation
        modelEvaluation(sub_valid, spam_dict, not_spam_dict, spam_prob)

        #try knn
        knn = KNN(sub_train, k=k)

        knn.predict(testing)

#crossEvaluation("spambase.csv")

# Dataset

In [6]:
def importData(filename):

  #import file
  return pandas.read_csv(filename)

# Naive Bayes

In [7]:
def removeAttributes(dataset):

    #drop capital run length average, longest, and total
    dataset = dataset.drop("capital_run_length_average", axis='columns')
    dataset = dataset.drop("capital_run_length_longest", axis='columns')
    dataset = dataset.drop("capital_run_length_total", axis='columns')

    return dataset

In [8]:
def roundToOne(dataset):
    return dataset.mask(dataset > 0, 1)

In [9]:
def getSpamOccurrences(dataset):

    #positive = spam
    #negative = not spam

    #get all rows where spam value is set to 1 and 0 respectively
    spam = dataset.loc[dataset['spam'] == 1]
    not_spam = dataset.loc[dataset['spam'] == 0]

    #get total occurrences of spam, not spam, and total
    spam_length = len(spam.index)
    not_spam_length = len(not_spam.index)
    total_length = spam_length + not_spam_length

    #return tuple with the number of spam and not spam emails in the dataset
    return (spam_length, not_spam_length, total_length)

In [10]:
def getSpamProbability(dataset):
    spam, not_spam, total = getSpamOccurrences(dataset)
    return (spam / total)


In [11]:
def getDatasetLength(dataset):
    return len(dataset.index)

In [12]:
def getWordProbabilities(dataset):

    #round all non-zero values to 1
    dataset = roundToOne(dataset)

    #define dictionary for storing word probabilities
    word_probability = dict()

    #get number of unique features (for laplace smoothing)
    unique = len(dataset.columns)

    #iterate through columns
    for column in dataset.columns:

        #sum all values in the current column
        #add 1 for laplace smoothing
        #divide by the dataset length
        #add number of unique values (number of columns)
        #to the dataset length for laplace smoothing
        word_probability[column] = ((dataset[column].sum() + 1) 
        / (getDatasetLength(dataset) + unique))

    #return word probability dictionary
    return word_probability

In [13]:
def getSpamWordProbabilities(dataset):

    spam = dataset.loc[dataset['spam'] == 1]
    not_spam = dataset.loc[dataset['spam'] == 0]

    spam_probabilities = getWordProbabilities(spam)
    not_spam_probabilities = getWordProbabilities(not_spam)

    return (spam_probabilities, not_spam_probabilities)



# KNN
Use parallel programming

In [14]:
def modelEvaluation(dataset, spam_probabilities, not_spam_probabilities, spam_probability):

    #get spam probability

    predicted_list = []

    for index, email in dataset.iterrows():

        #reset spam predicted
        spam_predicted = 1
        not_spam_predicted = 1

        for column in dataset.iloc[:,:-1]:

            if email[column] > 0.:
                spam_predicted *= spam_probabilities[column]
                not_spam_predicted *= not_spam_probabilities[column]
                
        spam_predicted = spam_predicted * spam_probability
        not_spam_predicted = not_spam_predicted * (1 - spam_probability)

        if spam_predicted > not_spam_predicted:
            predicted_list.append(1)
        else:
            predicted_list.append(0)

    print(accuracy_score(predicted_list, dataset['spam']))

                

In [15]:
class KNN:

    def __init__(self, dataset, k=3):
        self.k = k

        #x_train = attributes
        self.x_train = dataset.iloc[:,:-1]

        #y_train = label
        self.y_train = dataset.iloc[:,-1]

    def _cosine_distance(self, vector1, vector2):

        #define subfunction for getting dot product
        #a dot product is the sum of the product of all components of two vectors
        #print(vector1, vector2)
        return (1 - distance.cosine(vector1, vector2))

    def predict(self, testing):
        testing = testing.iloc[:,:-1]
        print("starting...")
        distances = cosine_distances(self.x_train, testing)
        #get k closest
        print("ending...")
        print(distances)


    def getDistances(self, x):
        #compute distance using cosine
        distances = [self._cosine_distance(x, x_train) for index, x_train in self.x_train.iterrows()]

        #get closest k
        k_indices = numpy.argsort(distances)[:self.k]
        k_labels = [self.y_train.to_numpy()[index] for index in k_indices]
        
        return max(set(k_labels), key=k_labels.count)




In [63]:
import multiprocessing as multi

import dill

class KNN2:

    def __init__(self, dataset, k=5):

        self.k = k

        self.training = dataset.iloc[:,:-1]
        self.label = dataset.iloc[:,-1]

    def distance(self, vector1, vector2):

        dot = numpy.dot
        norm = numpy.linalg.norm

        cos_similarity = (dot(vector1, vector2) / (norm(vector1) * norm(vector2)))
        return (1 - cos_similarity)
    
    def getDistancesFromPoint(self, vector):

        #the vector is essentially a single row in the dataframe
        #all attributes except those that have been dropped
        #and the label itself are used as part of the cosine distance formula
        distances = []

        #iterate through rows in dataframe
        #for index in self.dataset[0].index:

        for index in self.training.index:

            #get current point from dataset to get distance from
            vector2 = self.training.iloc[index]

            #calculate distance
            distance = self.distance(vector, vector2)

            #append to distance list in dataset
            distances.append((distance, self.label[index]))

        #return distances
        return distances
    
    def testfunc(self, chunk, vector):
        return 1

    def multiDistances(self, vector, num_processes=10):



        def splitIntoChunks(dataset, num_processes):

            dataset_size = len(dataset.index)
            chunk_size = dataset_size // num_processes
            chunks = []

            #iterate from start to end of dataset by chunk_size
            for index in range(0, dataset_size, chunk_size):
                chunks.append(dataset.iloc[index:index + chunk_size])

            return chunks

        
        chunks = splitIntoChunks(self.training, num_processes)

        pool = multi.Pool(processes=num_processes)

        results = pool.starmap(self.testfunc, [(chunk, vector) for chunk in chunks])

        #distances = [item for sublist in results for item in sublist]

        #pool.close()
        #pool.join()

        #sort distances list based on 

        """
        distances.sort(key=lambda x: x[0])
        sorted_labels = [label for distance, label in distances]
        """

    def predict(self, vector):

        def findMajority(labels):

            #create occurrences array
            occurrences = []

            #create set of all possible labels
            label_types = set(labels)

            #iterate through label types
            for label in label_types:

                #append label and count of label
                occurrences.append((label, labels.count(label)))

            #find majority label by count using lambda function
            majority = max(occurrences, key=lambda count: count[1])

            #return majority
            return majority

        #labels = self.getDistancesFromPoint(vector)[:self.k]
        self.multiDistances(vector)
        #return findMajority(labels)[0]
            


In [64]:
data = importData("spambase.csv")
knn = KNN2(data)

def testKNN():

    vector1 = data.iloc[:,:-1].iloc[3]
    vector2 = data.iloc[:,:-1].iloc[1]

    majority = knn.predict(vector2)

    print(majority)

    
if __name__ == '__main__':
    testKNN()

Process SpawnPoolWorker-366:
Traceback (most recent call last):
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'KNN2' on <module '__main__' (built-in)>
Process SpawnPoolWorker-367:
Traceback (most recent call last):
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/Alex/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/Alex/opt/anaconda3/lib/pyt

KeyboardInterrupt: 

# Linear Regression

In [None]:
def sigmoid(value):
    return 1 / (1 + numpy.exp(-value))

In [None]:
def addOneColumn(dataset):

    #return dataset with new column with all values initialized to one
    dataset["added_column"] = 1
    return dataset

In [None]:
def getVectorM(lengthOfDataset):

    #vector is represented as an array of numbers
    #amount of numbers is determined by number of columns in dataset dataframe
    #the vector should be one-dimensional

    #first parameter = number of elements in first array
    #example code has 1 passed in to signify this is a one-dimensional array

    return numpy.random.randn(lengthOfDataset)

In [None]:
def splitData(dataset):

    #drop label column for predicted
    dataset = dataset.drop(columns=['spam'])
    label = dataset['spam']

    return dataset, label

In [None]:
def performance(true, predicted):
    pass

In [None]:
def linearRegression(dataset):

    #add column to dataset
    dataset = addOneColumn(dataset)

    #create randomized vector m
    vector_m = getVectorM(len(dataset.columns))

    for i in range(1):

        #multiply vector_m by dataset
        dataset.dot(vector_m)

        #apply sigmoid function
        predicted_y = dataset.apply(sigmoid)

        #each line will have a predicted y
        #it will be 1 for each row
        (dataset * (predicted_y - dataset)) * (2 / getDatasetLength(dataset))

        print(predicted_y)

        currentPerformance = performance()



In [None]:
def test():
    dataset = importData("spambase.csv")
    dataset = removeAttributes(dataset)
    dataset = addOneColumn(dataset)

test()

# Testing

In [None]:
def testDataset(training, test):
  pass

In [None]:
def main():

  #split the dataset

  #use optimize k to find a k value

  #build knn graph

  #use the validation dataset to test the accuracy of the program

  pass