# andbis/basicAlgorithms

initial commit

andbis committed Jul 13, 2019
0 parents commit 659ce5431650be304f65c232c934893924ae83cd
Showing with 606 additions and 0 deletions.
1. +321 −0 library.py
2. +28 −0 readme.txt
3. +257 −0 run.py
 # @@ -0,0 +1,321 @@
# library
import math
from collections import Counter

import numpy as np
import pandas as pd


# min max normalisation
def min_max(differences, range=(0, 1.0)):
    """Rescale ``differences`` linearly onto the interval ``range``.

    Args:
        differences: non-empty sequence or array of numbers.
        range: ``(low, high)`` bounds of the output interval.

    Returns:
        A float ndarray with the same shape as ``differences``. The smallest
        input maps to ``low`` and the largest to ``high``.

    Raises:
        ValueError: if ``differences`` is empty.
    """
    low, high = range
    values = np.asarray(differences, dtype=float)
    min_val = values.min()
    span = values.max() - min_val
    if span == 0:
        # All inputs are equal: avoid 0/0 and map everything to the lower bound.
        return np.full(values.shape, float(low))
    return low + (high - low) * (values - min_val) / span


# zero mean score normalisation
def normalizer(data):
    """Standardise every column of a 2-D array to zero mean and unit variance.

    Args:
        data: 2-D array-like, one row per sample and one column per feature.

    Returns:
        A float ndarray of the same shape where each column has had its mean
        subtracted and been divided by its (population) standard deviation.
        A constant column has a standard deviation of zero and therefore
        yields NaN, exactly as a plain NumPy division does.
    """
    data = np.asarray(data, dtype=float)
    feature_means = data.mean(axis=0)
    feature_sigma = data.std(axis=0)
    return (data - feature_means) / feature_sigma


def _parse_transaction(t, delimiter):
    """Return the distinct items of one transaction in first-seen order."""
    if isinstance(t, str) and delimiter:
        items = t.split(delimiter)
    else:
        items = t
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(items))


# Apriori algorithm
def apriori(D, min_sup=0.2, delimiter=';'):
    """Mine frequent itemsets from the transactions in ``D``.

    Args:
        D: iterable of transactions. A string transaction is split on
            ``delimiter``; any other iterable is taken as the items themselves.
            Items must be strings that do not contain a comma, because
            itemsets are keyed by their comma-joined items.
        min_sup: minimum support, as a fraction of the number of transactions.
        delimiter: separator used to split string transactions.

    Returns:
        A list whose entry ``k - 1`` is a dict mapping every frequent
        k-itemset (items joined with ``','``) to its absolute support count.
        The first entry is always present, even when it is empty.
    """
    transactions = [_parse_transaction(t, delimiter) for t in D]
    number_of_t = len(transactions)
    item_sets = [set(items) for items in transactions]

    # First pass: frequent 1-itemsets are found by scanning D once.
    c1 = {}
    for items in transactions:
        for item in items:
            c1[item] = c1.get(item, 0) + 1
    L = {item: count for item, count in c1.items()
         if count / number_of_t >= min_sup}
    big_L = [L]

    while L:
        # Generate the next candidates by extending each frequent itemset
        # with every item of the frequent itemsets that follow it.
        keys = list(L)
        split_keys = [key.split(',') for key in keys]
        C = {}
        seen = set()
        for idx, i in enumerate(keys):
            i_items = split_keys[idx]
            for j_items in split_keys[idx + 1:]:
                for el in j_items:
                    if el in i_items:
                        continue
                    candidate = frozenset(i_items + [el])
                    # The same itemset can be reached in several item orders;
                    # only the first spelling is kept.
                    if candidate not in seen:
                        seen.add(candidate)
                        C[i + ',' + el] = candidate

        # Scan D for the support of every candidate.
        counts = dict.fromkeys(C, 0)
        for items in item_sets:
            for key, candidate in C.items():
                if candidate <= items:
                    counts[key] += 1

        # Compare candidate support with min_sup.
        L = {key: count for key, count in counts.items()
             if count / number_of_t >= min_sup}
        if L:
            big_L.append(L)
    return big_L


# Euclidean distance used both in K-nn and K-means to calculate the distance
# between two vectors
def euclidean_distance(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    The vectors are paired with ``zip``, so when their lengths differ only the
    leading coordinates they have in common are compared.
    """
    return np.sqrt(sum((p - q) ** 2 for p, q in zip(v1, v2)))


# K_NN functions:
# the two functions below and the OrderedListTuple class are used to hold the
# nearest neighbours
def get(LIST, index):
    """Return the element of ``LIST`` at ``index``."""
    return LIST[index]


def get_value(el):
    """Return the sort value of an ``(index, distance)`` tuple: the distance."""
    return el[1]


class OrderedListTuple:
    """Bounded list of ``(index, distance)`` tuples sorted by ascending distance.

    Once more than ``max_size`` elements are stored, the element with the
    largest distance is dropped.
    """

    def __init__(self, max_size):
        self.content = []
        self.max_size = max_size

    def find_pos(self, element):
        """Return the position at which ``element`` keeps ``content`` sorted."""
        target = get_value(element)
        for index, current in enumerate(self.content):
            if not get_value(current) < target:
                return index
        return len(self.content)

    def insert_element(self, element):
        """Insert ``element`` in sorted position and trim to ``max_size``."""
        self.content.insert(self.find_pos(element), element)
        if len(self.content) > self.max_size:
            self.content.pop()


def k_nen(k, train_x, train_y, test_x):
    """Predict a label for every vector in ``test_x`` with k-nearest neighbours.

    Args:
        k: number of neighbours that vote; must be odd.
        train_x: sequence of training vectors.
        train_y: labels, indexable by position, aligned with ``train_x``.
        test_x: sequence of vectors to classify.

    Returns:
        A list with one predicted label per vector in ``test_x``.

    Raises:
        ValueError: if ``k`` is even.
    """
    if k % 2 != 1:
        raise ValueError('Please enter uneven k')
    results = []
    for v1 in test_x:
        nearest_neighbours = OrderedListTuple(k)
        for i, v2 in enumerate(train_x):
            nearest_neighbours.insert_element((i, euclidean_distance(v1, v2)))
        # Majority vote among the k nearest training points; on a tie the
        # label met first (i.e. the closest one) wins.
        votes = Counter(train_y[i] for i, _ in nearest_neighbours.content)
        results.append(max(votes, key=votes.get))
    return results


# Compares two list of labels and returns the accuracy
def accuracy(labels_1, labels_2):
    """Return the fraction of matching positions, rounded to 4 decimals.

    Raises:
        ValueError: if the label sequences differ in length or are empty.
    """
    if len(labels_1) != len(labels_2):
        raise ValueError('Labels length do not match')
    if len(labels_1) == 0:
        raise ValueError('Cannot compute the accuracy of empty label lists')
    matches = np.count_nonzero(np.array(labels_1) == np.array(labels_2))
    return round(int(matches) / len(labels_1), 4)


# Used to split data set in k parts in cross validation
def new_split(data, idx, k=5):
    """Return fold ``idx`` of ``k`` as the test set and the rest as training set.

    Args:
        data: ndarray whose first axis indexes the samples.
        idx: zero-based index of the fold used for testing.
        k: number of equally sized folds.

    Returns:
        A ``(test, train)`` tuple. Note the order: the test fold comes first.

    Raises:
        ValueError: if the rows cannot be divided into ``k`` equal folds or
            ``idx`` is not in ``range(k)``.
    """
    n_rows = data.shape[0]
    if n_rows % k != 0:
        raise ValueError('This splitter only works for splitting equal sized sets')
    if not 0 <= idx < k:
        raise ValueError('idx must be in range(k)')
    size_of_sets = n_rows // k
    start = size_of_sets * idx
    stop = start + size_of_sets
    test = data[start:stop]
    train = np.delete(data, np.arange(start, stop), axis=0)
    return test, train


# Used to find best_k and sort ascending
def best_k(results, kays):
    """Average the error per k and return the rows sorted by ascending error.

    Args:
        results: sequence of records whose first element is the k that was
            used and whose last element is the classification error.
        kays: the k values to summarise.

    Returns:
        A 2-D ndarray with rows ``[k, mean_error]``, best k first.

    Raises:
        ValueError: if a value in ``kays`` has no record in ``results``.
    """
    classification_error = []
    for k in kays:
        c_k = [a[-1] for a in results if a[0] == k]
        if not c_k:
            raise ValueError('No results found for k=%s' % (k,))
        classification_error.append([k, sum(c_k) / len(c_k)])
    if not classification_error:
        return np.empty((0, 2))
    average = np.array(classification_error)
    return average[average[:, 1].argsort()]


# K_Means functions:
# Iterating through data and assigning class by calculating euclidean distance
# to cluster centers; the closest one assigns its label to the data point
def assignment(centroids, data):
    """Write the index of the nearest centroid into the last column of ``data``.

    ``data`` is modified in place and also returned. Distances come from
    ``euclidean_distance``, which only compares the leading coordinates a row
    and a centroid have in common.

    Raises:
        ValueError: if no centroid lies at a finite distance from a row.
    """
    for i, a_vector in enumerate(data):
        closest, closest_dist = None, math.inf
        for idx, centroid_vector in enumerate(centroids):
            c_dist = euclidean_distance(a_vector, centroid_vector)
            if c_dist < closest_dist:
                closest, closest_dist = idx, c_dist
        if closest is None:
            raise ValueError('No centroid at a finite distance from row %d' % i)
        data[i, -1] = closest
    return data


# calculates the mean cluster and returns the new centroids/cluster centers
def mean_cluster(centroids, data):
    """Return new centroids as the mean of the points assigned to each cluster.

    Args:
        centroids: 2-D array of current centroids; only its shape is used.
        data: 2-D array whose last column holds the cluster index of each row.

    Returns:
        An ndarray shaped like ``centroids``. A cluster without any member
        gets an all-zero centroid.
    """
    k, dimensions = centroids.shape[0], centroids.shape[1]
    new_centroids = np.zeros(centroids.shape)
    for i in range(k):
        cluster = np.array([vector for vector in data if int(vector[-1]) == i])
        if cluster.shape[0] != 0:
            new_centroids[i] = cluster[:, :dimensions].sum(axis=0) / len(cluster)
    return new_centroids


# Adds extra column to hold label
def add_label_col(data):
    """Return a copy of 2-D ``data`` with an extra, zero-filled last column."""
    a = np.zeros((data.shape[0], data.shape[1] + 1))
    a[:, :-1] = data
    return a


# Creates c number of centroids in the range of the data
def centroid_maker(c, data):
    """Draw ``c`` random centroids, uniformly within each column's value range.

    Returns:
        An ndarray of shape ``(c, data.shape[1])``.
    """
    lows = data.min(axis=0)
    highs = data.max(axis=0)
    return np.random.uniform(lows, highs, size=(c, data.shape[1]))


# Used to calculate the purity of the clusters NOT PRECISION
def purity(labeled_data, labels):
    """Return the mean per-cluster purity of a clustering.

    The most frequent true label of a cluster is taken as that cluster's
    label; the purity of the cluster is the share of its points carrying that
    label. The result is the unweighted mean over all clusters.

    Args:
        labeled_data: 1-D array of cluster ids, or a 2-D array whose last
            column holds the cluster ids.
        labels: true labels, indexable by position and convertible with
            ``int``.

    Raises:
        ValueError: if ``labeled_data`` is empty.
    """
    labeled_data = np.asarray(labeled_data)
    if labeled_data.ndim == 1:
        cluster_ids = labeled_data
    else:
        cluster_ids = labeled_data[:, -1]

    # Group the true labels by the cluster each point was assigned to.
    clusters = {}
    for idx, cluster_id in enumerate(cluster_ids):
        clusters.setdefault(int(cluster_id), []).append(labels[idx])
    if not clusters:
        raise ValueError('Cannot compute the purity of an empty clustering')

    scores = []
    for true_labels in clusters.values():
        label_for_cluster = Counter(true_labels).most_common(1)[0][0]
        countlabel = sum(1 for b in true_labels
                         if int(b) == int(label_for_cluster))
        scores.append(countlabel / len(true_labels))
    return sum(scores) / len(scores)
 @@ -0,0 +1,28 @@ Contact me to get the data Algorithms used: Apriori - frequent pattern mining K-NN - nearest neighbour K-Means - clustering, unsupervised Included in this folder should be the following files: run.py - Script file that should be run library.py - Python library file containing all functions used Data Mining - Spring 2018.csv - data - included because it's only 66KB The following has been uploaded separately: run.pdf - Report of experiments IMPORTANT: It's assumed that the file 'Data Mining - Spring 2018.csv' is in the same directory as run.py and library.py, otherwise the data won't be found. The program has been developed and tested on the following setup: OSX: 10.13.3 Python: 3.6.3 - Anaconda build Packages used: Matplotlib Numpy Pandas Math collections.Counter In addition to what the script file run.py will print, it will also save two figures in the directory. #### 0 comments on commit `659ce54`

