In [47]:
import numpy as np
from numpy.random import uniform
import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import completeness_score

In [48]:
def argmin(array):
    """Return the index of the smallest element of a non-empty sequence.

    Only a strictly smaller value replaces the current best, so when the
    minimum occurs several times the earliest index is returned.

    Raises:
        ValueError: if ``array`` has length zero.
    """
    if len(array) == 0:
        raise ValueError("argmin() empty sequence")

    best = 0
    for position in range(1, len(array)):
        # Strict comparison keeps the first occurrence on ties.
        if array[position] < array[best]:
            best = position

    return best

def total_sum(array):
    """Add up every element of ``array``, whatever its number of dimensions.

    The input is converted with ``np.array`` and its elements are accumulated
    one by one in row-major order, starting from the integer 0 (which is also
    the result for an empty input).
    """
    values = np.array(array)

    running = 0
    for item in values.ravel():
        running += item

    return running

def sum_over_axis_0(array):
    """Sum a 2-D input down its rows, giving one total per column.

    The input is converted with ``np.array`` and must be two-dimensional
    (unpacking its shape fails otherwise). The result is a float array
    created by ``np.zeros`` with one entry per column.
    """
    matrix = np.array(array)
    n_rows, n_cols = matrix.shape

    totals = np.zeros(n_cols)

    # Walk the matrix row by row; each column total still receives its
    # contributions in increasing row order.
    for r in range(n_rows):
        for c in range(n_cols):
            totals[c] += matrix[r, c]

    return totals

def sum_over_axis_1(array):
    """Sum a 2-D input across its columns, giving one total per row.

    The input is converted with ``np.array`` and must be two-dimensional
    (unpacking its shape fails otherwise). The result is a float array
    created by ``np.zeros`` with one entry per row.
    """
    matrix = np.array(array)
    # Unpacking doubles as the 2-D check; only the row count is needed below.
    n_rows, _ = matrix.shape

    totals = np.zeros(n_rows)

    for r, row in enumerate(matrix):
        for value in row:
            totals[r] += value

    return totals

def mean_axis_0(array):
    """Return the mean of ``array`` along its first axis.

    * If the first element is a ``list`` or ``tuple``, the input is treated as
      a table of rows and a list with one mean per column is returned.
    * Otherwise the elements are added with the built-in ``sum`` and divided
      by their count. For plain numbers this is the scalar mean; for a
      sequence of NumPy row vectors it is their element-wise mean.

    Raises:
        ValueError: if ``array`` is empty, or if the rows of a table have
            different lengths.
    """
    # Check emptiness before touching array[0]: indexing an empty input would
    # raise IndexError and the intended ValueError would never be reached.
    if len(array) == 0:
        raise ValueError("mean_axis_0() empty sequence")

    if not isinstance(array[0], (list, tuple)):
        return sum(array) / len(array)

    rows = len(array)
    cols = len(array[0])

    column_sums = [0] * cols
    for row in array:
        if len(row) != cols:
            raise ValueError("Input rows have inconsistent lengths")
        for i, value in enumerate(row):
            column_sums[i] += value

    return [column_total / rows for column_total in column_sums]

In [49]:
# Compute the Euclidean distance between a point and a set of data rows
def euclidean(point, data):
    """Return the distance from ``point`` to every row of ``data``.

    ``point - data`` must produce a 2-D result, because the squared
    differences are reduced per row with ``sum_over_axis_1``.
    """
    squared_diffs = (point - data) ** 2
    per_row_totals = sum_over_axis_1(squared_diffs)
    return np.sqrt(per_row_totals)

class MyKMeans:
    """K-means clustering built on the helper functions of this notebook.

    Parameters:
        n_clusters: number of centroids to fit.
        max_iter: upper bound on the number of assignment/update rounds.
    """

    def __init__(self, n_clusters=8, max_iter=750):
        self.centroids = None  # list of centroid vectors; stays None until fit() runs
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def fit(self, X):
        """Choose the initial centroids from ``X`` and refine them.

        Seeding: the first centroid is a random row of ``X``; each further one
        is drawn with probability proportional to the row's summed distance to
        the centroids chosen so far.

        Refinement: every row is assigned to its nearest centroid and each
        centroid is replaced by the mean of its rows, until the centroids stop
        changing or ``max_iter`` rounds have run.
        """
        self.centroids = [random.choice(X)]  # start from one random row
        for _ in range(self.n_clusters - 1):
            # Summed distance from every row to the centroids picked so far
            dists = sum_over_axis_0([euclidean(centroid, X) for centroid in self.centroids])
            total = total_sum(dists)
            # If every row coincides with the chosen centroids the total is 0;
            # dividing would give NaN probabilities and np.random.choice would
            # raise. p=None makes the draw uniform instead.
            probabilities = dists / total if total > 0 else None
            new_centroid_index, = np.random.choice(range(len(X)), size=1, p=probabilities)
            self.centroids += [X[new_centroid_index]]

        iteration = 0
        prev_centroids = None
        # The explicit None test avoids comparing arrays against None on the
        # first pass.
        while iteration < self.max_iter and (
            prev_centroids is None
            or np.not_equal(self.centroids, prev_centroids).any()
        ):
            # Assign every row to its nearest centroid
            sorted_points = [[] for _ in range(self.n_clusters)]
            for x in X:
                dists = euclidean(x, self.centroids)
                sorted_points[argmin(dists)].append(x)
            prev_centroids = self.centroids

            # Move each centroid to the mean of the rows assigned to it
            new_centroids = []
            for i, cluster in enumerate(sorted_points):
                if len(cluster) == 0:
                    # A cluster that attracted no rows has no mean (the mean
                    # helper fails on empty input); keep its old centroid.
                    new_centroids.append(prev_centroids[i])
                    continue
                centroid = mean_axis_0(cluster)
                if np.isnan(centroid).any():
                    centroid = prev_centroids[i]
                new_centroids.append(centroid)
            self.centroids = new_centroids
            iteration += 1

    def evaluate(self, X):
        """Assign every row of ``X`` to its nearest fitted centroid.

        Returns:
            (centroids, centroid_indexes): for each row, the centroid vector
            it was assigned to and that centroid's index.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("MyKMeans.evaluate() called before fit()")

        centroids = []
        centroid_indexes = []
        for x in X:
            dists = euclidean(x, self.centroids)
            centroid_index = argmin(dists)
            centroids.append(self.centroids[centroid_index])
            centroid_indexes.append(centroid_index)
        return centroids, centroid_indexes

In [50]:
# Load the review texts and their sentiment labels
filename = 'data/reviews_mixed.csv'

file = pd.read_csv(filename, encoding="ISO-8859-1")
inputs = list(file["Text"])  # input data: the review texts
outputs = list(file["Sentiment"])  # output data: one sentiment label per review
# Distinct label names in a fixed order. A bare list(set(...)) of strings
# iterates in a hash-dependent order that changes between kernel restarts, which
# would change which cluster index maps to which label on every run.
# key=str keeps the sort working even if a label is not a string.
labelsNames = sorted(set(outputs), key=str)

In [51]:
def train_and_test(input_data, output_data, train_fraction=0.8):
    """Randomly split paired inputs/outputs into training and test parts.

    Parameters:
        input_data: indexable sequence of samples.
        output_data: sequence of labels, aligned with ``input_data``.
        train_fraction: share of the samples used for training; the training
            size is ``int(train_fraction * len(input_data))``.

    Returns:
        (train_inputs, train_outputs, test_inputs, test_outputs) as lists.
        Training samples come in the random draw order; test samples keep
        their original relative order.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    if len(input_data) != len(output_data):
        raise ValueError("input_data and output_data must have the same length")

    indexes = [i for i in range(len(input_data))]
    train_sample = np.random.choice(indexes, int(train_fraction * len(input_data)), replace=False)
    # Membership tests against a set are O(1); testing against the ndarray
    # directly scans it for every index, making the split quadratic.
    train_lookup = set(train_sample.tolist())
    test_sample = [i for i in indexes if i not in train_lookup]
    train_inputs = [input_data[i] for i in train_sample]
    train_outputs = [output_data[i] for i in train_sample]
    test_inputs = [input_data[i] for i in test_sample]
    test_outputs = [output_data[i] for i in test_sample]
    return train_inputs, train_outputs, test_inputs, test_outputs

# Seed both random sources used in this notebook (the split draws from
# np.random; MyKMeans draws from random and np.random) so that a fresh
# Restart & Run All reproduces the same split, clustering and scores.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Split the data into the training set and the test set
trainInputs, trainOutputs, testInputs, testOutputs = train_and_test(inputs, outputs)

In [52]:
# Extract features using the TF-IDF (Term Frequency - Inverse Document Frequency) representation
def extract_features_tf_idf(train_inputs, test_inputs, max_features):
    """Turn raw texts into dense TF-IDF feature matrices.

    The vectorizer is fitted on ``train_inputs`` only; ``test_inputs`` are
    transformed with that same fitted vectorizer. ``max_features`` is passed
    straight to ``TfidfVectorizer``.

    Returns:
        (train_matrix, test_matrix), each the ``toarray()`` form of the
        corresponding transformed data.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_matrix = vectorizer.fit_transform(train_inputs).toarray()
    test_matrix = vectorizer.transform(test_inputs).toarray()
    return train_matrix, test_matrix

# Extract the features with "extract_features_tf_idf"; 150 is passed as its max_features argument
trainFeatures, testFeatures = extract_features_tf_idf(trainInputs, testInputs, 150)

In [58]:
# Prediction using the notebook's own implementation of the K-means algorithm
def predict_by_me(train_features, test_features, label_names, classes):
    """Cluster with ``MyKMeans`` and name each test sample's cluster.

    A ``MyKMeans`` with ``classes`` clusters is fitted on ``train_features``
    and then evaluated on ``test_features``. Each returned cluster index is
    translated into ``label_names[index]``.

    Returns:
        (computed_outputs, my_centroids, computed_indexes): the label name,
        assigned centroid and cluster index of every test sample.
    """
    clusterer = MyKMeans(n_clusters=classes)
    clusterer.fit(train_features)
    my_centroids, computed_indexes = clusterer.evaluate(test_features)

    # Translate the cluster indexes into label names
    computed_outputs = []
    for cluster_index in computed_indexes:
        computed_outputs.append(label_names[cluster_index])

    return computed_outputs, my_centroids, computed_indexes

# Predict the test labels with "predict_by_me"; the number of clusters is the number of distinct label names
myComputedOutputs, centroids, computedIndexes = \
    predict_by_me(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))

In [59]:
# Build the label-swapped test outputs. predict_by_me names a cluster by
# indexing the label list with the cluster index, so the predicted names may be
# swapped relative to the real ones; both labelings are scored below.
inverseTestOutputs = ['positive' if elem != 'positive' else 'negative' for elem in testOutputs]

# Compute the accuracy scores for "predict_by_me" and report the better one
accuracyByMe = accuracy_score(testOutputs, myComputedOutputs)
accuracyByMeInverse = accuracy_score(inverseTestOutputs, myComputedOutputs)
print('Accuracy score by me:', max(accuracyByMe, accuracyByMeInverse))

# Show the first predicted labels from "predict_by_me" next to the real labels
print('Output computed by me:    ', myComputedOutputs[:7])
print('Real output:              ', testOutputs[:7])

Accuracy score by me: 0.6904761904761905
Output computed by me:     ['positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive']
Real output:               ['negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative']
