In [7]:
directory = 'dataset'
dataset_files = ['2017 Q1.arff',
                '2017 Q2.arff',
                '2017 Q3.arff',
                '2017 Q4.arff',
                '2017.arff',
                '2018 Q1.arff',
                '2018 Q2.arff',
                '2018 Q3.arff',
                '2018 Q4.arff',
                '2018.arff',
                '2019 Q1.arff',
                '2019 Q2.arff',
                '2019 Q3.arff',
                '2019 Q4.arff',
                '2019.arff',
                '2020 Q1.arff',
                '2020 Q2.arff',
                '2020 Q3.arff',
                '2020 Q4.arff',
                '2020.arff',
                '2021 Q1.arff'] 

def load_arff(file_path):
    attributes = []
    data = []

    with open(file_path, 'r') as file:
        data_started = False
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue

            if line.lower().startswith('@relation'):
                continue

            if line.lower().startswith('@attribute'):
                attr_name, attr_type, attr_values = parse_attribute(line)
                attributes.append((attr_name, attr_type, attr_values))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data.append(line.split(','))

    return attributes, data

def parse_attribute(line):
    parts = line.split()
    attr_name = parts[1].strip()

    if '{' in line:
        values = line[line.index('{') + 1:line.index('}')].split(',')
        attr_type = 'nominal'
        attribute_info = (attr_name, attr_type, values)
    else:
        attr_type = 'numeric'
        attribute_info = (attr_name, attr_type, 0)

    return attribute_info

def standardize_data(matrix):
    means = [mean(col) for col in zip(*matrix)]
    std_devs = [std_dev(col) for col in zip(*matrix)]
    return [[(col[i] - means[i]) / std_devs[i] for i in range(len(col))] for col in matrix]

def mean(vector):
    return sum(vector) / len(vector)

def std_dev(vector):
    mean_val = mean(vector)
    variance = sum((x - mean_val) ** 2 for x in vector) / len(vector)
    return math.sqrt(variance)

def calculate_covariance(matrix):
    n = len(matrix)
    num_features = len(matrix[0])
    cov_matrix = [[0] * num_features for _ in range(num_features)]

    for i in range(num_features):
        for j in range(num_features):
            mean_i = mean(matrix[i])
            mean_j = mean(matrix[j])
            cov_matrix[i][j] = covariance(matrix[i], matrix[j], mean_i, mean_j, n)

    return cov_matrix

def covariance(x, y, mean_x, mean_y, n):
    return sum((x[i] - mean_x) * (y[i] - mean_y) for i in range(n)) / (n - 1)

def eigenvalues_and_eigenvectors(matrix, num_simulations=1000):
    n = len(matrix)
    vec = [1] * n

    for _ in range(num_simulations):
        new_vec = multiply(matrix, vec)
        vec = normalize(new_vec)

    eigenvalue = dot_product(multiply(matrix, vec), vec)
    eigenvector = vec

    return eigenvalue, eigenvector

def normalize(vector):
    norm = math.sqrt(sum(x ** 2 for x in vector))
    return [x / norm for x in vector]

def multiply(matrix, vector):
    return [dot_product(row, vector) for row in matrix]

def dot_product(vector1, vector2):
    return sum(x * y for x, y in zip(vector1, vector2))

def transform(matrix, eigenvectors, k):
    return [
        [dot_product(row, eigenvectors[i]) for i in range(k)]
        for row in matrix
    ]

def pca(matrix, k):
    num_features = len(matrix[0])
    standardized_matrix = standardize_data(matrix)
    cov_matrix = calculate_covariance(standardized_matrix)
    eigenvalues, eigenvectors = eigenvalues_and_eigenvectors(cov_matrix)
    sorted_indices = sorted(range(num_features), key=lambda k: eigenvalues, reverse=True)
    eigenvectors = [[eigenvectors[i] for j in sorted_indices] for i in range(num_features)]
    transformed_matrix = transform(standardized_matrix, eigenvectors, k)
    return transformed_matrix

if __name__ == "__main__":
    data = []
    for file in file_paths:
        attributes, dt = load_arff(file)
        for row in dt:
            for i in range(len(attributes)):
                attr_name, attr_type, attr_values = attributes[i]
                if attr_type == 'nominal':
                    nominal_mapping = {value: index for index, value in enumerate(attr_values)}
                    row[i] = nominal_mapping.get(row[i])
                elif attr_type == 'numeric':
                    try:
                        row[i] = float(row[i])
                    except Exception as e:
                        row[i] = 0
        data.extend(dt)

    num_components = 2
    transformed_data = pca(data, num_components)

    print(f"\nTransformed Data Size: {len(transformed_data)}")
    print("\nTransformed Data (after PCA):")
    for sample in transformed_data:
        print(sample)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Rieze\\Desktop\\2017.arff'