In [7]:
import os
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler

def parse_arff(file_path):
    """Extract attribute names and numeric data rows from an ARFF file.

    The file is read in a single pass. Before the ``@data`` marker only
    ``@attribute`` declarations are inspected; after it every non-blank,
    non-comment line is treated as a comma-separated data row.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A ``(data, attributes)`` tuple. ``data`` is a list of rows, each a
        list of floats; ``attributes`` is the list of attribute names (the
        second whitespace-separated token of each ``@attribute`` line).
        If the file has no ``@data`` marker, ``data`` is empty.

    Note:
        Values that cannot be converted to ``float`` (nominal labels, ``?``
        missing markers, ...) are dropped from their row, so rows may end
        up with different lengths and later values shift left.
    """
    attributes = []
    data = []
    in_data_section = False
    with open(file_path, 'r') as file:
        for raw_line in file.read().splitlines():
            line = raw_line.strip()
            # Blank lines and '%' comments are meaningless in both sections.
            if not line or line.startswith('%'):
                continue

            if not in_data_section:
                lowered = line.lower()
                if lowered.startswith('@attribute'):
                    tokens = line.split()
                    # A bare '@attribute' with no name is malformed; skip it.
                    if len(tokens) > 1:
                        attributes.append(tokens[1])
                elif lowered.startswith('@data'):
                    in_data_section = True
                # Header lines must never be parsed as data rows: a nominal
                # declaration such as '{1,2,3}' would otherwise yield a
                # spurious numeric row.
                continue

            numeric_values = []
            for value in line.split(','):
                try:
                    numeric_values.append(float(value.strip()))
                except ValueError:
                    # Best-effort: non-numeric fields are skipped on purpose.
                    continue
            if numeric_values:
                data.append(numeric_values)
    return data, attributes

def list_files_in_folder(folder_path):
    """Return the fixed, ordered list of ARFF file names to load.

    The list covers the four quarters plus the yearly file for each of
    2017-2020, followed by the first quarter of 2021.

    Args:
        folder_path: Accepted for interface compatibility; the folder is
            not inspected and the returned names do not depend on it.

    Returns:
        A list of file names (not full paths).
    """
    names = []
    for year in range(2017, 2021):
        # Quarterly files first, then the whole-year file.
        names.extend(f'{year} Q{quarter}.arff' for quarter in range(1, 5))
        names.append(f'{year}.arff')
    names.append('2021 Q1.arff')
    return names

def load_arff_folder(folder_path):
    """Parse every listed ARFF file in a folder into padded numeric tables.

    For each name returned by ``list_files_in_folder`` the file is parsed
    with ``parse_arff`` and its rows are right-padded with ``0.0`` so that
    all rows of that file share the length of its longest row.

    Args:
        folder_path: Directory that contains the ARFF files.

    Returns:
        A ``(all_data, all_attributes)`` tuple of parallel lists with one
        entry per file: the padded rows and the attribute names. A file
        with no numeric rows contributes an empty list of rows.
    """
    all_data = []
    all_attributes = []
    for file_name in list_files_in_folder(folder_path):
        file_path = os.path.join(folder_path, file_name)
        data, attributes = parse_arff(file_path)

        # default=0 keeps a file without any numeric rows from raising
        # "max() arg is an empty sequence".
        max_length = max((len(row) for row in data), default=0)
        padded_data = [row + [0.0] * (max_length - len(row)) for row in data]

        all_data.append(padded_data)
        all_attributes.append(attributes)
    return all_data, all_attributes


# Load every listed ARFF file from the local 'dataset' directory.
folder_path = 'dataset'
all_data, all_attributes = load_arff_folder(folder_path)

# Only the first loaded file is analysed below; change the index to pick
# another file from all_data.
data = all_data[0]  # Adjust this according to your data structure

# Standardize the features, then project onto 2 principal components.
num_pca_components = 2
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

pca = PCA(n_components=num_pca_components)
pca_result = pca.fit_transform(data_scaled)

# Apply Truncated SVD to the same standardized data, with the same number
# of components.
# NOTE(review): the input is already standardized, so this presumably spans
# nearly the same subspace as the PCA above — confirm that comparing the two
# is the intent.
svd = TruncatedSVD(n_components=num_pca_components)
svd_result = svd.fit_transform(data_scaled)

# Remove the summarization threshold so arrays are printed in full.
np.set_printoptions(threshold=np.inf)

# Print both projections in full.

print("\nPCA Result:")
print(pca_result)
print("\nSVD Result:")
print(svd_result)




PCA Result:
[[-8.68611323e-01 -7.25802126e-02]
 [-8.25946584e-01 -4.35165862e-02]
 [-8.73892580e-01 -7.78088278e-02]
 [-8.62865145e-01 -7.32098159e-02]
 [-6.63506777e-01  2.30355972e-01]
 [-8.68723267e-01 -7.25565930e-02]
 [-8.66996643e-01 -7.24535177e-02]
 [-7.13182702e-01 -2.43063725e-01]
 [-8.74576960e-01 -4.40151895e-02]
 [-8.68619980e-01 -7.25718013e-02]
 [-8.68495788e-01 -7.25667489e-02]
 [-8.69037382e-01 -7.25351080e-02]
 [-8.68419768e-01 -7.26498276e-02]
 [-8.68575565e-01 -7.25570150e-02]
 [-8.60177292e-01 -7.21584944e-02]
 [-8.68620950e-01 -7.25595823e-02]
 [-8.78604230e-01 -8.48880724e-02]
 [-8.68600450e-01 -7.25781215e-02]
 [-8.68550550e-01 -7.25577798e-02]
 [-8.39934515e-01 -1.02053143e-01]
 [-8.68458267e-01 -7.25261618e-02]
 [-3.85585776e-01 -1.52835189e-01]
 [-8.65661993e-01 -7.35716365e-02]
 [-8.60199813e-01 -7.20233710e-02]
 [-1.18877577e-01 -9.50158140e-02]
 [-8.68553524e-01 -7.25841837e-02]
 [-8.67193669e-01 -7.26466020e-02]
 [-8.67527534e-01 -7.24212024e-02]
 [-8.68