In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


In [None]:
'''
Preprocessing steps:
1. Examine and handle missing values (e.g., fill in the missing value or add a corresponding indicator label).
2. Handle non-numeric values (e.g., one-hot encoding, Boolean indicators).
3. Further processing (e.g., standardizing features).
'''

# Field names applied to the data files, which are read with header=None.
# The order has to match the physical column order in those files.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

def handle_missing_values():
    """Placeholder that performs no work and returns None.

    NOTE(review): nothing visible in this file calls it; the imputation
    logic lives in missing_value_processing.
    """
    return None

def csv_convert(input_path, output_path):
    """
    Convert a raw, header-less, comma-separated data file into a CSV file
    that carries the names from the module-level ``columns`` list as header.

    Parameters
    ----------
    input_path : str
        Path of the raw file. It is parsed with ``header=None`` and spaces
        following a delimiter are skipped.
    output_path : str
        Path of the CSV file to write. Its parent directory is created when
        it does not exist yet.

    Returns
    -------
    None
        Success or failure is reported on stdout; any exception is caught
        and printed rather than propagated (best-effort behavior).
    """
    try:
        # Read the raw file and attach the column names.
        df = pd.read_csv(input_path, header=None, names=columns, skipinitialspace=True)

        # Discard lines that contain nothing but a first field. Such a line is
        # not a data record, and keeping it would let the later imputation
        # step turn it into a fabricated sample.
        # NOTE(review): the UCI adult.test file reportedly starts with a
        # "|1x3 Cross validator" banner line of exactly this shape - confirm
        # against the local copy.
        df = df.dropna(how='all', subset=columns[1:])

        # to_csv does not create missing directories, so make sure the
        # destination exists before writing.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Write the result with a header row and without the index column.
        df.to_csv(output_path, index=False)
        print(f"成功将 {input_path} 转换为 {output_path}")
    except Exception as e:
        print(f"转换失败: {e}")

def missing_value_processing(input_path, output_path):
    """
    Impute missing values in a CSV file and write the completed table.

    Cells equal to '?' are treated as missing. Numerical columns are coerced
    to numbers (unparseable cells become missing) and filled with the column
    mean; categorical columns are filled with the column mode.

    Parameters
    ----------
    input_path : str
        CSV file whose first line is a header row (it is skipped and the
        module-level ``columns`` names are applied instead).
    output_path : str
        Destination CSV path.

    Returns
    -------
    None
        Outcome is printed; exceptions are caught and printed.
    """
    numeric_fields = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    label_fields = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']

    try:
        # skiprows=1 steps over the header line already present in the file.
        frame = pd.read_csv(input_path, header=None, names=columns, skipinitialspace=True, skiprows=1)

        # Turn the '?' placeholder into a real missing-value marker.
        frame = frame.replace('?', pd.NA)

        # Numerical columns: coerce to numbers, then fill gaps with the mean.
        for name in numeric_fields:
            as_number = pd.to_numeric(frame[name], errors='coerce')
            frame[name] = as_number.fillna(as_number.mean())

        # Categorical columns: fill gaps with the most frequent value.
        for name in label_fields:
            most_common = frame[name].mode()[0]
            frame[name] = frame[name].fillna(most_common)

        # Persist the completed table.
        frame.to_csv(output_path, index=False)
        print(f"成功处理 {input_path} 的缺失值并保存为 {output_path}")

    except Exception as e:
        print(f"处理{input_path}缺失值失败: {e}")

def data_type_conversion(input_path, output_path):
    """
    Encode categorical columns numerically and standardize numerical columns.

    * A categorical column with exactly two distinct values is mapped to 0/1.
      The two values are sorted first, so the smaller one always becomes 0;
      this keeps the encoding identical across files regardless of which
      value happens to appear first in each file.
    * Any other categorical column is one-hot encoded (first level dropped,
      0/1 integers); the indicator columns are appended at the end and the
      source column is removed.
    * Numerical columns are z-score standardized using the statistics of the
      file being processed. A column without spread (standard deviation of 0
      or undefined) is only centered, avoiding a division by zero.

    NOTE(review): one-hot levels and standardization statistics are derived
    per file, so two files with different category sets will end up with
    different columns - confirm this is acceptable for downstream use.

    Parameters
    ----------
    input_path : str
        CSV file whose first line is a header row (it is skipped and the
        module-level ``columns`` names are applied instead).
    output_path : str
        Destination CSV path.

    Returns
    -------
    None
        Outcome is printed; exceptions are caught and printed.
    """
    try:
        # Read the data; skiprows=1 steps over the existing header line.
        df = pd.read_csv(input_path, header=None, names=columns, skipinitialspace=True, skiprows=1)

        # Encode the categorical columns.
        categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
        for col in categorical_columns:
            if df[col].nunique() == 2:
                # Binary column: deterministic 0/1 mapping based on sorted
                # values. Missing values are excluded from the mapping and
                # stay missing.
                low, high = sorted(df[col].dropna().unique())
                df[col] = df[col].map({low: 0, high: 1})
            else:
                # Multi-valued column: one-hot encode with 0/1 integers
                # instead of True/False.
                dummies = pd.get_dummies(df[col], prefix=col, drop_first=True).astype(int)
                df = pd.concat([df.drop(columns=col), dummies], axis=1)

        # Standardize the numerical columns.
        numerical_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
        for col in numerical_columns:
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # `spread > 0` is False for both 0 and NaN, so a constant or
            # single-row column is left centered instead of becoming NaN/inf.
            df[col] = centered / spread if spread > 0 else centered

        # Persist the processed table.
        df.to_csv(output_path, index=False)
        print(f"成功将 {input_path} 的数据类型转换并保存为 {output_path}")
    except Exception as e:
        print(f"数据类型转换失败: {e}")    

In [None]:
if __name__ == "__main__":
    # Locations of the raw dataset files.
    train_data_path = './raw/adult.data'
    test_data_path = './raw/adult.test'

    # Continue only when both raw files are present.
    if os.path.exists(train_data_path) and os.path.exists(test_data_path):
        print("数据集路径已存在，开始处理数据集...")
    else:
        print("非法数据集路径，请检查路径合法性。")
        # `exit()` is injected by the `site` module and is not guaranteed to
        # exist in every runtime; raising SystemExit is the portable form and
        # signals the failure through a non-zero status.
        raise SystemExit(1)

    # Every output below is written to ./data; pandas does not create missing
    # directories, so make sure it exists up front.
    os.makedirs('./data', exist_ok=True)

    # Convert the raw files into CSV files with a header row.
    train_csv_path = './data/adult_train.csv'
    test_csv_path = './data/adult_test.csv'
    csv_convert(train_data_path, train_csv_path)
    csv_convert(test_data_path, test_csv_path)

    # 1. Impute missing values in both splits.
    # Missing values are written as "?"; numerical columns are filled with
    # the mean and categorical columns with the mode.
    filled_train_csv_path = './data/adult_train_filled.csv'
    filled_test_csv_path = './data/adult_test_filled.csv'
    missing_value_processing(train_csv_path, filled_train_csv_path)
    missing_value_processing(test_csv_path, filled_test_csv_path)

    # 2. Encode and scale both splits.
    # 2.1 Two-valued categorical columns become 0/1; multi-valued ones are
    #     one-hot encoded.
    # 2.2 Numerical columns are standardized.
    processed_train_csv_path = './data/adult_train_processed.csv'
    processed_test_csv_path = './data/adult_test_processed.csv'
    data_type_conversion(filled_train_csv_path, processed_train_csv_path)
    data_type_conversion(filled_test_csv_path, processed_test_csv_path)

In [None]:
'''
Data visualization:
1. Visualize high-dimensional data in a 2D and/or 3D space using t-SNE.
2. Create a scatter plot of the resulting embedding, coloring points by class labels if applicable.
3. Analyze the visualization to identify patterns or clusters.
'''



def visualize_tsne_2d(data, labels=None, title='t-SNE Visualization (2D)', 
                     save_path=None, colormap='tab20', balance_classes=False,
                     point_size=30, alpha=0.5, legend_loc='best'):
    """
    Embed ``data`` into two dimensions with t-SNE and draw a scatter plot.

    Parameters
    ----------
    data : array-like
        High-dimensional samples, one row per sample.
    labels : array-like, optional
        One class label per sample. When given, each class is drawn as its
        own layer (largest class first, so smaller classes end up on top)
        and a legend is added.
    title : str, optional
        Plot title.
    save_path : str, optional
        If given, the figure is saved to this path before being shown.
    colormap : str, optional
        Name of the matplotlib colormap the class colors are sampled from.
    balance_classes : bool, optional
        If True and labels are given, every class is randomly down-sampled
        to the size of the smallest class before embedding.
    point_size : int or array-like, optional
        Marker size passed to ``scatter``.
    alpha : float, optional
        Marker transparency.
    legend_loc : str, optional
        Legend location.

    Returns
    -------
    numpy.ndarray
        The 2D embedding. When ``balance_classes`` is True it describes the
        down-sampled subset, so its rows no longer line up with the
        ``labels`` the caller passed in.
    """
    if labels is not None:
        # Boolean masking below needs an ndarray; a plain list compared with
        # `==` would not yield a usable element-wise mask.
        labels = np.asarray(labels)
        if balance_classes:
            data, labels = _balance_classes(np.asarray(data), labels)

    # Reduce to two dimensions.
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1200)
    embedded_data = tsne.fit_transform(data)
    
    # Create the canvas.
    plt.figure(figsize=(10, 8))
    
    if labels is not None:
        # Order classes by descending sample count (majority class first).
        unique_labels, counts = np.unique(labels, return_counts=True)
        sorted_labels = unique_labels[np.argsort(-counts)]
        
        # Sample one color per class from the requested colormap (previously
        # the argument was ignored in favor of a hard-coded palette).
        cmap = plt.get_cmap(colormap)
        colors = cmap(np.linspace(0, 1, len(unique_labels)))

        # Draw the classes layer by layer.
        for idx, label in enumerate(sorted_labels):
            mask = labels == label
            plt.scatter(embedded_data[mask, 0], embedded_data[mask, 1],
                        c=[colors[idx]], label=str(label),
                        s=point_size, alpha=alpha, edgecolors='w', linewidth=0.3)
            
        # Add the legend.
        plt.legend(title='Class Labels', loc=legend_loc,
                  frameon=True, framealpha=0.8)
    else:
        plt.scatter(embedded_data[:, 0], embedded_data[:, 1],
                   s=point_size, alpha=alpha, edgecolors='w', linewidth=0.3)
    
    # Title, axis labels and grid.
    plt.title(title, pad=20)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.grid(alpha=0.3)
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', dpi=300, facecolor='white')
    
    plt.show()
    return embedded_data

def _balance_classes(data, labels):
    """
    Randomly down-sample every class to the size of the smallest class.

    Parameters
    ----------
    data : array-like
        Samples, one row per sample. Converted with ``np.asarray`` so that
        lists and DataFrames can be indexed by position.
    labels : array-like
        One label per sample. Converted with ``np.asarray`` so that the
        element-wise comparison below works for lists as well.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_subset, labels_subset)``, grouped by class in the order
        returned by ``np.unique``. Classes that already have the minimum
        size are kept unchanged and in their original order.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` differ in length.

    Notes
    -----
    Sampling uses the global NumPy random state (``np.random.choice``), so
    results vary between calls unless the caller seeds it.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length, got {len(data)} and {len(labels)}"
        )
    if labels.size == 0:
        # Nothing to balance; np.min on the empty count array would raise.
        return data, labels

    unique_labels, counts = np.unique(labels, return_counts=True)
    min_count = counts.min()

    sampled_indices = []
    for label in unique_labels:
        indices = np.where(labels == label)[0]
        if len(indices) > min_count:
            # Sample without replacement so no row is duplicated.
            indices = np.random.choice(indices, min_count, replace=False)
        sampled_indices.append(indices)

    sampled_indices = np.concatenate(sampled_indices)
    return data[sampled_indices], labels[sampled_indices]


def visualize_tsne_3d(data, labels=None, title='t-SNE Visualization (3D)', save_path=None, colormap='viridis', balance_classes=True):
    """
    Visualize high-dimensional data in 3D using t-SNE.
    
    Parameters:
    -----------
    data : array-like
        High-dimensional data to visualize, one row per sample.
    labels : array-like, optional
        Class labels for each data point. They are passed to ``scatter`` as
        color values, so they are expected to be numeric.
    title : str, optional
        Title for the plot.
    save_path : str, optional
        Path to save the visualization.
    colormap : str, optional
        Colormap to use for the scatter plot.
    balance_classes : bool, optional
        If True (the default) and labels are given, every class is randomly
        down-sampled to the size of the smallest class before embedding.

    Returns:
    --------
    numpy.ndarray
        The 3D embedding. With ``balance_classes=True`` it describes the
        down-sampled subset only, so its rows do NOT line up with the
        ``labels`` the caller passed in; callers that need matching labels
        should balance beforehand and pass ``balance_classes=False``.
    """
    if labels is not None:
        # Work on ndarrays so positional indexing during balancing is valid.
        labels = np.asarray(labels)
        if balance_classes:
            data, labels = _balance_classes(np.asarray(data), labels)
    
    # Apply t-SNE for dimensionality reduction to 3D
    tsne = TSNE(n_components=3, random_state=42, perplexity=30, max_iter=1200)
    embedded_data = tsne.fit_transform(data)
    
    # Create a 3D scatter plot
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    
    if labels is not None:
        # Color points by label using the requested colormap (previously the
        # argument was ignored in favor of a hard-coded palette).
        scatter = ax.scatter(embedded_data[:, 0], embedded_data[:, 1], embedded_data[:, 2], 
                  c=labels, cmap=colormap, alpha=0.5, s=10)
        plt.colorbar(scatter, label='Class Labels')
    else:
        # If no labels, use a single color
        ax.scatter(embedded_data[:, 0], embedded_data[:, 1], embedded_data[:, 2], 
                   alpha=0.8, s=50)
    
    ax.set_title(title)
    ax.set_xlabel('t-SNE Feature 1')
    ax.set_ylabel('t-SNE Feature 2')
    ax.set_zlabel('t-SNE Feature 3')
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', dpi=300)
    
    plt.show()
    
    return embedded_data

def analyze_clusters(embedded_data, labels=None):
    """
    Analyze the t-SNE embedding to identify patterns or clusters.

    Prints the number of classes and, for each class, its point count,
    centroid and per-axis standard deviation in the embedding. When the
    embedding has at least two components, a seaborn pairplot of the first
    (up to three) components, colored by label, is shown as well.
    
    Parameters:
    -----------
    embedded_data : array-like
        Low-dimensional embedding from t-SNE, one row per sample.
    labels : array-like, optional
        Class labels for each data point. When omitted, a notice is printed
        and nothing else happens.

    Raises:
    -------
    ValueError
        If ``embedded_data`` and ``labels`` differ in length (for example
        when the embedding was computed on a class-balanced subset but the
        full label vector is passed).
    """
    if labels is None:
        print("No labels provided for cluster analysis.")
        return

    # Boolean masking requires ndarrays; lists would not index correctly.
    embedded_data = np.asarray(embedded_data)
    labels = np.asarray(labels)
    if len(embedded_data) != len(labels):
        raise ValueError(
            "embedded_data and labels must have the same length, "
            f"got {len(embedded_data)} and {len(labels)}"
        )
    
    # Calculate cluster statistics
    unique_labels = np.unique(labels)
    print(f"Number of clusters/classes: {len(unique_labels)}")
    
    # Compute basic statistics for each cluster
    for label in unique_labels:
        cluster_points = embedded_data[labels == label]
        center = np.mean(cluster_points, axis=0)
        std_dev = np.std(cluster_points, axis=0)
        count = len(cluster_points)
        
        print(f"\nCluster/Class {label}:")
        print(f"  Number of points: {count}")
        print(f"  Center: {center}")
        print(f"  Standard deviation: {std_dev}")
        
    # Pairwise view of the components (applies to 2D and 3D embeddings).
    if embedded_data.shape[1] >= 2:
        n_components = min(3, embedded_data.shape[1])
        df = pd.DataFrame(
            embedded_data[:, :n_components],
            columns=[f'Component {i+1}' for i in range(n_components)],
        )
        df['Label'] = labels
        sns.pairplot(df, hue='Label')
        plt.suptitle('Pairwise Relationships Between t-SNE Components', y=1.02)
        plt.show()


In [None]:
if __name__ == "__main__":
    # Load data from the specified CSV files
    train_data = pd.read_csv('./data/adult_train_processed.csv')
    test_data = pd.read_csv('./data/adult_test_processed.csv')
    
    print(f"Loaded training data shape: {train_data.shape}")
    print(f"Loaded test data shape: {test_data.shape}")
    
    # Select the label by name rather than by position. The previous
    # positional slices (`:-7` for train, `:7` for test) disagreed with each
    # other, and the train slice still contained the label column (index 7),
    # leaking the target into the features.
    label_column = 'income'
    X_train = train_data.drop(columns=[label_column]).values
    y_train = train_data[label_column].values
    
    X_test = test_data.drop(columns=[label_column]).values
    y_test = test_data[label_column].values
    
    # savefig does not create missing directories.
    os.makedirs('./figures', exist_ok=True)
    
    print("\n--- Training Data Visualization ---")
    print("Visualizing training data in 2D...")
    embedded_train_2d = visualize_tsne_2d(
        X_train, 
        y_train, 
        title='t-SNE Visualization of Adult Income Dataset - Training Data (2D)',
        save_path='./figures/tsne_train_2d.png',
        colormap='tab10'  # Using a discrete colormap better for categorical data
    )
    
    
    print("\nVisualizing training data in 3D...")
    # visualize_tsne_3d balances classes by default and returns only the
    # embedding, which would then no longer match y_train. Balance here
    # instead so the labels that belong to the embedded rows are kept.
    X_train_3d, y_train_3d = _balance_classes(X_train, y_train)
    embedded_train_3d = visualize_tsne_3d(
        X_train_3d, 
        y_train_3d, 
        title='t-SNE Visualization of Adult Income Dataset - Training Data (3D)',
        save_path='./figures/tsne_train_3d.png',
        colormap='tab10',  # Using a discrete colormap better for categorical data
        balance_classes=False
    )
    print("\n--- Analyzing Clusters in Training Data ---")
    analyze_clusters(embedded_train_2d, y_train)
    analyze_clusters(embedded_train_3d, y_train_3d)