## Import libraries

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Load data

In [7]:
# Load the training CSV and preview its shape and first rows.
# NOTE(review): the path is absolute and machine-specific; it must be edited to run elsewhere.
data = pd.read_csv('/Users/A117870943/Documents/DT/Innovation/archive/AIDS_Classification_50000.csv')
print(data.shape)
data.head()

(50000, 23)


Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,preanti,race,gender,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,1073,1,37,79.46339,0,1,0,100,0,1,18,0,1,1,2,0,1,0,322,469,882,754,1
1,324,0,33,73.02314,0,1,0,90,0,1,224,0,1,1,3,1,1,1,168,575,1035,1525,1
2,495,1,43,69.47793,0,1,0,100,0,1,0,0,0,1,1,0,0,0,377,333,1147,1088,1
3,1201,3,42,89.15934,0,1,0,100,1,1,513,0,1,1,3,0,0,0,238,324,775,1019,1
4,934,0,37,137.46581,0,1,0,100,0,0,4,0,1,0,3,0,0,1,500,443,1601,849,0


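Rename the variables: the dropped column (`time`) is removed, the target (`infected`) becomes `label`, and the remaining columns are renamed positionally to `f_0`, `f_1`, ….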

### Prepare Training data

In [8]:
# Drop `time`, rename `infected` to `label` and rename the remaining columns to f_<position>.
# transform_input_data is defined in the "Functions" section at the end of the notebook,
# so that cell has to be executed before this one.
df = transform_input_data(data,['time'],'infected')
print(df.shape)
# Share of each class in the target column.
print('Distriburion:',df.label.value_counts()/df.shape[0])
df.head()

(50000, 22)
Distriburion: label
0    0.68988
1    0.31012
Name: count, dtype: float64


Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,label
0,1,37,79.46339,0,1,0,100,0,1,18,0,1,1,2,0,1,0,322,469,882,754,1
1,0,33,73.02314,0,1,0,90,0,1,224,0,1,1,3,1,1,1,168,575,1035,1525,1
2,1,43,69.47793,0,1,0,100,0,1,0,0,0,1,1,0,0,0,377,333,1147,1088,1
3,3,42,89.15934,0,1,0,100,1,1,513,0,1,1,3,0,0,0,238,324,775,1019,1
4,0,37,137.46581,0,1,0,100,0,0,4,0,1,0,3,0,0,1,500,443,1601,849,0


> To simulate *silent sufferers*, we add label noise to the `Minority` class: a fraction of its samples (label `1`) is relabelled as `0`, so that these samples act as silent sufferers.

In [10]:
# Persist the renamed training frame (no label flipping, no scaling) without the index column.
df.to_csv('/Users/A117870943/Documents/DT/Innovation/data/train/raw/train.csv',index=False)

In [64]:
# # multiple training datasets with different masking percentages
# noise_percentages = [0.1, 0.2, 0.3, 0.4, 0.5]
# create_noisy_datasets(df,noise_percentages)

In [22]:
flip_percentages = [0,0.1, 0.2, 0.3, 0.4, 0.5]
flipped_datasets = flip_labels_maintain_imbalance(df, 'label', flip_percentages)

# Write one CSV per flip fraction; the file name carries the fraction as a whole percentage.
for percentage, flipped in zip(flip_percentages, flipped_datasets):
    flipped.to_csv(f"/Users/A117870943/Documents/DT/Innovation/data/train/raw/train_flipped_labels_{int(percentage*100)}.csv", index=False)

# Print the original and the flipped label distributions (counts, then proportions)
# for every flip fraction.
for percentage, flipped in zip(flip_percentages, flipped_datasets):
    original_counts = df['label'].value_counts()
    flipped_counts = flipped['label_flip'].value_counts()
    print(f"Flip Percentage: {percentage*100}%")
    print('Original Label Distribution:')
    print(original_counts)
    print(original_counts/df.shape[0])
    print('Flipped Label Distribution:')
    print(flipped_counts)
    print(flipped_counts/flipped.shape[0])
    print()


Flip Percentage: 0%
Original Label Distribution:
label
0    34494
1    15506
Name: count, dtype: int64
label
0    0.68988
1    0.31012
Name: count, dtype: float64
Flipped Label Distribution:
label_flip
0    34494
1    15506
Name: count, dtype: int64
label_flip
0    0.68988
1    0.31012
Name: count, dtype: float64

Flip Percentage: 10.0%
Original Label Distribution:
label
0    34494
1    15506
Name: count, dtype: int64
label
0    0.68988
1    0.31012
Name: count, dtype: float64
Flipped Label Distribution:
label_flip
0    31045
1    13956
Name: count, dtype: int64
label_flip
0    0.689874
1    0.310126
Name: count, dtype: float64

Flip Percentage: 20.0%
Original Label Distribution:
label
0    34494
1    15506
Name: count, dtype: int64
label
0    0.68988
1    0.31012
Name: count, dtype: float64
Flipped Label Distribution:
label_flip
0    27595
1    12405
Name: count, dtype: int64
label_flip
0    0.689875
1    0.310125
Name: count, dtype: float64

Flip Percentage: 30.0%
Original Label Dist

### Scaled data

In [18]:
# Directory prefixes (with trailing slash) that the scaling cell concatenates with file names:
# `raw_path` is where the flipped CSVs are read from, `scaled_path` where the scaled ones are written.
raw_path = "/Users/A117870943/Documents/DT/Innovation/data/train/raw/"
scaled_path = "/Users/A117870943/Documents/DT/Innovation/data/train/scaled/"

In [27]:
# The raw inputs and the scaled outputs share the same file names (one per flip percentage),
# so both path lists are derived from a single list instead of being typed out twice.
flipped_file_names = [f"train_flipped_labels_{pct}.csv" for pct in (0, 10, 20, 30, 40, 50)]

file_paths = [raw_path + name for name in flipped_file_names]
output_files = [scaled_path + name for name in flipped_file_names]

# Process each dataset and save the preprocessed version.
# The frame read here is deliberately not named `df`, so the training frame built
# earlier in the notebook is not overwritten by the last file that was read.
for file_path, output_file in zip(file_paths, output_files):
    flipped_raw = pd.read_csv(file_path)
    preprocess_and_save(flipped_raw, 'label','label_flip', output_file)

print("Datasets have been preprocessed and saved.")

Datasets have been preprocessed and saved.


### Prepare Testing data

In [14]:
# Load the held-out test CSV, then show its shape, the share of each `infected` class and the first rows.
# NOTE(review): the path is absolute and machine-specific; it must be edited to run elsewhere.
test = pd.read_csv('/Users/A117870943/Documents/DT/Innovation/archive/AIDS_Classification_5000.csv')
print(test.shape)
print('Distribution:', test.infected.value_counts()/test.shape[0])
test.head()

(5000, 23)
Distribution: infected
0    0.6842
1    0.3158
Name: count, dtype: float64


Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,preanti,race,gender,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,988,2,42,71.76131,0,0,0,100,0,1,911,0,1,1,3,0,0,0,290,408,1870,1671,0
1,413,3,30,72.67629,0,1,0,100,0,1,414,0,1,0,1,0,0,0,506,446,1372,523,0
2,1119,0,37,73.75514,0,1,0,90,0,1,810,1,1,1,3,0,0,0,499,542,609,1474,0
3,680,3,34,70.03769,0,0,0,100,0,0,0,0,1,0,1,0,1,0,313,478,780,987,0
4,1043,3,35,63.02845,0,0,0,100,0,1,3,0,1,0,1,0,1,0,337,761,698,1735,0


In [15]:
# Apply the same renaming as for the training data: drop `time`, rename `infected` to `label`
# and rename the remaining columns to f_<position>.
test = transform_input_data(test,['time'],'infected')
test.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,label
0,2,42,71.76131,0,0,0,100,0,1,911,0,1,1,3,0,0,0,290,408,1870,1671,0
1,3,30,72.67629,0,1,0,100,0,1,414,0,1,0,1,0,0,0,506,446,1372,523,0
2,0,37,73.75514,0,1,0,90,0,1,810,1,1,1,3,0,0,0,499,542,609,1474,0
3,3,34,70.03769,0,0,0,100,0,0,0,0,1,0,1,0,1,0,313,478,780,987,0
4,3,35,63.02845,0,0,0,100,0,1,3,0,1,0,1,0,1,0,337,761,698,1735,0


In [16]:
# Persist the renamed (unscaled) test frame without the index column.
test.to_csv('/Users/A117870943/Documents/DT/Innovation/data/test/raw/test.csv',index=False)

### Scaling

In [32]:
# Destination of the scaled test set.
# NOTE: this reuses the name `output_file`, which the training scaling loop also uses as its loop variable.
output_file = "/Users/A117870943/Documents/DT/Innovation/data/test/scaled/test_scaled.csv"

In [34]:
# Scale the test features and write them, together with `label`, to `output_file`.
# NOTE(review): called like this, the scaler is fitted on the test frame itself rather than
# reusing statistics from the training data — confirm this is intended.
preprocess_and_save_test(test, 'label', output_file)

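# Functions

> **Note:** the helper functions below are called by the cells above, so the cells in this section must be executed before them.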

In [2]:
def transform_input_data(df,cols_to_drop,label_to_rename):
    """Return a renamed copy of ``df`` with generic feature names and a ``label`` target.

    The input frame is left untouched, so the calling cell can be re-run safely
    (the previous in-place version failed on a second run because the dropped
    columns were already gone).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data.
    cols_to_drop : label or list of labels
        Columns removed before renaming.
    label_to_rename : label
        Name of the target column; it is renamed to ``'label'``.

    Returns
    -------
    pandas.DataFrame
        New frame whose feature columns are named ``f_0``, ``f_1``, ... in their
        original order, followed by ``label`` as the last column.

    Raises
    ------
    KeyError
        If a column in ``cols_to_drop`` does not exist, or if ``label_to_rename``
        is not a column after dropping.
    """
    trimmed = df.drop(columns=cols_to_drop)

    if label_to_rename not in trimmed.columns:
        raise KeyError(
            f"label column {label_to_rename!r} not found after dropping {cols_to_drop!r}"
        )

    # Every column except the target is a feature, wherever the target sits.
    feature_cols = [col for col in trimmed.columns if col != label_to_rename]

    # Build the full mapping first and rename once; renaming column-by-column
    # could collide with an existing column that is already called 'f_<n>'.
    new_names = {col: 'f_' + str(idx) for idx, col in enumerate(feature_cols)}
    new_names[label_to_rename] = 'label'
    renamed = trimmed.rename(columns=new_names)

    # Features first, target last (dicts preserve insertion order).
    return renamed[list(new_names.values())]

In [3]:
def flip_labels_maintain_imbalance(df, label_col, flip_percentages, random_state=None):
    """Flip a share of positive labels to 0 while keeping the original 0:1 ratio.

    For every entry of ``flip_percentages`` a copy of ``df`` is made, a column
    ``label_flip`` is added (initially equal to ``label_col``), and
    ``int(percentage * n_positive)`` randomly chosen rows with label 1 get
    ``label_flip = 0``.  Rows whose original label is 0 are then dropped at
    random until the number of ``label_flip == 0`` rows equals
    ``floor(n_negative * remaining_positive / n_positive)``.  Flipped rows are
    never dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must have a unique index because rows are addressed by label.
    label_col : str
        Name of the binary (0/1) label column.
    flip_percentages : iterable of float
        Fractions in ``[0, 1]`` of the positive rows to flip.
    random_state : int or None, optional
        Seed for reproducible sampling.  With ``None`` (the default) the global
        ``numpy.random`` state is used, as before.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``flip_percentages``, in the same order.

    Raises
    ------
    ValueError
        If the index is not unique, there are no positive rows, a percentage is
        outside ``[0, 1]``, or the ratio cannot be kept without dropping
        flipped rows.
    """
    if not df.index.is_unique:
        raise ValueError("df must have a unique index; call reset_index(drop=True) first")

    positive_index = df.index[df[label_col] == 1]
    negative_index = df.index[df[label_col] == 0]
    n_positive = len(positive_index)
    n_negative = len(negative_index)
    if n_positive == 0:
        raise ValueError(f"column {label_col!r} contains no rows with label 1")

    # np.random (module) and RandomState both expose .choice, so a seed is optional.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    results = []

    for percentage in flip_percentages:
        if not 0 <= percentage <= 1:
            raise ValueError(f"flip percentage must be within [0, 1], got {percentage!r}")

        df_result = df.copy()
        df_result['label_flip'] = df_result[label_col]
        num_to_flip = int(percentage * n_positive)

        # Randomly select positive rows and flip them.
        flip_indices = rng.choice(positive_index, size=num_to_flip, replace=False)
        df_result.loc[flip_indices, 'label_flip'] = 0

        # Target number of zeros for the remaining positives.  Integer arithmetic
        # gives the exact floor; a float ratio can truncate one row too low.
        remaining_positive = n_positive - num_to_flip
        target_negative = (n_negative * remaining_positive) // n_positive
        num_to_remove = (n_negative + num_to_flip) - target_negative

        if num_to_remove > n_negative:
            raise ValueError(
                f"cannot keep the original imbalance for flip percentage {percentage!r}: "
                f"{num_to_remove} rows would have to be removed but only {n_negative} "
                "originally-negative rows are available"
            )

        if num_to_remove > 0:
            # Only originally-negative rows are candidates for removal.
            remove_indices = rng.choice(negative_index, size=num_to_remove, replace=False)
            df_result = df_result.drop(remove_indices)

        results.append(df_result)

    return results

In [26]:
def preprocess_and_save(df, orig_label,flipped_label, output_file, scaler=None):
    """Impute, standardise and save the features of a training frame with both label columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus ``orig_label`` and ``flipped_label``.
    orig_label : str
        Name of the original label column.
    flipped_label : str
        Name of the flipped (noisy) label column.
    output_file : str or path-like
        CSV destination; written without the index.
    scaler : object with ``transform``, optional
        An already fitted scaler.  When given, it is applied with ``transform``
        only; when omitted, a new ``StandardScaler`` is fitted on this frame's
        features (the previous behaviour).

    Returns
    -------
    The scaler that was used, so other data can be transformed with the same
    statistics.  Previously the fitted scaler was discarded.
    """
    # Separate features and labels
    X = df.drop(columns=[orig_label, flipped_label])
    y_orig = df[orig_label]
    y_flip = df[flipped_label]

    # Fill missing feature values with the column mean of this frame
    X = X.fillna(X.mean())

    # Scale the features
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    # Rebuild a frame with the original feature names; labels are attached by
    # position (.values) because the scaled frame has a fresh 0..n-1 index.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    X_scaled_df[orig_label] = y_orig.values
    X_scaled_df[flipped_label] = y_flip.values

    # Save the preprocessed dataset
    X_scaled_df.to_csv(output_file, index=False)

    return scaler

In [28]:
def preprocess_and_save_test(df, label_col, output_file, scaler=None):
    """Impute, standardise and save the features of a test frame together with its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus ``label_col``.
    label_col : str
        Name of the label column.
    output_file : str or path-like
        CSV destination; written without the index.
    scaler : object with ``transform``, optional
        An already fitted scaler (for example one fitted on the training
        features).  When given, the test features are only transformed with it.
        When omitted, a new ``StandardScaler`` is fitted on the test features
        themselves (the previous behaviour), which scales the test set with its
        own statistics rather than the training statistics.
    """
    # Separate features and labels
    X = df.drop(columns=[label_col])
    y = df[label_col]

    # Fill missing feature values with the column mean of this frame
    X = X.fillna(X.mean())

    # Scale the features
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    # Rebuild a frame with the original feature names; the label is attached by
    # position (.values) because the scaled frame has a fresh 0..n-1 index.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    X_scaled_df[label_col] = y.values

    # Save the preprocessed dataset
    X_scaled_df.to_csv(output_file, index=False)

In [4]:
def flip_generator(item,prob):
    """Flip the label of one positive row to 0 with probability ``prob``.

    Parameters
    ----------
    item : mapping-like row (e.g. a pandas Series or dict)
        Must contain ``'label'``.  ``'label_flip'`` is created from ``'label'``
        when it is missing, so every returned row has the same keys.
    prob : float
        Probability in ``[0, 1]`` that a row with ``label == 1`` gets
        ``label_flip = 0``.  Rows with any other label are never flipped.

    Returns
    -------
    The same ``item`` object, updated in place.

    Notes
    -----
    The previous condition ``random_num > prob`` flipped with probability
    ``1 - prob``, so a 10% noise level flipped about 90% of the positives.  It
    also left ``label_flip`` undefined for rows that were not flipped.
    """
    if 'label_flip' not in item:
        item['label_flip'] = item['label']

    # random.random() is in [0, 1): prob=0 never flips, prob=1 always flips.
    if item['label'] == 1 and random.random() < prob:
        item['label_flip'] = 0

    return item

In [5]:
def create_noisy_datasets(df,noise_percentages):
    """Write one label-noise dataset per entry of ``noise_percentages`` plus the clean data.

    For each ``perc`` a fresh copy of ``df`` gets a ``label_flip`` column
    (initialised from ``label``) and every row is passed through
    ``flip_generator(row, perc)``.  The result keeps only

    * rows that are healthy in both columns (``label == 0`` and
      ``label_flip == 0``), down-sampled so that the original 0:1 ratio is
      preserved, and
    * rows that are unhealthy in both columns.

    Rows whose label was flipped are therefore NOT part of the output of this
    variant; ``create_noisy_datasets_2`` keeps them.

    Side effects: prints progress information, writes
    ``train_noise_<percent>.csv`` for each ``perc`` (``<percent>`` is ``perc``
    as a whole percentage, e.g. ``10`` for ``0.1``) and ``train.csv`` into the
    current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a binary ``label`` column.
    noise_percentages : iterable of float
        Values forwarded to ``flip_generator``.

    Raises
    ------
    ValueError
        If ``df`` has no row with ``label == 1``.
    """
    if int((df['label'] == 1).sum()) == 0:
        raise ValueError("df contains no rows with label == 1; the imbalance ratio is undefined")

    for idx, perc in enumerate(noise_percentages):
        df_copy = df.copy()
        # Must be initialised on the per-iteration copy: doing it once before
        # the loop was lost as soon as df was copied again.
        df_copy['label_flip'] = df_copy['label']
        print(idx)
        print('Original Shape of the dataset:', df.shape)
        print('Original imbalance:',df_copy.label.value_counts()/df_copy.shape[0])
        df_copy = df_copy.apply(lambda x: flip_generator(x,perc), axis=1)
        print('Flipped Shape of the dataset:', df_copy.shape)
        print('Flipped dataset imbalance:',df_copy.label_flip.value_counts()/df_copy.shape[0])

        # logic to maintain the same imbalance after flipping
        n_healthy = int((df_copy['label'] == 0).sum())
        n_unhealthy = int((df_copy['label'] == 1).sum())
        n_unhealthy_after_flip = int((df_copy['label_flip'] == 1).sum())
        # Integer arithmetic: exact floor of ratio * remaining positives, and it
        # can never exceed the number of healthy rows available for sampling.
        new_sample_size_healthy = (n_healthy * n_unhealthy_after_flip) // n_unhealthy

        # filter datasets where both labels are zero
        df_filtered_healthy = df_copy[(df_copy['label']==0) & (df_copy['label_flip']==0)]
        df_filtered_healthy = df_filtered_healthy.sample(new_sample_size_healthy)

        # filter unhealthy datasets
        df_filtered_unhealthy =  df_copy[(df_copy['label']!=0) & (df_copy['label_flip']!=0)]

        df_comb = pd.concat([df_filtered_healthy,df_filtered_unhealthy])
        df_comb = df_comb.sample(frac=1)
        print('Final Shape of the dataset:', df_comb.shape)
        print('Final Datasets after maintaining imbalance:',df_comb.label_flip.value_counts()/df_comb.shape[0])
        print("Dataset generated and saved successfully.")
        print('\n')
        # Whole percentage for the file name; perc*10 produced names such as
        # 'train_noise_3.0000000000000004.csv'.
        percentage = int(round(perc*100))
        df_comb.to_csv(f'train_noise_{percentage}.csv', index=False)

    df.to_csv('train.csv', index=False)

In [6]:
def create_noisy_datasets_2(df,noise_percentages):
    """Write one label-noise dataset per entry of ``noise_percentages``, keeping flipped rows.

    For each ``perc`` a fresh copy of ``df`` gets a ``label_flip`` column
    (initialised from ``label``) and every row is passed through
    ``flip_generator(row, perc)``.  The output combines

    * "silent sufferers": rows with ``label != 0`` but ``label_flip == 0``,
    * rows that are unhealthy in both columns, and
    * a random sample of rows that are healthy in both columns, sized so that
      the 0:1 ratio of ``label_flip`` matches the original ratio of ``label``.

    Side effects: prints progress information, writes
    ``train_noise_<percent>.csv`` for each ``perc`` (``<percent>`` is ``perc``
    as a whole percentage, e.g. ``10`` for ``0.1``) and ``train.csv`` into the
    current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a binary ``label`` column.
    noise_percentages : iterable of float
        Values forwarded to ``flip_generator``.

    Raises
    ------
    ValueError
        If ``df`` has no row with ``label == 1``, or if so many rows were
        flipped that the original ratio cannot be reached even without any
        purely healthy row.
    """
    if int((df['label'] == 1).sum()) == 0:
        raise ValueError("df contains no rows with label == 1; the imbalance ratio is undefined")

    for idx, perc in enumerate(noise_percentages):
        df_copy = df.copy()
        # label_flip has to exist before the row-wise pass; it was never
        # initialised before.
        df_copy['label_flip'] = df_copy['label']
        print(idx)
        print('Original Shape of the dataset:', df.shape)
        print('Original dist.:',df_copy.label.value_counts())
        print('Original imbalance:',df_copy.label.value_counts()/df_copy.shape[0])
        df_copy = df_copy.apply(lambda x: flip_generator(x,perc), axis=1)
        print('Flipped Shape of the dataset:', df_copy.shape)
        print('Flipped dist. :',df_copy.label_flip.value_counts())
        print('Flipped dataset imbalance:',df_copy.label_flip.value_counts()/df_copy.shape[0])

        is_healthy = df_copy['label'] == 0
        is_healthy_after_flip = df_copy['label_flip'] == 0

        # filter datasets for pure healthy
        df_pure_healthy = df_copy[is_healthy & is_healthy_after_flip]

        # filter datasets for silent sufferers
        df_silent_sufferers = df_copy[~is_healthy & is_healthy_after_flip]

        # filter pure unhealthy
        df_filtered_unhealthy = df_copy[~is_healthy & ~is_healthy_after_flip]

        # logic to maintain the same imbalance after flipping
        n_healthy = int(is_healthy.sum())
        n_unhealthy = int((df_copy['label'] == 1).sum())
        n_unhealthy_after_flip = int((df_copy['label_flip'] == 1).sum())
        # Integer arithmetic gives the exact floor of ratio * remaining positives.
        new_sample_size_healthy = (n_healthy * n_unhealthy_after_flip) // n_unhealthy

        # Silent sufferers already count as label_flip == 0, so only the
        # remainder is drawn from the purely healthy rows.
        reqd_healthy_samples = new_sample_size_healthy - df_silent_sufferers.shape[0]
        if reqd_healthy_samples < 0:
            raise ValueError(
                f"cannot keep the original imbalance for noise level {perc!r}: "
                f"{df_silent_sufferers.shape[0]} flipped rows exceed the "
                f"{new_sample_size_healthy} label_flip == 0 rows allowed"
            )
        df_pure_healthy_sample = df_pure_healthy.sample(reqd_healthy_samples)

        # combine all datasets
        df_comb = pd.concat([df_pure_healthy_sample,df_silent_sufferers,df_filtered_unhealthy])
        df_comb = df_comb.sample(frac=1)

        print('Final Shape of the dataset:', df_comb.shape)
        # Report the balanced frame here; this line used to print the counts of
        # the frame before balancing.
        print('Flipped dist. after maintaining imbalance:',df_comb.label_flip.value_counts())
        print('Final Datasets after maintaining imbalance:',df_comb.label_flip.value_counts()/df_comb.shape[0])
        print("Dataset generated and saved successfully.")
        print('\n')

        # Write datasets. Whole percentage for the file name; perc*10 produced
        # names such as 'train_noise_3.0000000000000004.csv'.
        percentage = int(round(perc*100))
        df_comb.to_csv(f'train_noise_{percentage}.csv', index=False)

    df.to_csv('train.csv', index=False)