In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Build a synthetic, imbalanced binary-classification dataset:
# standard-normal features, 5% positive labels, and a unique row id.
np.random.seed(42)  # fixed seed so X is reproducible
n_samples = 1000
n_features = 20
positive_fraction = 0.05

X = np.random.randn(n_samples, n_features)

# Derive the negative count from the positive count so the two always sum to
# n_samples; truncating two float products independently can leave y shorter
# than X for other sample sizes.
n_positive = int(round(n_samples * positive_fraction))
n_negative = n_samples - n_positive
y = np.hstack((np.zeros(n_negative), np.ones(n_positive)))

# Create unique identifiers
ids = np.arange(n_samples)

df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
df['label'] = y
df['id'] = ids

In [3]:
# 80/20 split, stratified on 'label', with a fixed random_state.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Held-out rows, renumbered from 0.
test_set = test_df.reset_index(drop=True)

In [5]:
train_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,label,id
834,0.393378,-0.18055,-1.029655,-1.398082,2.597437,-1.129815,-0.518625,0.517,0.360861,0.520866,...,0.061344,-0.312131,0.24541,1.055366,-0.0222,-0.015547,1.068161,0.96136,0.0,834
73,-0.138456,-1.224298,-0.209023,-0.85052,-0.580523,0.588578,1.669905,0.394672,-1.195883,0.444603,...,-0.134017,0.014688,-0.784898,0.64828,-0.120948,0.419532,-0.887492,-0.437458,0.0,73
790,-1.47623,0.444875,0.642121,-0.759652,-1.023734,1.017174,-0.990861,-0.606664,0.887309,0.034741,...,-0.530011,-0.402757,0.255784,0.625077,1.961104,-1.660068,0.825952,0.74301,0.0,790
144,-2.238231,-2.1207,-0.606865,0.457687,-2.747505,-0.49973,-0.526248,1.388338,-0.385022,0.382989,...,0.768207,0.215397,0.508269,3.926238,-2.084113,1.724697,-0.287448,0.287329,0.0,144
169,2.165056,1.190549,0.212574,1.026986,1.1059,-0.563947,-0.816217,0.078143,0.861636,0.13906,...,-0.073973,-0.075666,1.972542,-1.385988,0.505589,1.489113,2.27145,-0.404397,0.0,169


In [4]:
# Multiple training datasets with different masking percentages.
masking_percentages = [5, 10, 25, 35, 50]

# Pick the feature columns by name rather than by position, so the mask
# shape and the columns being overwritten cannot drift apart if 'label' or
# 'id' ever stop being the last two columns.
feature_cols = [col for col in train_df.columns if col not in ('label', 'id')]

for percentage in masking_percentages:
    train_masked_df = train_df.copy()
    # True keeps a cell; a cell is replaced by NaN when its uniform draw is
    # not greater than percentage / 100.
    mask = np.random.rand(len(train_masked_df), len(feature_cols)) > (percentage / 100.0)
    train_masked_df[feature_cols] = train_masked_df[feature_cols].where(mask, float('nan'))

    # Reset the index and save the dataset
    train_masked_df = train_masked_df.reset_index(drop=True)
    train_masked_df.to_csv(f'train_masked_{percentage}.csv', index=False)

# write the datasets
test_set = test_set.reset_index(drop=True)
test_set.to_csv('test_set.csv', index=False)

print("Datasets generated and saved successfully.")

Datasets generated and saved successfully.


In [6]:
train_masked_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,label,id
0,0.393378,,-1.029655,-1.398082,,-1.129815,-0.518625,0.517,0.360861,0.520866,...,,-0.312131,,,-0.0222,,1.068161,,0.0,834
1,,-1.224298,,-0.85052,-0.580523,0.588578,1.669905,0.394672,-1.195883,,...,-0.134017,0.014688,-0.784898,0.64828,,0.419532,-0.887492,,0.0,73
2,,0.444875,,,-1.023734,,-0.990861,,,0.034741,...,-0.530011,-0.402757,,0.625077,,,,,0.0,790
3,,-2.1207,-0.606865,,,-0.49973,,,,0.382989,...,0.768207,0.215397,0.508269,,-2.084113,1.724697,,0.287329,0.0,144
4,2.165056,1.190549,,1.026986,,,,,,,...,-0.073973,,,,,1.489113,,,0.0,169


In [7]:
import os
import pandas as pd

In [9]:
# Location of the AIDS classification CSV. Set the AIDS_DATA_CSV environment
# variable to point at the file on another machine; the default is the
# original hard-coded local path.
data_path = os.environ.get(
    'AIDS_DATA_CSV',
    '/Users/A117870943/Documents/DT/Innovation/archive/AIDS_Classification_15000.csv',
)
data = pd.read_csv(data_path)

In [11]:
data.shape

(15000, 23)

In [10]:
data.head()

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,1108,1,37,88.11364,0,1,1,100,0,1,...,1,1,0,0,0,389,320,734,737,1
1,1079,0,43,66.77075,0,0,0,100,0,1,...,0,2,0,1,1,318,432,912,1213,0
2,492,1,34,82.91725,0,0,0,90,0,1,...,1,2,0,1,1,326,524,660,835,0
3,1191,1,41,98.91817,0,0,0,81,0,1,...,1,3,0,1,0,318,232,1131,982,1
4,1141,3,47,53.61717,0,1,0,100,0,0,...,0,1,0,1,0,280,337,515,679,0


In [12]:
data.columns

Index(['time', 'trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'treat', 'offtrt', 'cd40', 'cd420', 'cd80', 'cd820',
       'infected'],
      dtype='object')

In [13]:
# Rename the target column 'infected' to 'label' (modifies `data` in place).
data.rename(columns={'infected':'label'}, inplace=True)

In [15]:
# Every column except the first and the last one (per the column listing
# above: 'time' first, the renamed 'label' last).
cols = data.columns[1:-1]

In [18]:
# Rename the selected columns to f_0, f_1, ... with a single rename call
# instead of one in-place rename per column.
data.rename(
    columns={col: 'f_' + str(idx) for idx, col in enumerate(cols)},
    inplace=True,
)

In [19]:
data.head()

Unnamed: 0,time,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,...,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,label
0,1108,1,37,88.11364,0,1,1,100,0,1,...,1,1,0,0,0,389,320,734,737,1
1,1079,0,43,66.77075,0,0,0,100,0,1,...,0,2,0,1,1,318,432,912,1213,0
2,492,1,34,82.91725,0,0,0,90,0,1,...,1,2,0,1,1,326,524,660,835,0
3,1191,1,41,98.91817,0,0,0,81,0,1,...,1,3,0,1,0,318,232,1131,982,1
4,1141,3,47,53.61717,0,1,0,100,0,0,...,0,1,0,1,0,280,337,515,679,0


In [20]:
# Remove the 'time' column from `data` in place.
data.drop('time',axis=1,inplace=True)

In [31]:
data.label.value_counts()/data.shape[0]

label
0    0.691267
1    0.308733
Name: count, dtype: float64

In [23]:
import random

In [33]:
def flip_generator(item, prob, rng=None):
    """Randomly relabel a positive row as negative in its 'label_flip' field.

    One uniform draw on [0, 1] is taken on every call. When
    ``item['label'] == 1`` and the draw is greater than ``prob``, the
    returned row has ``'label_flip'`` set to 0. Rows with any other label,
    or whose draw is not greater than ``prob``, come back with their
    existing fields unchanged.

    Parameters
    ----------
    item : mapping-like (e.g. a pandas Series row or a dict)
        Must expose ``item['label']`` and support ``.copy()``.
    prob : float
        Threshold compared against the uniform draw.
    rng : object with a ``uniform(a, b)`` method, optional
        Random source. Defaults to the module-level ``random`` functions;
        pass ``random.Random(seed)`` for reproducible flips.

    Returns
    -------
    A copy of ``item``, with ``'label_flip'`` overwritten when flipped.
    """
    source = random if rng is None else rng
    # Draw unconditionally so the random stream advances once per row,
    # whatever the label is.
    random_num = source.uniform(0, 1)

    # Work on a copy: pandas does not support functions passed to
    # DataFrame.apply mutating the row they receive.
    result = item.copy()
    if item['label'] == 1 and random_num > prob:
        result['label_flip'] = 0

    return result

In [27]:
# NOTE(review): this cell's execution count (27) is lower than that of the
# cell that assigns the flipped frame to `df` (34), yet the output below
# already shows f_* columns. It reflects an earlier kernel state and will
# differ on a fresh top-to-bottom run.
df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,label
0,1.0,37.0,88.11364,0.0,1.0,1.0,100.0,0.0,1.0,169.0,...,1.0,1.0,0.0,0.0,0.0,389.0,320.0,734.0,737.0,0.0
1,0.0,43.0,66.77075,0.0,0.0,0.0,100.0,0.0,1.0,654.0,...,0.0,2.0,0.0,1.0,1.0,318.0,432.0,912.0,1213.0,0.0
2,1.0,34.0,82.91725,0.0,0.0,0.0,90.0,0.0,1.0,710.0,...,1.0,2.0,0.0,1.0,1.0,326.0,524.0,660.0,835.0,0.0
3,1.0,41.0,98.91817,0.0,0.0,0.0,81.0,0.0,1.0,992.0,...,1.0,3.0,0.0,1.0,0.0,318.0,232.0,1131.0,982.0,1.0
4,3.0,47.0,53.61717,0.0,1.0,0.0,100.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,280.0,337.0,515.0,679.0,0.0


In [30]:
df.label.value_counts()/df.shape[0]

label
0.0    0.844933
1.0    0.155067
Name: count, dtype: float64

In [32]:
# Initialise 'label_flip' as a copy of 'label' so flip_generator has a
# column to overwrite.
data['label_flip'] = data['label']

In [34]:
# Seed the stdlib RNG that flip_generator draws from so the set of flipped
# rows is the same on every run.
random.seed(42)
# Row-wise pass with threshold 0.5.
# NOTE(review): this rebinds `df`, replacing the synthetic frame built
# earlier in the notebook.
df = data.apply(lambda x: flip_generator(x,0.5), axis=1)

In [35]:
df.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,label,label_flip
0,1.0,37.0,88.11364,0.0,1.0,1.0,100.0,0.0,1.0,169.0,...,1.0,0.0,0.0,0.0,389.0,320.0,734.0,737.0,1.0,1.0
1,0.0,43.0,66.77075,0.0,0.0,0.0,100.0,0.0,1.0,654.0,...,2.0,0.0,1.0,1.0,318.0,432.0,912.0,1213.0,0.0,0.0
2,1.0,34.0,82.91725,0.0,0.0,0.0,90.0,0.0,1.0,710.0,...,2.0,0.0,1.0,1.0,326.0,524.0,660.0,835.0,0.0,0.0
3,1.0,41.0,98.91817,0.0,0.0,0.0,81.0,0.0,1.0,992.0,...,3.0,0.0,1.0,0.0,318.0,232.0,1131.0,982.0,1.0,0.0
4,3.0,47.0,53.61717,0.0,1.0,0.0,100.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,280.0,337.0,515.0,679.0,0.0,0.0


In [38]:
df.label.value_counts()/df.shape[0]

label
0.0    0.691267
1.0    0.308733
Name: count, dtype: float64

In [39]:
df.label_flip.value_counts()/df.shape[0]

label_flip
0.0    0.847467
1.0    0.152533
Name: count, dtype: float64

In [40]:
df.shape

(15000, 23)

In [44]:
df.label.value_counts()

label
0.0    10369
1.0     4631
Name: count, dtype: int64

In [41]:
df.label_flip.value_counts()

label_flip
0.0    12712
1.0     2288
Name: count, dtype: int64

In [51]:
# Target number of label_flip == 0 rows: the size at which the 0:1 ratio of
# 'label_flip' equals the 0:1 ratio of the original 'label'.
true_counts = df.label.value_counts()
flipped_counts = df.label_flip.value_counts()
new_sample_size_healthy = (true_counts[0] / true_counts[1]) * flipped_counts[1]
new_sample_size_healthy

5122.926365795724

In [52]:
# Surplus of label_flip == 0 rows over new_sample_size_healthy.
no_samples_drop = df.label_flip.value_counts()[0]-new_sample_size_healthy
no_samples_drop

7589.073634204276

In [53]:
# Rows that are 0 in both the original label and the flipped label.
df_filtered = df[(df['label']==0) & (df['label_flip']==0)]

In [80]:
df[(df['label']!=0) & (df['label_flip']==0)].shape

(2343, 23)

In [81]:
df[(df['label']==0) & (df['label_flip']==0)].shape

(10369, 23)

In [83]:
df[(df['label']==0) & (df['label_flip']==0)].shape[0] - df[(df['label']!=0) & (df['label_flip']==0)].shape[0]

8026

In [84]:
# a: the same expression as new_sample_size_healthy, recomputed.
# b: a minus the number of rows with label != 0 and label_flip == 0.
a = (df.label.value_counts()[0]/df.label.value_counts()[1])*df.label_flip.value_counts()[1]
b = a - df[(df['label']!=0) & (df['label_flip']==0)].shape[0]
b

2779.9263657957245

In [85]:
df[(df['label']!=0) & (df['label_flip']==0)].shape[0]+b

5122.926365795724

In [62]:
new_sample_size_healthy

5122.926365795724

In [63]:
# Down-sample the label == 0 / label_flip == 0 rows to the target size
# (int() truncates the fractional part). random_state pins the draw so the
# selected subset is reproducible across runs.
df_filtered = df_filtered.sample(n=int(new_sample_size_healthy), random_state=42)
df_filtered.shape

(5122, 23)

In [67]:
# Rows whose original label and flipped label are both non-zero.
df_filtered_unhealthy =  df[(df['label']!=0) & (df['label_flip']!=0)]
df_filtered_unhealthy.shape

(2288, 23)

In [68]:
# Stack the two subsets row-wise; each row keeps its index from `df`
# (ignore_index is not set).
df_comb = pd.concat([df_filtered,df_filtered_unhealthy])

In [69]:
df_comb.shape

(7410, 23)

In [70]:
df_comb.label_flip.value_counts()/df_comb.shape[0]

label_flip
0.0    0.691228
1.0    0.308772
Name: count, dtype: float64

In [76]:
df_comb.label.value_counts()

label
0.0    5122
1.0    2288
Name: count, dtype: int64

In [77]:
df_comb.label_flip.value_counts()

label_flip
0.0    5122
1.0    2288
Name: count, dtype: int64

In [72]:
# Seed the stdlib RNG that flip_generator draws from so this pass is
# reproducible.
random.seed(42)
# Row-wise flip with threshold 0.2 (the earlier pass on `data` used 0.5).
df1 = data.apply(lambda x: flip_generator(x,0.2), axis=1)

In [73]:
# Target number of label_flip == 0 rows for df1: the size at which the 0:1
# ratio of 'label_flip' equals the 0:1 ratio of the original 'label'.
true_counts1 = df1.label.value_counts()
flipped_counts1 = df1.label_flip.value_counts()
new_sample_size_healthy1 = (true_counts1[0] / true_counts1[1]) * flipped_counts1[1]
new_sample_size_healthy1

2001.7028719499028

In [74]:
# Surplus of label_flip == 0 rows in df1 over new_sample_size_healthy1.
no_samples_drop1 = df1.label_flip.value_counts()[0]-new_sample_size_healthy1
no_samples_drop1

12104.297128050097