# Missing Dataset 2

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import export_text, plot_tree

MissingData2Path = "../DataFiles/MissingData2.txt"
missing_data2 = pd.read_csv(MissingData2Path, sep=r'\s+', header=None)

# Display options for debugging
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

print(missing_data2.head())

missing_data2.replace(1.00000000000000e+99, np.nan, inplace=True) 

             0             1         2         3         4             5             6         7             8             9             10        11            12            13        14        15  \
0  6.887280e-01 -2.127199e-01  0.498783  0.836527  0.753066 -2.126668e-01  9.427240e-01 -0.452367 -7.710111e-02  1.000000e+99  1.000000e+99  0.612123  7.803668e-02  6.166293e-02 -0.080118  0.206314   
1  1.000000e+99  1.102416e-01 -0.219114 -0.007928 -0.126895  3.124277e-01  2.054785e-01  0.228858  1.000000e+99  1.524070e-01 -1.246021e-01 -0.134936  1.000000e+99 -4.521916e-01 -0.526221  0.003796   
2 -7.389290e-01 -1.099149e-01 -0.584726  0.184901 -0.124804 -1.822229e-01  1.069205e-01 -0.280529  1.000000e+99 -3.794978e-01 -4.406905e-01 -0.346389  5.204105e-01 -2.457859e-01 -0.101673 -0.029700   
3  2.455752e-01 -7.399092e-02 -0.317213 -0.238237 -0.234355 -4.551269e-01 -3.268785e-01 -0.493662 -1.230997e-01 -3.809401e-01 -2.760249e-01 -0.004638 -5.239579e-01  1.000000e+99 -0.278308 -0.17367

In [20]:
def supervised_imputation(dataframe, visualize=False):
    data = dataframe.copy()
    model = DecisionTreeRegressor()
    
    for col in data.columns[data.isnull().any()]:  # Iterate only through columns with missing values
        X = data.drop(columns=[col])  # Features
        y = data[col]  # Target
        
        # Separate rows with and without missing values
        X_train, y_train = X[~y.isnull()], y[~y.isnull()]
        X_missing = X[y.isnull()]
        
        # Train the Decision Tree Regressor
        model.fit(X_train, y_train)
        
        if visualize:
            # Print the decision tree rules
            tree_rules = export_text(model, feature_names=X.columns.astype(str))
            print(f"Decision Tree for column {col}:\n")
            print(tree_rules)
            
            # Visualize the tree
            plt.figure(figsize=(10, 6))
            plot_tree(model, feature_names=X.columns, filled=True, rounded=True)
            plt.title(f"Decision Tree for column {col}")
            plt.show()
        
        # Predict and fill missing values
        data.loc[y.isnull(), col] = model.predict(X_missing)
    
    return data

In [21]:
# Initial mean imputation to handle NaNs temporarily
mean_imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(mean_imputer.fit_transform(missing_data2), columns=missing_data2.columns)

# Perform supervised learning-based imputation
filled_data = supervised_imputation(data_imputed).round(2)
print("Missing Data 2 (AFTER)")
print(filled_data.head())

output_path = "../PredictionResults/MissingResults2.txt"
filled_data.to_csv(output_path, index=False, header=False)

Missing Data 2 (AFTER)
     0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25    26    27    28    29    30    31  \
0  0.69 -0.21  0.50  0.84  0.75 -0.21  0.94 -0.45 -0.08  0.17  0.02  0.61  0.08  0.06 -0.08  0.21  0.29  0.52  0.02 -0.12  0.07  0.01  0.17 -0.03  0.57  0.37  0.19  0.54  0.29 -0.07  0.26  0.49   
1  0.00  0.11 -0.22 -0.01 -0.13  0.31  0.21  0.23  0.27  0.15 -0.12 -0.13  0.33 -0.45 -0.53  0.00  0.10  0.25  0.22  0.06  0.13 -0.05 -0.02 -0.34  0.70 -0.16 -0.44  0.56 -0.21 -0.18  0.19 -0.40   
2 -0.74 -0.11 -0.58  0.18 -0.12 -0.18  0.11 -0.28  0.27 -0.38 -0.44 -0.35  0.52 -0.25 -0.10 -0.03  0.15  0.01 -0.17 -0.36 -0.09 -0.12 -0.28 -0.18 -0.53  0.16 -0.29 -0.46 -0.43 -0.67  0.10 -0.08   
3  0.25 -0.07 -0.32 -0.24 -0.23 -0.46 -0.33 -0.49 -0.12 -0.38 -0.28 -0.00 -0.52 -0.15 -0.28 -0.17 -0.08 -0.19 -0.28 -0.27 -0.19  0.07 -0.17 -0.10 -0.27  0.16 -0.25 -0.21 -0.80 -0.06  0.11  

Supervised Imputation with a Decision Tree is a good choice for Dataset 2 because:

- **Finds Patterns:** learns how features are connected to predict missing values in the genes and samples
- **Customized for Each Feature:** adjusts predictions based on the data
- **Works Well for 10% Missing Data:** handles moderate gaps effectively