In [1]:
import numpy as np
import pandas as pd
import os

In [3]:
def check_files_in_subdirectories():
    # Define the target file names
    target_files = ["X_num_train.npy", "X_cat_train.npy", "y_train.npy"]

    # Walk through all subdirectories in the current directory
    for root, dirs, files in os.walk("."):
        for target_file in target_files:
            if target_file in files:
                file_path = os.path.join(root, target_file)
                try:
                    # Load the .npy file
                    data = np.load(file_path, allow_pickle=True)
                    
                    # Check if the data is numeric
                    if np.issubdtype(data.dtype, np.number):
                        # If numeric, check for NaN values
                        if np.isnan(data).any():
                            print(f"NaNs found in file: {file_path}")
                    else:
                        # If categorical, print unique values
                        unique_values = np.unique(data)
                        print(f"Unique values in file {file_path}: {unique_values}")
                except Exception as e:
                    # Catch any errors during file loading and processing
                    print(f"Error loading file {file_path}: {e}")

In [4]:
check_files_in_subdirectories()

Unique values in file ./abalone/X_cat_train.npy: ['F' 'I' 'M']
Unique values in file ./cardio/X_cat_train.npy: ['0' '1' '2' '3']
Unique values in file ./fb-comments/X_cat_train.npy: ['0' '1' '10' '100' '101' '105' '106' '11' '12' '13' '14' '15' '16' '17'
 '18' '19' '2' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '3' '30'
 '31' '32' '33' '34' '35' '36' '38' '39' '4' '40' '42' '44' '45' '46' '47'
 '49' '5' '50' '51' '54' '55' '56' '57' '58' '59' '6' '60' '61' '62' '63'
 '66' '67' '68' '72' '73' '75' '76' '77' '79' '8' '80' '81' '82' '83' '85'
 '87' '89' '9' '90' '91' '92' '93' '96']
Unique values in file ./buddy/X_cat_train.npy: ['0' '0.0' '1' '1.0' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2'
 '2.0' '3' '4' '5' '6' '7' '8' '9' 'Agouti' 'Apricot' 'Black'
 'Black Brindle' 'Black Smoke' 'Black Tabby' 'Black Tiger' 'Blue'
 'Blue Cream' 'Blue Merle' 'Blue Point' 'Blue Smoke' 'Blue Tabby'
 'Blue Tick' 'Blue Tiger' 'Brown' 'Brown Brindle' 'Brown Merle'
 'Brown Tabby' 'Brown Tige

In [2]:
df = pd.DataFrame(np.load("adult/X_cat_train.npy", allow_pickle=True))
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,Private,HS-grad,Never-married,Handlers-cleaners,Own-child,White,Female,United-States
1,Private,11th,Never-married,Other-service,Own-child,White,Male,El-Salvador
2,State-gov,Some-college,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
3,Private,Bachelors,Divorced,Tech-support,Not-in-family,White,Male,United-States
4,Self-emp-inc,Masters,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States


In [5]:
for i in range(8):
    print(df.iloc[:, i].unique())

['Private' 'State-gov' 'Self-emp-inc' 'Local-gov' 'Self-emp-not-inc'
 'Federal-gov' 'Without-pay']
['HS-grad' '11th' 'Some-college' 'Bachelors' 'Masters' '10th' '12th' '9th'
 '7th-8th' 'Doctorate' 'Assoc-acdm' 'Assoc-voc' '1st-4th' '5th-6th'
 'Prof-school' 'Preschool']
['Never-married' 'Married-civ-spouse' 'Divorced' 'Separated' 'Widowed'
 'Married-spouse-absent' 'Married-AF-spouse']
['Handlers-cleaners' 'Other-service' 'Craft-repair' 'Tech-support'
 'Prof-specialty' 'Adm-clerical' 'Sales' 'Exec-managerial'
 'Transport-moving' 'Protective-serv' 'Machine-op-inspct'
 'Farming-fishing' 'Priv-house-serv' 'Armed-Forces']
['Own-child' 'Husband' 'Not-in-family' 'Other-relative' 'Unmarried' 'Wife']
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
['Female' 'Male']
['United-States' 'El-Salvador' 'Puerto-Rico' 'South' 'Ecuador' 'Mexico'
 'Vietnam' 'Yugoslavia' 'England' 'Trinadad&Tobago' 'Columbia' 'Iran'
 'Jamaica' 'Poland' 'Philippines' 'Guatemala' 'Cuba' 'Greece' 'Italy'
 '

In [6]:
for i in range(8):
    if 'nan' in df.iloc[:, i].unique():
        print(f"NaNs found in column {i}")

NaNs found in column 0
NaNs found in column 3
NaNs found in column 7


In [7]:
df.replace('nan' , np.nan, inplace=True)

In [8]:

nan_rows = df[df.isna().any(axis=1)].index
nan_rows

Index([   13,    21,    70,    84,   113,   116,   142,   180,   205,   206,
       ...
       25873, 25878, 25898, 25928, 25932, 25936, 25957, 25963, 25984, 26001],
      dtype='int64', length=1919)

In [9]:
print(f"df shape before dropping NaNs: {df.shape}")
df.drop(nan_rows, inplace=True)
print(f"df shape after dropping NaNs: {df.shape}")

df shape before dropping NaNs: (26048, 8)
df shape after dropping NaNs: (24129, 8)


In [10]:

y_train = np.load("adult/y_train.npy", allow_pickle=True)
print(f"df shape before dropping NaNs: {y_train.shape}")
y_train = np.delete(y_train, nan_rows, axis=0)
print(f"df shape after dropping NaNs: {y_train.shape}")

df shape before dropping NaNs: (26048,)
df shape after dropping NaNs: (24129,)


In [11]:
array = df.to_numpy()

In [12]:
x_num = np.load("adult/X_num_train.npy", allow_pickle=True)
print(f"df shape before dropping NaNs: {x_num.shape}")
x_num = np.delete(x_num, nan_rows, axis=0)
print(f"df shape after dropping NaNs: {x_num.shape}")

df shape before dropping NaNs: (26048, 6)
df shape after dropping NaNs: (24129, 6)


In [13]:
np.save("adult/X_cat_train.npy", array)
np.save("adult/X_num_train.npy", x_num)
np.save("adult/y_train.npy", y_train)