In [None]:
import pandas as pd
# Raw table and the companion file that describes its attributes.
path1 = 'Datasets/communities.data'
path2 = 'Datasets/communities.names'

# Column names are the second whitespace-separated token of every line that
# starts with '@attribute' in the description file.
with open(path2, 'r') as file:
    headers = [entry.split()[1] for entry in file if entry.startswith('@attribute')]
print(headers)

# Load the header-less table, discard five columns by name, turn the '?'
# placeholder into a real missing value and drop rows lacking 'OtherPerCap'.
data = (
    pd.read_csv(path1, sep=',', names=headers)
    .drop(['state', 'county', 'community', 'communityname', 'fold'], axis=1)
    .replace('?', pd.NA)
    .dropna(subset=['OtherPerCap'])
)

# Share of missing values per column; every column with at least one missing
# value ends up in `high_na_columns`.
na_percentage = data.isna().mean()
high_na_columns = [label for label, share in na_percentage.items() if share > 0]
high_na_columns

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import warnings

# Silence library warnings so the grid-search output stays readable.
# NOTE: this also hides warnings about grid candidates that fail to fit.
warnings.filterwarnings("ignore")

# Rows without any missing value, restricted to the fully observed columns.
# Kept as a notebook-level variable because the refit cell further down reads it.
X_column = data.dropna()
X_column = X_column.drop(high_na_columns, axis=1)
X_column = X_column.values.astype(np.float32)

# Predictors used to impute a column: every column that has no missing values.
# NOTE(review): if the final regression target is one of these columns, the
# imputed values are derived from the target — confirm that this is acceptable.
feature_columns = [name for name in data.columns if name not in high_na_columns]

max_depth_grid = [1, 5, 10, 15, 20, 25, 30, 35, 100, 110, 190, 200, 210, None]
tree_grid = {
    # 'poisson' is the name scikit-learn accepts; the former 'poisson_loss'
    # made every candidate using it fail to fit (silently, see filter above).
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': max_depth_grid,
}

# One (estimator, parameter grid) pair per candidate model. KNN and Ridge are
# evaluated with their defaults (empty grid). XGBoost has no `criterion`
# parameter, so only its depth is searched. GridSearchCV clones the estimator,
# so the same unfitted instances can be reused for every column.
estimator_grids = [
    (KNeighborsRegressor(), {}),
    (xgb.XGBRegressor(), {'max_depth': max_depth_grid}),
    (RandomForestRegressor(random_state=42), tree_grid),
    (DecisionTreeRegressor(random_state=42), tree_grid),
    (Ridge(), {}),
]

columns_grids = {}   # column -> (fitted GridSearchCV of the best model, its hold-out MSE)
best_preds = {}      # column -> hold-out predictions of the best model
for i, column in enumerate(high_na_columns, start=1):
    # Use exactly the rows where this column is observed for both X and y, so
    # features and target stay aligned even when columns have different
    # missing-value patterns.
    observed = data[column].notna()
    X_observed = data.loc[observed, feature_columns].values.astype(np.float32)
    y_column = data.loc[observed, [column]].values.astype(np.float32)

    X_train_column, X_test_column, y_train_column, y_test_column = train_test_split(
        X_observed, y_column, test_size=0.25, random_state=42)

    # Progress marker printed before the (slow) searches start.
    print(i, "\t", column)

    best_grid, best_mse, y_pred_column = None, np.inf, None
    for estimator, hyperparam_space in estimator_grids:
        gridsearch_column: GridSearchCV = GridSearchCV(
            estimator,
            hyperparam_space,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=0
        )
        gridsearch_column.fit(X_train_column, y_train_column)

        candidate_pred = gridsearch_column.best_estimator_.predict(X_test_column)
        candidate_mse = mean_squared_error(y_test_column, candidate_pred)

        # Track the winner directly instead of keying a dict by the MSE value,
        # which would let two models with the same score overwrite each other.
        if candidate_mse < best_mse:
            best_grid, best_mse, y_pred_column = gridsearch_column, candidate_mse, candidate_pred

    best_preds[column] = y_pred_column
    columns_grids[column] = (best_grid, best_mse)

    # Report the selected model and its hold-out metrics.
    print(i, "\t", column)
    print(str(best_grid.best_estimator_.__class__))
    print('Best parameters:', best_grid.best_params_)
    mse_column = best_mse
    mae_column = mean_absolute_error(y_test_column, y_pred_column)
    r2_column = r2_score(y_test_column, y_pred_column)
    print(f"MSE: {mse_column}")
    print(f"MAE: {mae_column}")
    print(f"R^2 score: {r2_column}")

In [None]:
from copy import copy
# Keep only the imputers whose recorded MSE is below 0.2; every other column
# with missing values maps to None.
good_enough = {column: grid for column, (grid, mse) in columns_grids.items() if mse < 0.2}
column_na_model = {column: None for column in high_na_columns}
column_na_model.update(good_enough)

# Fully observed columns for every row; these are the model inputs.
feature_frame = data.drop(high_na_columns, axis=1)

# Same values as a float32 matrix. Not used below in this cell; kept because
# other cells may read it.
X_column_na = feature_frame.dropna()
X_column_na = X_column_na.values.astype(np.float32)

data_fillna = copy(data)
for column, grid in column_na_model.items():
    print(column)
    if grid is not None:
        # Refit on every row where this column is observed. Selecting X and y
        # with the same mask keeps them aligned; pairing the target with a
        # matrix built from rows complete in *all* columns breaks as soon as
        # columns have different missing-value patterns.
        observed = data[column].notna()
        X_observed = feature_frame.loc[observed].values.astype(np.float32)
        y_column = data.loc[observed, [column]].values.astype(np.float32)
        grid.best_estimator_.fit(X_observed, y_column)

In [None]:
# Model inputs: the columns that never contain missing values. Computed once,
# up front, so that dropping a model-less column later in the loop cannot make
# the feature selection fail with a KeyError.
feature_columns = [name for name in data_fillna.columns if name not in high_na_columns]

for column, model in column_na_model.items():
    if column not in data_fillna.columns:
        continue

    if model is None:
        # No acceptable imputer for this column: remove it altogether.
        data_fillna = data_fillna.drop(column, axis=1)
        continue

    missing = data_fillna[column].isna()
    if not missing.any():
        continue

    # Predict all missing rows of the column in one call and store plain
    # floats. Storing the raw prediction arrays would put array objects in the
    # cells, which serialise to CSV as text such as "[[0.12]]".
    features = data_fillna.loc[missing, feature_columns].values.astype(np.float32)
    predictions = np.asarray(model.predict(features), dtype=np.float64).reshape(-1)
    data_fillna.loc[missing, column] = predictions

In [None]:
# Write the filled table to 'data_fillna.csv' without the row index, then
# preview its first 10 rows.
data_fillna.to_csv('data_fillna.csv', index=False)
data_fillna.head(10)

In [None]:
import pandas as pd
# Read 'data_fillna.csv' back from disk, replacing the in-memory `data_fillna`,
# and display the result.
path_fillna = 'data_fillna.csv'
data_fillna = pd.read_csv(path_fillna, sep=',')
data_fillna

In [None]:
def convert_to_float(value):
    """Turn a textual cell value into a float, tolerating array-style brackets.

    Strings such as ``"[[0.5]]"``, ``"[0.5]"`` and ``"0.5"`` are all converted
    to ``0.5``. Previously only the double-bracket form was handled and every
    other string fell through and became ``None``, silently discarding
    genuine values.

    Parameters
    ----------
    value : any
        A single cell value.

    Returns
    -------
    float, None, or the input unchanged
        A float for numeric strings (with or without surrounding brackets),
        ``None`` for strings that do not hold a single number, and the input
        itself for anything that is not a string.
    """
    if not isinstance(value, str):
        return value

    # Remove surrounding whitespace and any number of enclosing '[' / ']'.
    text = value.strip().strip("[]")
    try:
        return float(text)
    except ValueError:
        # Not a single number (e.g. free text): report it as missing.
        return None
# Drop every row that still has a missing value, pass each cell through
# convert_to_float, cast the whole table to float32 and display it.
# NOTE(review): `applymap` is deprecated in recent pandas releases in favour of
# `DataFrame.map` — confirm against the pandas version this notebook targets.
data_fillna=data_fillna.dropna()
data_fillna=data_fillna.applymap(convert_to_float)
data_fillna=data_fillna.astype(np.float32)
data_fillna

In [None]:
import numpy as np

# Features are all columns except the last one; the last column is the target.
# NOTE(review): this relies on the target being the final column of
# `data_fillna` — confirm against the column order of the loaded table.
X_fillna = data_fillna.iloc[:, :-1].values.astype(np.float32)
y_fillna = data_fillna.iloc[:, -1].values.astype(np.float32)

from sklearn.model_selection import train_test_split

# 75 % / 25 % split with a fixed seed; the training features are displayed.
X_train_fillna, X_test_fillna, y_train_fillna, y_test_fillna = train_test_split(X_fillna, y_fillna, test_size=0.25, random_state=42)
X_train_fillna

In [None]:
import tensorflow as tf

# Three Dense layers with 50 % dropout after the first two. The first layer has
# as many units as there are training rows, the second half as many, and the
# last one a single output unit.
# NOTE(review): no `activation` is passed to any Dense layer; with the Keras
# default this presumably makes the whole stack a linear model — confirm that
# this is intended.
model_fillna = tf.keras.Sequential([
    tf.keras.layers.Dense(X_train_fillna.shape[0], input_shape=(X_train_fillna.shape[1],)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(int(X_train_fillna.shape[0] * 1 / 2)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)#,
   
])

from keras.optimizers.schedules import ExponentialDecay

# Learning-rate schedule handed to the optimizers created in the training
# cells: starts at 1e-4 and is multiplied by 0.9 every 10000 steps (staircase).
initial_learning_rate = 0.0001
lr_schedule = ExponentialDecay(
    initial_learning_rate, decay_steps=10000, decay_rate=0.9, staircase=True
)

model_fillna

In [None]:
# Compile with a new Adamax optimizer (MSE loss, MAE metric) and train for 3000
# epochs with a batch size equal to the number of training rows. The test split
# is passed as validation data.
optimizer = tf.keras.optimizers.Adamax(learning_rate=lr_schedule)
model_fillna.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
model_fillna.fit(X_train_fillna, y_train_fillna, epochs=3000, batch_size=X_train_fillna.shape[0],
                 validation_data=(X_test_fillna, y_test_fillna))

In [None]:
# Compile with a new Adamax optimizer and train for 3000 epochs with a batch
# size of half the training rows; the test split is passed as validation data.
# NOTE(review): `model_fillna` is the same object on every such call, so this
# presumably continues from whatever weights it already has — confirm.
optimizer = tf.keras.optimizers.Adamax(learning_rate=lr_schedule)
model_fillna.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
model_fillna.fit(X_train_fillna, y_train_fillna, epochs=3000, batch_size=int(X_train_fillna.shape[0]/2),
                 validation_data=(X_test_fillna, y_test_fillna))

In [None]:
# Compile with a new Adamax optimizer and train for 3000 epochs with a batch
# size of one fifth of the training rows; the test split is passed as
# validation data.
# NOTE(review): `model_fillna` is the same object on every such call, so this
# presumably continues from whatever weights it already has — confirm.
optimizer = tf.keras.optimizers.Adamax(learning_rate=lr_schedule)
model_fillna.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
model_fillna.fit(X_train_fillna, y_train_fillna, epochs=3000, batch_size=int(X_train_fillna.shape[0]/5),
                 validation_data=(X_test_fillna, y_test_fillna))

In [None]:
# Compile with a new Adamax optimizer and train for 3000 epochs with a batch
# size of one tenth of the training rows; the test split is passed as
# validation data.
# NOTE(review): `model_fillna` is the same object on every such call, so this
# presumably continues from whatever weights it already has — confirm.
optimizer = tf.keras.optimizers.Adamax(learning_rate=lr_schedule)
model_fillna.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
model_fillna.fit(X_train_fillna, y_train_fillna, epochs=3000, batch_size=int(X_train_fillna.shape[0]/10),
                 validation_data=(X_test_fillna, y_test_fillna))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out split and compute the three regression metrics.
y_pred_fillna = model_fillna.predict(X_test_fillna)
mse_fillna, mae_fillna, r2_fillna = (
    metric(y_test_fillna, y_pred_fillna)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Raw predictions and true values first, followed by a blank gap and the
# metric summary.
print(y_pred_fillna, y_test_fillna, "\n", sep="\n")
print(f"MSE: {mse_fillna}", f"MAE: {mae_fillna}", f"R^2 score: {r2_fillna}", sep="\n")