In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, precision_score, f1_score, mean_absolute_error

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.feature_selection import SelectKBest, f_regression


In [2]:
df = pd.read_csv(r"C:\Users\asus\Desktop\E-Health methods and applications\Project\dataset_project_eHealth20232024.csv")

columns_to_drop = df.columns[5:42]  # Columns 5 to 41

df = df.drop(columns=columns_to_drop)
print(df.info())

df = df.drop_duplicates()  # Remove duplicate rows

print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        159 non-null    float64
 1   gender     160 non-null    int64  
 2   education  159 non-null    float64
 3   marital    160 non-null    int64  
 4   income     160 non-null    int64  
 5   ccs_1      158 non-null    float64
 6   ccs_2      160 non-null    int64  
 7   ccs_3      159 non-null    float64
 8   ccs_4      156 non-null    float64
 9   ccs_5      158 non-null    float64
 10  ccs_6      157 non-null    float64
 11  ccs_7      160 non-null    int64  
 12  ccs_8      160 non-null    int64  
 13  ccs_9      160 non-null    int64  
 14  ccs_10     160 non-null    int64  
 15  ccs_11     160 non-null    int64  
 16  ccs_12     160 non-null    int64  
dtypes: float64(7), int64(10)
memory usage: 21.4 KB
None
              age      gender   education     marital       income  \
count  1

In [3]:
def reverse_score(string):
    # print(df[string])
    df[string] = df[string] - 3
    df[string] = - df[string]
    df[string] = df[string] + 3
    # print(df[string])

reverse_score('ccs_3')
reverse_score('ccs_6')
reverse_score('ccs_7')
reverse_score('ccs_12')

data_ccs = pd.DataFrame()
data_ccs['Trend Skepticism'] = df[['ccs_1', 'ccs_7', 'ccs_11']].mean(axis=1)
data_ccs['Attribution Skepticism'] = df[['ccs_2', 'ccs_6', 'ccs_9']].mean(axis=1)
data_ccs['Impact Skepticism'] = df[['ccs_3', 'ccs_5', 'ccs_12']].mean(axis=1)
data_ccs['Response Skepticism'] = df[['ccs_4', 'ccs_8', 'ccs_10']].mean(axis=1)
#print (data_ccs)

In [4]:
nan_values  = df.isna().sum()
print("NaN Values in Each Column:")
print(nan_values[nan_values>0])
print("total number of NaN:", sum(nan_values[nan_values>0]))


NaN Values in Each Column:
age          1
education    1
ccs_1        2
ccs_3        1
ccs_4        4
ccs_5        2
ccs_6        3
dtype: int64
total number of NaN: 14


In [8]:
columns_with_missing = df.columns[df.isna().any()].tolist()

for col in columns_with_missing:
  # Split the dataset
  columns_with_missing = df.columns[df.isna().any()].tolist()
  df_missing = df[df[columns_with_missing].isna().any(axis=1)]
  df_no_missing = df.dropna(subset=columns_with_missing)
  X = df_no_missing.drop(columns=columns_with_missing)
  y = df_no_missing[col]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Scaling only X values
  std_scaler = preprocessing.StandardScaler()
  X_train_std_scaled = std_scaler.fit_transform(X_train)
  X_test_std_scaled = std_scaler.transform(X_test)

  # Training and Testing
  base1 = SVR(kernel='linear')
  base2 = SVR(kernel='rbf')
  base3 = DecisionTreeRegressor(max_depth=3)
  base4 = KNeighborsRegressor(n_neighbors=5)

  ada_regressor = AdaBoostRegressor()
  parameters = {
      'n_estimators': [50, 100, 200],
      'learning_rate': [0.01, 0.1, 0.5, 1],
      'estimator': [base1,base2,base3,base4],
      'loss': ['linear', 'exponential', 'square'],
      "random_state" : [42]
  }

  # GRIDSEARCH
  gs = GridSearchCV(ada_regressor, parameters, cv=3, verbose=0, n_jobs=-1)
  gs = gs.fit(X_train_std_scaled, y_train)

  best_model = gs.best_estimator_
  y_pred = best_model.predict(X_test_std_scaled)
  y_pred_train = best_model.predict(X_train_std_scaled)

  # results
  mae_train = metrics.mean_absolute_error(y_train, y_pred_train)
  mae_test = metrics.mean_absolute_error(y_test, y_pred)

  mse_train = mean_squared_error(y_train, y_pred_train)
  mse_test = mean_squared_error(y_test, y_pred)

  rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
  rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

  r2_train = r2_score(y_train, y_pred_train)
  r2_test = r2_score(y_test, y_pred)

  print("results for column", col)
  print("Best Score: %f using %s" % (gs.best_score_, gs.best_params_))
  print("MAE on Train Set: %.3f   Test Set: %.3f" % (mae_train, mae_test))
  print("MSE on Train Set: %.3f   Test Set: %.3f" % (mse_train, mse_test))
  print("RMSE on Train Set: %.3f   Test Set: %.3f" % (rmse_train, rmse_test))
  print("R^2 on Train Set: %.3f   Test Set: %.3f" % (r2_train, r2_test))
  print("")

  # Prediction
  missing_values_predictions = best_model.predict(df_missing.drop(columns=columns_with_missing))

  # Replace the missing values in the original DataFrame with the predictions
  df.loc[df_missing.index, col] = missing_values_predictions


results for column age
Best Score: 0.489808 using {'estimator': DecisionTreeRegressor(max_depth=3), 'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 50, 'random_state': 42}
MAE on Train Set: 4.182   Test Set: 5.116
MSE on Train Set: 24.578   Test Set: 39.748
RMSE on Train Set: 4.958   Test Set: 6.305
R^2 on Train Set: 0.796   Test Set: 0.594





results for column education
Best Score: 0.323100 using {'estimator': DecisionTreeRegressor(max_depth=3), 'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 100, 'random_state': 42}
MAE on Train Set: 2.566   Test Set: 3.567
MSE on Train Set: 10.663   Test Set: 19.245
RMSE on Train Set: 3.265   Test Set: 4.387
R^2 on Train Set: 0.666   Test Set: -0.129





results for column ccs_1
Best Score: 0.414682 using {'estimator': SVR(kernel='linear'), 'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 200, 'random_state': 42}
MAE on Train Set: 0.990   Test Set: 1.406
MSE on Train Set: 1.734   Test Set: 2.600
RMSE on Train Set: 1.317   Test Set: 1.612
R^2 on Train Set: 0.600   Test Set: 0.530





results for column ccs_3
Best Score: 0.555635 using {'estimator': DecisionTreeRegressor(max_depth=3), 'learning_rate': 0.01, 'loss': 'square', 'n_estimators': 50, 'random_state': 42}
MAE on Train Set: 0.644   Test Set: 1.044
MSE on Train Set: 0.557   Test Set: 2.084
RMSE on Train Set: 0.746   Test Set: 1.444
R^2 on Train Set: 0.830   Test Set: 0.437





results for column ccs_4
Best Score: 0.733243 using {'estimator': SVR(kernel='linear'), 'learning_rate': 0.1, 'loss': 'linear', 'n_estimators': 100, 'random_state': 42}
MAE on Train Set: 0.791   Test Set: 0.911
MSE on Train Set: 0.883   Test Set: 1.280
RMSE on Train Set: 0.940   Test Set: 1.131
R^2 on Train Set: 0.793   Test Set: 0.681





results for column ccs_5
Best Score: 0.426104 using {'estimator': KNeighborsRegressor(), 'learning_rate': 0.01, 'loss': 'square', 'n_estimators': 100, 'random_state': 42}
MAE on Train Set: 0.850   Test Set: 1.207
MSE on Train Set: 1.311   Test Set: 2.423
RMSE on Train Set: 1.145   Test Set: 1.557
R^2 on Train Set: 0.642   Test Set: 0.418





results for column ccs_6
Best Score: 0.538814 using {'estimator': SVR(kernel='linear'), 'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 50, 'random_state': 42}
MAE on Train Set: 0.937   Test Set: 1.313
MSE on Train Set: 1.633   Test Set: 2.804
RMSE on Train Set: 1.278   Test Set: 1.675
R^2 on Train Set: 0.642   Test Set: 0.390





In [9]:
# i'm verifying that i have predicted all the columns
nan_values  = df.isna().sum()
print("NaN Values in Each Column:")
print(nan_values[nan_values>0])

NaN Values in Each Column:
Series([], dtype: int64)
