<a href="https://colab.research.google.com/github/bozorgpanah/The-Explainable-Machine-Learning-Model-withPrivacy/blob/main/Paper1/MDAV%2BSHAP_USAHousing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
MDAV
SHAP
ON
USAHousing
"""

In [None]:
# Upload the dataset from the local drive into the Colab runtime.
# The result is indexed by file name in the next cell (uploaded['USA_Housing.csv'])
# and the selected entry is wrapped in io.BytesIO before being parsed.
from google.colab import files
uploaded = files.upload()

Saving USA_Housing.csv to USA_Housing.csv


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
import io

## Load the uploaded file (USA_Housing.csv) into a pandas DataFrame
csv_bytes = uploaded['USA_Housing.csv']
df = pd.read_csv(io.BytesIO(csv_bytes))

# Print the column names, dtypes and non-null counts
df.info()


# Count the missing values of every column (displayed as the cell output)
df.isna().sum()
# ------------------------------------------------------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [None]:
##Feature Vector and Target Variable
# NOTE(review): 'Price' is selected into X as well as being the target y. mdav()
# averages the X part of every cluster to build its centroids, and the centroid CSV
# written further down has a 'Price' column, so this looks intentional -- confirm.
X = df[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms', 'Area Population','Price']] #Records handed to MDAV (five area attributes plus Price)
y = df['Price'] #Target Variable

##Splitting the data into train and test data (30% held out for testing, fixed random_state)
# The MDAV cell below calls mdav(X, y, k) on the full data, so this split is only printed here.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Print both partitions, separated by a line of '#' characters
print(X_train,y_train)
print("####################################################################################################")
print(X_test,y_test)

      Avg. Area Income  Avg. Area House Age  ...  Area Population         Price
2858       77947.46436             6.441789  ...      29660.79658  1.539129e+06
1559       69931.40110             6.099507  ...      52079.18818  1.389287e+06
1441       65841.30879             5.587065  ...      48253.27664  1.029862e+06
2179       73682.48621             4.611184  ...      34366.18010  7.660784e+05
1390       72813.80169             8.543764  ...      35014.84821  1.727007e+06
...                ...                  ...  ...              ...           ...
4931       77622.95812             6.738014  ...      51102.44195  1.599997e+06
3264       80051.84712             5.872678  ...      35254.12832  1.354609e+06
1653       67094.19707             5.346437  ...      30022.53717  1.202993e+06
2607       52541.31985             4.885243  ...      41258.26229  8.429859e+05
2732       86762.88286             6.530193  ...      47724.58136  1.571254e+06

[3500 rows x 6 columns] 2858    1.53912

In [None]:
#MDAV: a microaggregation method
def dist(x,y):
    """Return the norm of ``x - y`` (the Euclidean distance for 1-D vectors)."""
    difference = x - y
    return np.linalg.norm(difference)
    
#Remove one row from a 2-D array and hand it back together with the remaining rows
def poprow(arr,i):
    """Pop row ``i`` out of the 2-D array ``arr``.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array; it is not modified.
    i : int
        Index of the row to remove. Negative indices count from the end.

    Returns
    -------
    new_array : numpy.ndarray
        A new array holding every row of ``arr`` except row ``i``.
    pop : numpy.ndarray
        Row ``i`` of ``arr``.

    Raises
    ------
    IndexError
        If ``i`` is out of bounds for ``arr``.
    """
    pop = arr[i]
    # np.delete is used instead of stacking arr[:i] and arr[i+1:]: with a negative
    # index (e.g. -1) those two slices overlap and the stacked result contains
    # duplicated rows instead of one row fewer.
    new_array = np.delete(arr, i, axis=0)
    return new_array,pop

def cluster(X, p, k, dist_to_xr):
    """Build one cluster of ``k`` records around the seed record ``p``.

    Every record is a 1-D row whose last element is the target value; only the
    leading elements take part in the distance computation.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of the records that are still unassigned (``p`` not included).
    p : numpy.ndarray
        Seed record of the new cluster.
    k : int
        Cluster size: the seed plus its ``k - 1`` nearest records from ``X``.
    dist_to_xr : sequence of float or None
        Distances from each row of ``X`` to ``p``, in row order. When ``None``
        they are computed here with ``dist``.

    Returns
    -------
    X : numpy.ndarray
        The rows of ``X`` that were not put into the cluster (row order is the
        one produced by the partition, not the input order).
    cl : tuple
        ``(xc, yc)``: ``xc`` is the 2-D array of the members without their last
        element, ``yc`` the 1-D array of the members' last elements.

    Raises
    ------
    ValueError
        Raised by ``np.argpartition`` when ``k - 1`` is not a valid position in
        ``X`` (i.e. fewer than ``k`` records are left).
    """
    members = [p]

    # Identity test: `== None` would be evaluated element-wise if an array were passed.
    if dist_to_xr is None:
        distances = [dist(v[:-1],p[:-1]) for v in X]
    else:
        distances = dist_to_xr

    # After the partition the first k-1 rows are the k-1 records closest to p.
    X = X[np.argpartition(distances, k-1)]
    members.extend(X[:k-1])
    X = X[k-1:]

    # No `copy=False` here: building an array from a list always needs a copy,
    # and NumPy 2 raises ValueError when `copy=False` cannot be honoured.
    xc = np.array([row[:-1] for row in members], ndmin=2)
    yc = np.array([row[-1] for row in members])
    cl = (xc, yc)

    return X, cl

def _rows_to_cluster(rows):
    """Split records (rows ending with the target) into an ``(xc, yc)`` cluster tuple."""
    # No `copy=False`: NumPy 2 raises ValueError when a copy cannot be avoided,
    # which is always the case when an array is built from a list.
    xc = np.array([r[:-1] for r in rows], ndmin=2)
    yc = np.array([r[-1] for r in rows])
    return (xc, yc)


def mdav(X, y, k):
    """Microaggregate the records ``(X, y)`` with the MDAV heuristic.

    ``X`` and ``y`` are stacked column-wise into records whose last column is
    ``y``; that last column is ignored by every distance computation.

    While at least ``3*k`` records remain, the record furthest from the current
    mean (``xr``) and the record furthest from ``xr`` (``xs``) each seed a
    cluster of ``k`` records. If between ``2*k`` and ``3*k - 1`` records are left
    afterwards, one more cluster of ``k`` is built around the record furthest
    from the mean. Whatever remains forms the last cluster.

    Parameters
    ----------
    X : array-like
        2-D table of attributes, one row per record.
    y : array-like
        One target value per record.
    k : int
        Size of the clusters built around a seed; must be at least 1.

    Returns
    -------
    clusters : list of tuple
        One ``(xc, yc)`` tuple per cluster (member attributes, member targets).
    centroids : numpy.ndarray
        One row per cluster: the column-wise mean of that cluster's ``xc``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    D = np.column_stack((X,y)) #D holds the records that are not clustered yet
    clusters = []
    while len(D) >= 3*k:
        # Centroid of the remaining records (mean of every column)
        xm = np.mean(D, axis=0)
        # xr: record furthest from the centroid
        xri = np.argmax([dist(v[:-1],xm[:-1]) for v in D])
        D, xr = poprow(D, xri)
        # xs: record furthest from xr
        dist_to_xr = [dist(v[:-1],xr[:-1]) for v in D]
        xsi = np.argmax(dist_to_xr)
        # Drop the entry of xs so the distances stay aligned with the rows of D
        dist_to_xr = dist_to_xr[:xsi]+dist_to_xr[xsi+1:]
        D, xs = poprow(D, xsi)

        #cluster of xr (reuses the distances computed above)
        D, c = cluster(D, xr, k, dist_to_xr)
        clusters.append(c)
        #cluster of xs
        D, c = cluster(D, xs, k, None)
        clusters.append(c)

    # The loop guarantees len(D) < 3*k here, so only the lower bound is tested.
    if len(D) >= 2*k:
        xm = np.mean(D, axis=0)
        xri = np.argmax([dist(v[:-1],xm[:-1]) for v in D])
        D, xr = poprow(D, xri)
        #cluster of xr
        D, c = cluster(D, xr, k, None)
        clusters.append(c)

    # rest of points: they always form the final cluster
    clusters.append(_rows_to_cluster(D))

    centroids = np.array([np.mean(c[0],axis=0) for c in clusters])

    return clusters, centroids

In [None]:
#Run MDAV once on the full dataset and export the clusters, centroids and masked records
import csv
import time
import statistics
exec_times = []

# Notes kept from earlier experiments.
# representativity = [0.01,0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,0.1]
# K = [int(len(X)*r) for r in representativity]#K = [int(len(X_train)*r) for r in representativity]
#
# Cluster size k -> 5000/k for the 5000-row dataset (values rounded):
#    50 -> 100.0    75 -> 66.67   100 -> 50.0    125 -> 40.0    150 -> 33.33
#   175 -> 28.57   200 -> 25.0    225 -> 22.22   250 -> 20.0    275 -> 18.18
#   300 -> 16.67   325 -> 15.38   350 -> 14.29   375 -> 13.33   400 -> 12.5
#   425 -> 11.76   450 -> 11.11   500 -> 10.0    550 -> 9.09    600 -> 8.33
#   650 -> 7.69    700 -> 7.14    750 -> 6.67    800 -> 6.25    850 -> 5.88
#   900 -> 5.56    950 -> 5.26   1000 -> 5.0
K = 5.0  # ratio between the number of records and the cluster size
# The dataset size is read from X instead of the former hard-coded 5000.
k = int(len(X)/K)
clusterings = []
centroids = []
maskedData = []
print(f'k = {k}')

start = time.time()
clustering, centroid = mdav(X, y, k)#clustering, centroid = mdav(X_train, y_train, k)
clusterings.append(clustering)
centroids.append(centroid)

# One CSV row per cluster with two cells: the text form of the cluster's
# attribute array and of its target array.
with open('clusterings_csv.csv','w',newline='') as fp:
    c = csv.writer(fp, delimiter = ',')
    for line in clusterings:
        c.writerows(line)
################################################################################
# Masked dataset: every record is replaced by the centroid of its cluster, so
# each centroid is repeated once per member of its own cluster. Repeating the
# whole centroid matrix k times only gives the right number of rows when every
# cluster has exactly k members; the last cluster can be larger.
for center, (_, member_targets) in zip(centroid, clustering):
    maskedData.extend([center]*len(member_targets))

fieldnames = ['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms', 'Area Population','Price']
with open('centroids_csv.csv','w',newline='') as f:
    z = csv.writer(f, delimiter = ',')
    z.writerow(fieldnames)
    z.writerows(maskedData)

end = time.time()
exec_times.append(end-start)
print(f'Time for calculating = {exec_times}')

#Saving centroids, clusters and masked data as text; `with` closes each file
#even if a write fails. The file names are kept exactly as they were.
with open("Centroids.txt","wt") as f_centroids:
    f_centroids.write("\n"+str(centroids)+"\n")
with open("Clusers.txt","wt") as f_clusters:
    f_clusters.write("\n"+str(clusterings)+"\n")
with open("MaskedData.txt","wt") as f_maskedData:
    f_maskedData.write("\n"+str(maskedData)+"\n")
#files.download("clusterings_csv.csv")
files.download("centroids_csv.csv")

k = 1000
Time for calculating = [0.28157782554626465]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#SHAP
!pip install shap #Install SHAP library
#Uploading a dataset from local drive to colab
#from google.colab import files
#uploaded = files.upload()

In [None]:
# Upload the dataset that the SHAP section works on from the local drive.
# The next cell looks the file up under the key 'USA_Housing.csv'.
from google.colab import files
uploaded = files.upload()

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
import io

## Load the uploaded file into a pandas DataFrame
# Dataset's name == USA_Housing.csv OR == centroids_csv (ProtectedDataset)k=3.csv
csv_bytes = uploaded['USA_Housing.csv']
df = pd.read_csv(io.BytesIO(csv_bytes))

# Print the column names, dtypes and non-null counts
df.info()

# Count the missing values of every column (displayed as the cell output)
df.isna().sum()
# ------------------------------------------------------------------

In [None]:
##Feature Vector and Target Variable
X = df[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms', 'Area Population']] #Feature Vector
y = df['Price'] #Target Variable

##Splitting the data into train and test data (30% held out for testing, fixed random_state)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

##Building the model with --> Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
model.fit(X_train, y_train)

##Generating Predictions
y_pred = model.predict(X_test)
print("Prediction: ", y_pred)

##Evaluating Performance
# The value formerly printed as "mean_squared_error" was its square root (the RMSE).
# Both metrics are now reported under their correct names.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
print("mean_squared_error: ", mse)
print("root_mean_squared_error: ", rmse)


In [None]:
## SHAP explanation: force plot of a single training sample
import shap
from IPython.display import display

# Build a tree explainer for the trained model and compute the SHAP values of the training set
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# Initialise SHAP's JavaScript support, then show the explanation of the first training row
shap.initjs()
first_sample_plot = shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X_train.iloc[0, :],
)
display(first_sample_plot)


In [None]:
# Initialise SHAP's JavaScript support again for this cell.
shap.initjs()
# Disabled: force plot over the whole training set instead of a single row
#shap.force_plot(explainer.expected_value, shap_values, X_train)


In [None]:
##SHAP Feature Importance
shap_values = shap.TreeExplainer(model).shap_values(X_train)

# Rank the features by their approximate interaction with the column at
# position 2. The ranking used to be computed and thrown away; it is now kept
# and printed.
interaction_order = shap.approximate_interactions(2, shap_values, X_train)
print("Features ordered by approximate interaction with", X_train.columns[2], ":", list(X_train.columns[interaction_order]))

# Bar chart of the features' overall importance
shap.summary_plot(shap_values, X_train, plot_type="bar")

##SHAP Summary Plot
shap.summary_plot(shap_values, X_train)

##SHAP Dependence Plot (disabled; kept as comments so the cell does not echo a string)
#shap.dependence_plot('Avg. Area Income', shap_values, X_train)
#shap.dependence_plot('Avg. Area House Age', shap_values, X_train)
#shap.dependence_plot('Avg. Area Number of Rooms', shap_values, X_train)
#shap.dependence_plot('Avg. Area Number of Bedrooms', shap_values, X_train)
#shap.dependence_plot('Area Population', shap_values, X_train)