In [1]:
import numpy as np
import pandas as pd
import math
import statistics
import matplotlib.pyplot as plt
from transformers import pipeline
from sklearn.model_selection import train_test_split, cross_val_score,KFold, cross_val_predict, GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier


In [2]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

## Download All Policies - Train Set

In [3]:
##Privacy Policies download:
websites = ['Google','Aws','AliExpress','Meta','TikTok','YouTube','Waze','Wix','Bookings','whatsapp'
            ,'apple','wolt','Visa','Mastercard','AirBNB','uber','Spotify','samsung','Wordpress','instagram'
            ,'MacDonalds','FDA','Oracle','Zara','cocacola','Xiaomi','Nasdaq','Walmart'
           ,'AirCanada','Lufthansa','shopify','Netflix','adobe','Starbucks','Shoppers','Decathlon','waltdisney'
            ,'AmericanEagle','lululemon','SAP','JetBrains','MySQLCode','Cadens','EpicGames'
            ,'unitedHealthGroup','Slack','SalesForce','JPMorgan','JohnsonAndJohnson']

sequence_to_classify = []

for i in websites:
    try:
        with open((i+".txt"), "r") as f:
            i = f.readlines()
            sequence_to_classify.append(i)
    except:
        with open((i+".txt"), "r", encoding='cp1252') as f:
            i = f.readlines()
            sequence_to_classify.append(i)

### Train Set Vectors:

In [4]:
policy_change_train = [1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,1,1,1,1,1,0,1,0,1,0,1,1,1,1,1,0,0]
third_party_train = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1]
first_party_use_train = [1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
user_edit_data_train = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,0,1]
data_collection_train = [[1,2,3,4,5,6,7,8],[1,2,3,4,5,6],[1,3,5,6,7],[1,3,7],[1,2,3,4,6,7]
                         ,[1,2,3,4,5,6,7],[1,4,5,6,7],[1,3,5,6,7],[1,2,3,5,6,7],[1,2,3,4,5,6,7]
                         ,[1,2,3,5,6,7],[1,3,5,6,7],[1,3,7],[1,2,3,7],[1,3,7]
                         ,[1,2,5,6],[1,2,3,4,5,6,7],[1,2,3,4,5,6,7],[1,3,5,7],[1,2,3,6]
                         ,[1,3,6,7],[1,2,3,5,6,7,8],[1,2,3,4,5,6,7],[1,3,5,6,7],[1,3,5,6,7,8]
                         ,[1,3,5,6,7],[1,2,3,4,5,6,7,8],[1,2,3,4,5,6,7],[1,3,5,6],[1,2,3,5,6,7,8]
                         ,[1,2,3,5,6,7,8],[1,2,3,4,5,6,7,8],[1,2,3,5,6,7,8],[1,2,3,4,5,6,7],[1,3,4,5,6,7]
                         ,[1,3,5,7],[1,2,3,4,5,6,7,8],[1,3,7],[1,2,3,4,5,6,7,8],[1,2,3,4,5,6,7]
                         ,[1,2,3,5,7,8],[1,3,7],[1,3,5,7,8],[1,3,4,5,6,7,8],[1,2,3,6,7]
                         ,[1,3,5,6,7,8],[1,2,3,4,5,6,7],[1,3,5,6,7,8],[1,3,5,6,7]]
data_security_train = [[1,3],[1,3,4],[3],[2],[3],[3],[3],[3,4],[1],[3],[1],[1],[1],[1],[1],[2],[1],[1],[3],[2]
                       ,[3],[1],[1],[1],[1],[1,2],[1],[1],[1],[2],[1],[3],[2],[2],[3],[1],[3],[3],[3],[1,2]
                       ,[1,2],[3],[3],[3],[3],[2],[2],[3],[3]]



### Privacy Policy Parameter

In [5]:
# ##Privacy Policy Change Matrix:
# matrix_policy_change_Comparison = pd.DataFrame(columns = ['Phrase',
#                                                'Google','Aws','AliExpress','Meta','TikTok','YouTube','Waze','Wix','Bookings','whatsapp'
#             ,'apple','wolt','Visa','Mastercard','AirBNB','uber','Spotify','samsung','Wordpress','instagram'
#             ,'MacDonalds','FDA','Oracle','Zara','cocacola','Xiaomi','Nasdaq','Walmart'
#            ,'AirCanada','Lufthansa','shopify','Netflix','adobe','Starbucks','Shoppers','Decathlon','waltdisney'
#             ,'AmericanEagle','lululemon','SAP','JetBrains','MySQLCode','Cadens','EpicGames'
#             ,'unitedHealthGroup','Slack','SalesForce','JPMorgan','JohnsonAndJohnson'])
# matrix_policy_change_Comparison['Phrase'] = ['Privacy Policy Change']

# ##Filling the matrix:
# websites_index = 0
# line = 0
# pred = 0
# policy_change_train_pred_Comparison = []
# c = ['We update this Privacy Policy without prior notice','you will be notify for policy changes']

# for j in sequence_to_classify:
#     a = classifier(str(j),c)
#     if a["labels"][0]==c[0] and a["scores"][0]>0.8:
#         pred = 0
#     else:
#         pred = 1  
#     matrix_policy_change_Comparison[websites[websites_index]].loc[line] = pred
#     policy_change_train_pred_Comparison.append(pred)
#     websites_index += 1
#     pred = 0

# matrix_policy_change_Comparison


In [6]:
##Privacy Policy Change Matrix:
Phrases = ['Privacy Policy'
           ,'we will update on privacy policy change'
           ,'We may update this Privacy Policy from time to time and without prior notice to you'
           ,'you will be notify for policy changes'
           ,'we will tell you on privacy policy change'
           ,'we will not update you on privacy policy change'
           ,'in case of privacy policy change you will be notify'
           ,'in case of privacy policy change you will not be notify']

##Matrix creation:
matrix_policy_change_ML = pd.DataFrame(columns = Phrases)
matrix_policy_change_ML['Privacy Policy'] = websites
matrix_policy_change_ML.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_policy_change_ML:
        matrix_policy_change_ML.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_policy_change_ML.iloc[:35,:] 
X_test = matrix_policy_change_ML.iloc[34:,:]
y_train = policy_change_train[:35]
y_test = policy_change_train[34:]


##Decission Tree Classifaier
pred_tree = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_list=[accuracy_score(y_test, pred_tree)
                    ,accuracy_score(y_test, pred_forest)
                    ,accuracy_score(y_test, pred_clf)
                    ,accuracy_score(y_test, pred_SVM)
                    ,accuracy_score(y_test, pred_KNN)
                    ,accuracy_score(y_test, pred_neural_network)]

pred_list_max = pred_list.index(max(pred_list))

if pred_list_max == 0:
    policy_change_ML = pred_tree
elif pred_list_max == 1:
    policy_change_ML = pred_forest
elif pred_list_max == 2:
    policy_change_ML = pred_clf
elif pred_list_max == 3:
    policy_change_ML = pred_SVM
elif pred_list_max == 4:
    policy_change_ML = pred_KNN
else:
    policy_change_ML = pred_neural_network

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


###  First Party Use Parameter

In [7]:
##first_party_use Matrix:
Phrases = ['Privacy Policy'
           ,'the user information is being saved'
           ,'We use your personal information'
           ,'We collect and use your personal information'
           ,'We will use the information we collect'
           ,'We will use the information you provide']

##Matrix creation:
matrix_first_party_ML = pd.DataFrame(columns = Phrases)
matrix_first_party_ML['Privacy Policy'] = websites
matrix_first_party_ML.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_first_party_ML:
        matrix_first_party_ML.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_first_party_ML.iloc[:35,:] 
X_test = matrix_first_party_ML.iloc[34:,:]
y_train = first_party_use_train[:35]
y_test = first_party_use_train[34:]


##Decission Tree Classifaier
pred_tree = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_list=[accuracy_score(y_test, pred_tree)
                    ,accuracy_score(y_test, pred_forest)
                    ,accuracy_score(y_test, pred_clf)
                    ,accuracy_score(y_test, pred_SVM)
                    ,accuracy_score(y_test, pred_KNN)
                    ,accuracy_score(y_test, pred_neural_network)]

pred_list_max = pred_list.index(max(pred_list))

if pred_list_max == 0:
    first_party_ML = pred_tree
elif pred_list_max == 1:
    first_party_ML = pred_forest
elif pred_list_max == 2:
    first_party_ML = pred_clf
elif pred_list_max == 3:
    first_party_ML = pred_SVM
elif pred_list_max == 4:
    first_party_ML = pred_KNN
else:
    first_party_ML = pred_neural_network



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Third Party Transfer Parameter

In [8]:
##Third Party Transfer Matrix:
Phrases = ['Privacy Policy'
           ,'we share personal information'
           ,'the data is being transferred to third party'
           ,'We will share your personal data with third parties'
           ,'we will transferred your data to third party'
           ,'third parties have access to your Personal Data'
           ,'We will share your personal data with third parties'
           ,'Guest User personal data may be shared with third parties for these purposes'
           ,'we do not share your personal data with third parties outside of the organization unless one of the following circumstances applies'
           ,'we may also transfer such personal data to third parties'
           ,'We will not sell or lease your personal information to third parties unless we have your permission or are required by law to do so']

##Matrix creation:
matrix_third_party_ML = pd.DataFrame(columns = Phrases)
matrix_third_party_ML['Privacy Policy'] = websites
matrix_third_party_ML.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_third_party_ML:
        matrix_third_party_ML.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_third_party_ML.iloc[:35,:] 
X_test = matrix_third_party_ML.iloc[34:,:]
y_train = third_party_train[:35]
y_test = third_party_train[34:]


##Decission Tree Classifaier
pred_tree = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_list=[accuracy_score(y_test, pred_tree)
                    ,accuracy_score(y_test, pred_forest)
                    ,accuracy_score(y_test, pred_clf)
                    ,accuracy_score(y_test, pred_SVM)
                    ,accuracy_score(y_test, pred_KNN)
                    ,accuracy_score(y_test, pred_neural_network)]

pred_list_max = pred_list.index(max(pred_list))

if pred_list_max == 0:
    third_party_ML = pred_tree
elif pred_list_max == 1:
    third_party_ML = pred_forest
elif pred_list_max == 2:
    third_party_ML = pred_clf
elif pred_list_max == 3:
    third_party_ML = pred_SVM
elif pred_list_max == 4:
    third_party_ML = pred_KNN
else:
    third_party_ML = pred_neural_network


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Edit Delete Data Parameter

In [9]:
##Edit delete data Matrix:

Phrases = ['Privacy Policy'
           ,'YOU CAN EDIT AND DELETE YOUR DATA ANY TIME'
           ,'If your personally identifiable information changes, or if you no longer desire our service, you may update it or request deletion by contacting us'
           ,'you have a partial ability to edit and delete your personal data'
           ,'you can not edit or delete your data'
           ,'you dont have permissions to edit or delete your data'
           ,'You can choose to delete, change or correct your personal information or access your personal information'
           ,'You have the right to request that we delete certain personal information we have collected from you']

##Matrix creation:
matrix_edit_delete_ML = pd.DataFrame(columns = Phrases)
matrix_edit_delete_ML['Privacy Policy'] = websites
matrix_edit_delete_ML.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_edit_delete_ML:
        matrix_edit_delete_ML.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
# Manual division so we will have all options (1/0) at both groups
X_train = matrix_edit_delete_ML.iloc[:35,:] 
X_test = matrix_edit_delete_ML.iloc[34:,:]
y_train = user_edit_data_train[:35]
y_test = user_edit_data_train[34:]


##Decission Tree Classifaier
pred_tree = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_list=[accuracy_score(y_test, pred_tree)
                    ,accuracy_score(y_test, pred_forest)
                    ,accuracy_score(y_test, pred_clf)
                    ,accuracy_score(y_test, pred_SVM)
                    ,accuracy_score(y_test, pred_KNN)
                    ,accuracy_score(y_test, pred_neural_network)]

pred_list_max = pred_list.index(max(pred_list))

if pred_list_max == 0:
    edit_delete_ML = pred_tree
elif pred_list_max == 1:
    edit_delete_ML = pred_forest
elif pred_list_max == 2:
    edit_delete_ML = pred_clf
elif pred_list_max == 3:
    edit_delete_ML = pred_SVM
elif pred_list_max == 4:
    edit_delete_ML = pred_KNN
else:
    edit_delete_ML = pred_neural_network


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Data Security Parameter

In [50]:
data_security_train_2FA = [1,1,0,0,0,0,0,0,1,0,1,1,1,1,1,0,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0]
data_security_train_GDPR = [0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0]
data_security_train_other = [1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,1,0,0,1,1,1]
#[1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,1,0,0,0,0]
# data_security_train_No = [0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1]
data_security_train_PCI_DSS = [0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

In [11]:
##data_security Matrix - 2FA [1]:
Phrases = ['Privacy Policy'
           ,'we are using 2FA method'
           ,'we are using two factor autentication method'
           ,'we are not using two factor autentication method'
           ,'we are not using 2FA method']

##Matrix creation:
matrix_data_security_ML_2FA = pd.DataFrame(columns = Phrases)
matrix_data_security_ML_2FA['Privacy Policy'] = websites
matrix_data_security_ML_2FA.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_security_ML_2FA:
        matrix_data_security_ML_2FA.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_data_security_ML_2FA.iloc[:35,:] 
X_test = matrix_data_security_ML_2FA.iloc[34:,:]
y_train = data_security_train_2FA[:35]
y_test = data_security_train_2FA[34:]


##Decission Tree Classifaier
pred_tree_2FA = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_2FA = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_2FA = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_2FA = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_2FA = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network_2FA = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_2FA_list=[accuracy_score(y_test, pred_tree_2FA)
                    ,accuracy_score(y_test, pred_forest_2FA)
                    ,accuracy_score(y_test, pred_clf_2FA)
                    ,accuracy_score(y_test, pred_SVM_2FA)
                    ,accuracy_score(y_test, pred_KNN_2FA)
                    ,accuracy_score(y_test, pred_neural_network_2FA)]

pred_2FA_list_max = pred_2FA_list.index(max(pred_2FA_list))

if pred_2FA_list_max == 0:
    pred_2FA = pred_tree_2FA
elif pred_2FA_list_max == 1:
    pred_2FA = pred_forest_2FA
elif pred_2FA_list_max == 2:
    pred_2FA = pred_clf_2FA
elif pred_2FA_list_max == 3:
    pred_2FA = pred_SVM_2FA
elif pred_2FA_list_max == 4:
    pred_2FA = pred_KNN_2FA
else:
    pred_2FA = pred_neural_network_2FA


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [12]:
##data_security Matrix - GDPR [2]:
Phrases = ['Privacy Policy'
           ,'we are follow GDPR'
           ,'we are not follow GDPR'
           ,'our privacy policy is Under General Data Protection Regulation']

##Matrix creation:
matrix_data_security_ML_GDPR = pd.DataFrame(columns = Phrases)
matrix_data_security_ML_GDPR['Privacy Policy'] = websites
matrix_data_security_ML_GDPR.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_security_ML_GDPR:
        matrix_data_security_ML_GDPR.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_data_security_ML_GDPR.iloc[:35,:] 
X_test = matrix_data_security_ML_GDPR.iloc[34:,:]
y_train = data_security_train_GDPR[:35]
y_test = data_security_train_GDPR[34:]


##Decission Tree Classifaier
pred_tree_GDPR = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_GDPR = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_GDPR = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_GDPR = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_GDPR = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network_GDPR = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_GDPR_list=[accuracy_score(y_test, pred_tree_GDPR)
                    ,accuracy_score(y_test, pred_forest_GDPR)
                    ,accuracy_score(y_test, pred_clf_GDPR)
                    ,accuracy_score(y_test, pred_SVM_GDPR)
                    ,accuracy_score(y_test, pred_KNN_GDPR)
                    ,accuracy_score(y_test, pred_neural_network_GDPR)]

pred_GDPR_list_max = pred_GDPR_list.index(max(pred_GDPR_list))

if pred_GDPR_list_max == 0:
    pred_GDPR = pred_tree_GDPR
elif pred_GDPR_list_max == 1:
    pred_GDPR = pred_forest_GDPR
elif pred_GDPR_list_max == 2:
    pred_GDPR = pred_clf_GDPR
elif pred_GDPR_list_max == 3:
    pred_GDPR = pred_SVM_GDPR
elif pred_GDPR_list_max == 4:
    pred_GDPR = pred_KNN_GDPR
else:
    pred_GDPR = pred_neural_network_GDPR


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [13]:
##data_security Matrix - other protocols [3]:
Phrases = ['Privacy Policy'
           ,'we secure your data with other protocols'
           ,'your data is secure by different protocols'
           ,'we do not secure your data']

##Matrix creation:
matrix_data_security_ML_other = pd.DataFrame(columns = Phrases)
matrix_data_security_ML_other['Privacy Policy'] = websites
matrix_data_security_ML_other.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_security_ML_other:
        matrix_data_security_ML_other.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_data_security_ML_other.iloc[:35,:] 
X_test = matrix_data_security_ML_other.iloc[34:,:]
y_train = data_security_train_other[:35]
y_test = data_security_train_other[34:]


##Decission Tree Classifaier
pred_tree_other = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_other = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_other = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_other = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_other = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network_other = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_other_list=[accuracy_score(y_test, pred_tree_other)
                    ,accuracy_score(y_test, pred_forest_other)
                    ,accuracy_score(y_test, pred_clf_other)
                    ,accuracy_score(y_test, pred_SVM_other)
                    ,accuracy_score(y_test, pred_KNN_other)
                    ,accuracy_score(y_test, pred_neural_network_other)]

pred_other_list_max = pred_other_list.index(max(pred_other_list))

if pred_other_list_max == 0:
    pred_other = pred_tree_other
elif pred_other_list_max == 1:
    pred_other = pred_forest_other
elif pred_other_list_max == 2:
    pred_other = pred_clf_other
elif pred_other_list_max == 3:
    pred_other = pred_SVM_other
elif pred_other_list_max == 4:
    pred_other = pred_KNN_other
else:
    pred_other = pred_neural_network_other


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [14]:
# ##data_security Matrix - no secure [4]:
# Phrases = ['Privacy Policy'
#            ,'we secure your data with other protocols'
#            ,'your data is secure by different protocols'
#            ,'we do not secure your data']

# ##Matrix creation:
# matrix_data_security_ML_No = pd.DataFrame(columns = Phrases)
# matrix_data_security_ML_No['Privacy Policy'] = websites
# matrix_data_security_ML_No.set_index('Privacy Policy', inplace=True)

# ##Filling the matrix:
# line = 0
# for j in sequence_to_classify:
#     for i in matrix_data_security_ML_No:
#         matrix_data_security_ML_No.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
#     line += 1

# X_train = matrix_data_security_ML_No.iloc[:35,:] 
# X_test = matrix_data_security_ML_No.iloc[34:,:]
# y_train = data_security_train_No[:35]
# y_test = data_security_train_No[34:]

# ##Decission Tree Classifaier
# pred_tree_No = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

# ##Random Forest Classifaier
# pred_forest_No = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

# ##Logistic Reg Classifier
# pred_clf_No = LogisticRegression().fit(X_train, y_train).predict(X_test)

# ##SVM
# pred_SVM_No = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

# ##KNN
# pred_KNN_No = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

# ##neural_network 
# pred_neural_network_No = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


# #Best ML Model
# pred_No_list=[accuracy_score(y_test, pred_tree_No)
#                     ,accuracy_score(y_test, pred_forest_No)
#                     ,accuracy_score(y_test, pred_clf_No)
#                     ,accuracy_score(y_test, pred_SVM_No)
#                     ,accuracy_score(y_test, pred_KNN_No)
#                     ,accuracy_score(y_test, pred_neural_network_No)]

# pred_No_list_max = pred_No_list.index(max(pred_No_list))

# if pred_No_list_max == 0:
#     pred_No = pred_tree_No
# elif pred_No_list_max == 1:
#     pred_No = pred_forest_No
# elif pred_No_list_max == 2:
#     pred_No = pred_clf_No
# elif pred_No_list_max == 3:
#     pred_No = pred_SVM_No
# elif pred_No_list_max == 4:
#     pred_No = pred_KNN_No
# else:
#     pred_No = pred_neural_network_No


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [15]:
##data_security Matrix - according to PCI DSS [5]:
Phrases = ['Privacy Policy'
           ,'we are follow PCI DSS'
           ,'we are not follow PCI DSS']

##Matrix creation:
matrix_data_security_ML_PCI_DSS = pd.DataFrame(columns = Phrases)
matrix_data_security_ML_PCI_DSS['Privacy Policy'] = websites
matrix_data_security_ML_PCI_DSS.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_security_ML_PCI_DSS:
        matrix_data_security_ML_PCI_DSS.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
# Manual division so we will have all options (1/0) at both groups
X_train = matrix_data_security_ML_PCI_DSS.iloc[:35,:] 
X_test = matrix_data_security_ML_PCI_DSS.iloc[34:,:]
y_train = data_security_train_PCI_DSS[:35]
y_test = data_security_train_PCI_DSS[34:]

##Decission Tree Classifaier
pred_tree_PCI_DSS = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_PCI_DSS = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_PCI_DSS = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_PCI_DSS = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_PCI_DSS = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network_PCI_DSS = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_PCI_DSS_list=[accuracy_score(y_test, pred_tree_PCI_DSS)
                    ,accuracy_score(y_test, pred_forest_PCI_DSS)
                    ,accuracy_score(y_test, pred_clf_PCI_DSS)
                    ,accuracy_score(y_test, pred_SVM_PCI_DSS)
                    ,accuracy_score(y_test, pred_KNN_PCI_DSS)
                    ,accuracy_score(y_test, pred_neural_network_PCI_DSS)]

pred_PCI_DSS_list_max = pred_PCI_DSS_list.index(max(pred_PCI_DSS_list))

if pred_PCI_DSS_list_max == 0:
    pred_PCI_DSS = pred_tree_PCI_DSS
elif pred_PCI_DSS_list_max == 1:
    pred_PCI_DSS = pred_forest_PCI_DSS
elif pred_PCI_DSS_list_max == 2:
    pred_PCI_DSS = pred_clf_PCI_DSS
elif pred_PCI_DSS_list_max == 3:
    pred_PCI_DSS = pred_SVM_PCI_DSS
elif pred_PCI_DSS_list_max == 4:
    pred_PCI_DSS = pred_KNN_PCI_DSS
else:
    pred_PCI_DSS = pred_neural_network_PCI_DSS


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [53]:
data_security_train_pred = []
lst_temp = []
#par = [pred_2FA, pred_GDPR, pred_other, pred_No, pred_PCI_DSS]
par = [pred_2FA, pred_GDPR, pred_other, pred_PCI_DSS]

index = 1
for i in range(0,15):
    for j in par:
        if j[i]==1:
            lst_temp.append(index)
        index += 1
    index =1
    data_security_train_pred.append(lst_temp)
    lst_temp = []
  
index = 0
for i in data_security_train_pred:
    if i == []:
        data_security_train_pred[index] = [3]
    index+=1
    
data_security_train_pred

[[1, 3],
 [1, 3],
 [1],
 [1],
 [3],
 [1],
 [1, 3],
 [3],
 [3],
 [1, 3],
 [1, 3],
 [1],
 [3],
 [3],
 [3]]

### Data Collection Parameter

In [55]:
data_collection_train_personal = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
data_collection_train_media = [1,1,0,0,1,1,0,0,1,1,1,0,0,1,0,1,1,1,0,1,0,1,1,0,0,0,1,1,0,1,1,1,1,1,0,0,1,0,1,1,1,0,0,0,1,0,1,0,0]
data_collection_train_browser = [1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
data_collection_train_voice = [1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,1,0,0]
data_collection_train_payments = [1,1,1,0,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,1,1,1,1]
data_collection_train_location = [1,1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,0,0,1,1,1,1,1,1]
data_collection_train_IP = [1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
data_collection_train_passward = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,1,0,1,0,1,1,0,1,0,1,0]

In [18]:
##data_security Matrix - _personal [1]:
Phrases = ['Privacy Policy'
           ,'we save your name, address, phone number, birth date, or email address'
           ,'we are not collect your name, address, phone number, birth date, and email address'
           ,'we collect your personal data'
           ,'we do not collect your personal data']

##Matrix creation:
matrix_data_collection_ML_personal = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_personal['Privacy Policy'] = websites
matrix_data_collection_ML_personal.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_personal:
        matrix_data_collection_ML_personal.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_personal.iloc[:35,:] 
X_test = matrix_data_collection_ML_personal.iloc[34:,:]
y_train = data_collection_train_personal[:35]
y_test = data_collection_train_personal[34:]

##Decission Tree Classifaier
pred_tree_personal = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_personal = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

# ##Logistic Reg Classifier
# pred_clf_personal = LogisticRegression().fit(X_train, y_train).predict(X_test)

# ##SVM
# pred_SVM_personal = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_personal = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_personal = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_personal_list=[accuracy_score(y_test, pred_tree_personal)
                    ,accuracy_score(y_test, pred_forest_personal)
                    #,accuracy_score(y_test, pred_clf_personal)
                    #,accuracy_score(y_test, pred_SVM_personal)
                    ,accuracy_score(y_test, pred_KNN_personal)
                    ,accuracy_score(y_test, pred_neural_network_personal)]

pred_personal_list_max = pred_personal_list.index(max(pred_personal_list))

if pred_personal_list_max == 0:
    pred_personal = pred_tree_personal
elif pred_personal_list_max == 1:
    pred_personal = pred_forest_personal
# elif pred_personal_list_max == 2:
#     pred_personal = pred_clf_personal
# elif pred_personal_list_max == 3:
#     pred_personal = pred_SVM_personal
elif pred_personal_list_max == 2:
    pred_personal = pred_KNN_personal
else:
    pred_personal = pred_neural_network_personal
    

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [19]:
##data_security Matrix - _media [2]:

Phrases = ['Privacy Policy'
           ,'we save your media'
           ,'we are not collect your media'
           ,'we collect your photos and images'
           ,'we are not saving your photos and images']

##Matrix creation:
matrix_data_collection_ML_media = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_media['Privacy Policy'] = websites
matrix_data_collection_ML_media.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_media:
        matrix_data_collection_ML_media.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
 
X_train = matrix_data_collection_ML_media.iloc[:35,:] 
X_test = matrix_data_collection_ML_media.iloc[34:,:]
y_train = data_collection_train_media[:35]
y_test = data_collection_train_media[34:]

##Decission Tree Classifaier
pred_tree_media = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_media = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_media = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_media = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_media = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_media = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_media_list=[accuracy_score(y_test, pred_tree_media)
                    ,accuracy_score(y_test, pred_forest_media)
                    ,accuracy_score(y_test, pred_clf_media)
                    ,accuracy_score(y_test, pred_SVM_media)
                    ,accuracy_score(y_test, pred_KNN_media)
                    ,accuracy_score(y_test, pred_neural_network_media)]

pred_media_list_max = pred_media_list.index(max(pred_media_list))

if pred_media_list_max == 0:
    pred_media = pred_tree_media
elif pred_media_list_max == 1:
    pred_media = pred_forest_media
elif pred_media_list_max == 2:
    pred_media = pred_clf_media
elif pred_media_list_max == 3:
    pred_media = pred_SVM_media
elif pred_media_list_max == 4:
    pred_media = pred_KNN_media
else:
    pred_media = pred_neural_network_media
    

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [20]:
##data_security Matrix - _browser [3]:
Phrases = ['Privacy Policy'
           ,'we save your browsing history'
           ,'we save your browsing preferences'
           ,'we are not collect your browsing history'
           ,'we are not collect your browsing preferences']

##Matrix creation:
matrix_data_collection_ML_browser = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_browser['Privacy Policy'] = websites
matrix_data_collection_ML_browser.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_browser:
        matrix_data_collection_ML_browser.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_browser.iloc[:35,:] 
X_test = matrix_data_collection_ML_browser.iloc[34:,:]
y_train = data_collection_train_browser[:35]
y_test = data_collection_train_browser[34:]

##Decission Tree Classifaier
pred_tree_browser = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_browser = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_browser = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_browser = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_browser = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_browser = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_browser_list=[accuracy_score(y_test, pred_tree_browser)
                    ,accuracy_score(y_test, pred_forest_browser)
                    ,accuracy_score(y_test, pred_clf_browser)
                    ,accuracy_score(y_test, pred_SVM_browser)
                    ,accuracy_score(y_test, pred_KNN_browser)
                    ,accuracy_score(y_test, pred_neural_network_browser)]

pred_browser_list_max = pred_browser_list.index(max(pred_browser_list))

if pred_browser_list_max == 0:
    pred_browser = pred_tree_browser
elif pred_browser_list_max == 1:
    pred_browser = pred_forest_browser
elif pred_browser_list_max == 2:
    pred_browser = pred_clf_browser
elif pred_browser_list_max == 3:
    pred_browser = pred_SVM_browser
elif pred_browser_list_max == 4:
    pred_browser = pred_KNN_browser
else:
    pred_browser = pred_neural_network_browser
    

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [21]:
##data_security Matrix - data_collection_train_voice [4]:
Phrases = ['Privacy Policy'
           ,'we save your recording'
           ,'we save your audio'
           ,'we are collecting your voice messages'
           ,'we are not collect your audio and recording']

##Matrix creation:
matrix_data_collection_ML_voice = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_voice['Privacy Policy'] = websites
matrix_data_collection_ML_voice.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_voice:
        matrix_data_collection_ML_voice.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_voice.iloc[:35,:] 
X_test = matrix_data_collection_ML_voice.iloc[34:,:]
y_train = data_collection_train_voice[:35]
y_test = data_collection_train_voice[34:]

##Decission Tree Classifaier
pred_tree_voice = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_voice = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_voice = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_voice = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_voice = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network 
pred_neural_network_voice = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_voice_list=[accuracy_score(y_test, pred_tree_voice)
                    ,accuracy_score(y_test, pred_forest_voice)
                    ,accuracy_score(y_test, pred_clf_voice)
                    ,accuracy_score(y_test, pred_SVM_voice)
                    ,accuracy_score(y_test, pred_KNN_voice)
                    ,accuracy_score(y_test, pred_neural_network_voice)]

pred_voice_list_max = pred_voice_list.index(max(pred_voice_list))

if pred_voice_list_max == 0:
    pred_voice = pred_tree_voice
elif pred_voice_list_max == 1:
    pred_voice = pred_forest_voice
elif pred_voice_list_max == 2:
    pred_voice = pred_clf_voice
elif pred_voice_list_max == 3:
    pred_voice = pred_SVM_voice
elif pred_voice_list_max == 4:
    pred_voice = pred_KNN_voice
else:
    pred_voice = pred_neural_network_voice


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [22]:
##data_security Matrix - data_collection_train_payments [5]:
Phrases = ['Privacy Policy'
           ,'we save your payments data'
           ,'we collect your payments details'
           ,'we are not collect your payments data']

##Matrix creation:
matrix_data_collecting_ML_payments = pd.DataFrame(columns = Phrases)
matrix_data_collecting_ML_payments['Privacy Policy'] = websites
matrix_data_collecting_ML_payments.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collecting_ML_payments:
        matrix_data_collecting_ML_payments.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
X_train = matrix_data_collecting_ML_payments.iloc[:35,:] 
X_test = matrix_data_collecting_ML_payments.iloc[34:,:]
y_train = data_collection_train_payments[:35]
y_test = data_collection_train_payments[34:]

##Decission Tree Classifaier
pred_tree_payments = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_payments = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_payments = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_payments = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_payments = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_payments = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)

#Best ML Model
pred_payments_list=[accuracy_score(y_test, pred_tree_payments)
                    ,accuracy_score(y_test, pred_forest_payments)
                    ,accuracy_score(y_test, pred_clf_payments)
                    ,accuracy_score(y_test, pred_SVM_payments)
                    ,accuracy_score(y_test, pred_KNN_payments)
                    ,accuracy_score(y_test, pred_neural_network_payments)]

pred_payments_list_max = pred_payments_list.index(max(pred_payments_list))

if pred_payments_list_max == 0:
    pred_payments = pred_tree_payments
elif pred_payments_list_max == 1:
    pred_payments = pred_forest_payments
elif pred_payments_list_max == 2:
    pred_payments = pred_clf_payments
elif pred_payments_list_max == 3:
    pred_payments = pred_SVM_payments
elif pred_payments_list_max == 4:
    pred_payments = pred_KNN_payments
else:
    pred_payments = pred_neural_network_payments


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [23]:
##data_security Matrix - data_collection_train_location [6]:
Phrases = ['Privacy Policy'
           ,'we save your location'
           ,'we are not collect your location'
           ,'we save your language'
           ,'we are not collect your language']

##Matrix creation:
matrix_data_collection_ML_location = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_location['Privacy Policy'] = websites
matrix_data_collection_ML_location.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_location:
        matrix_data_collection_ML_location.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_location.iloc[:35,:] 
X_test = matrix_data_collection_ML_location.iloc[34:,:]
y_train = data_collection_train_location[:35]
y_test = data_collection_train_location[34:]

##Decission Tree Classifaier
pred_tree_location = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_location = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_location = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_location = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_location = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_location = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)


#Best ML Model
pred_location_list=[accuracy_score(y_test, pred_tree_location)
                    ,accuracy_score(y_test, pred_forest_location)
                    ,accuracy_score(y_test, pred_clf_location)
                    ,accuracy_score(y_test, pred_SVM_location)
                    ,accuracy_score(y_test, pred_KNN_location)
                    ,accuracy_score(y_test, pred_neural_network_location)]

pred_location_list_max = pred_location_list.index(max(pred_location_list))

if pred_location_list_max == 0:
    pred_location = pred_tree_location
elif pred_location_list_max == 1:
    pred_location = pred_forest_location
elif pred_location_list_max == 2:
    pred_location = pred_clf_location
elif pred_location_list_max == 3:
    pred_location = pred_SVM_location
elif pred_location_list_max == 4:
    pred_location = pred_KNN_location
else:
    pred_location = pred_neural_network_location


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [24]:
##data_security Matrix - data_collection_train_IP [7]:
Phrases = ['Privacy Policy'
           ,'we save your IP address'
           ,'we are not collect your browser and device details'
           ,'we save your browser and device details'
           ,'we are not collect your IP address']

##Matrix creation:
matrix_data_collection_ML_IP = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_IP['Privacy Policy'] = websites
matrix_data_collection_ML_IP.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_IP:
        matrix_data_collection_ML_IP.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_IP.iloc[:35,:] 
X_test = matrix_data_collection_ML_IP.iloc[34:,:]
y_train = data_collection_train_IP[:35]
y_test = data_collection_train_IP[34:]

##Decission Tree Classifaier
pred_tree_IP = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_IP = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_IP = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_IP = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_IP = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_IP = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test)

#Best ML Model
pred_IP_list=[accuracy_score(y_test, pred_tree_IP)
                    ,accuracy_score(y_test, pred_forest_IP)
                    ,accuracy_score(y_test, pred_clf_IP)
                    ,accuracy_score(y_test, pred_SVM_IP)
                    ,accuracy_score(y_test, pred_KNN_IP)
                    ,accuracy_score(y_test, pred_neural_network_IP)]

pred_IP_list_max = pred_IP_list.index(max(pred_IP_list))

if pred_IP_list_max == 0:
    pred_IP = pred_tree_IP
elif pred_IP_list_max == 1:
    pred_IP = pred_forest_IP
elif pred_IP_list_max == 2:
    pred_IP = pred_clf_IP
elif pred_IP_list_max == 3:
    pred_IP = pred_SVM_IP
elif pred_IP_list_max == 4:
    pred_IP = pred_KNN_IP
else:
    pred_IP = pred_neural_network_IP

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [25]:
##data_security Matrix - data_collection_train_passward [8]:
Phrases = ['Privacy Policy'
           ,'we save your passwords'
           ,'we are not collect your passwords']

##Matrix creation:
matrix_data_collection_ML_passward = pd.DataFrame(columns = Phrases)
matrix_data_collection_ML_passward['Privacy Policy'] = websites
matrix_data_collection_ML_passward.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_data_collection_ML_passward:
        matrix_data_collection_ML_passward.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1

X_train = matrix_data_collection_ML_passward.iloc[:35,:]
X_test = matrix_data_collection_ML_passward.iloc[34:,:]
y_train = data_collection_train_passward[:35]
y_test = data_collection_train_passward[34:]

##Decission Tree Classifaier
pred_tree_passward = DecisionTreeClassifier().fit(X_train, y_train).predict(X_test)

##Random Forest Classifaier
pred_forest_passward = RandomForestClassifier().fit(X_train, y_train).predict(X_test).round()

##Logistic Reg Classifier
pred_clf_passward = LogisticRegression().fit(X_train, y_train).predict(X_test)

##SVM
pred_SVM_passward = SVC(kernel='linear').fit(X_train, y_train).predict(X_test)

##KNN
pred_KNN_passward = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2).fit(X_train, y_train).predict(X_test)

##neural_network
pred_neural_network_passward = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train).predict(X_test) 

#Best ML Model
pred_passward_list=[accuracy_score(y_test, pred_tree_passward)
                    ,accuracy_score(y_test, pred_forest_passward)
                    ,accuracy_score(y_test, pred_clf_passward)
                    ,accuracy_score(y_test, pred_SVM_passward)
                    ,accuracy_score(y_test, pred_KNN_passward)
                    ,accuracy_score(y_test, pred_neural_network_passward)]

pred_passward_list_max = pred_passward_list.index(max(pred_passward_list))
if pred_passward_list_max == 0:
    pred_passward = pred_tree_passward
elif pred_passward_list_max == 1:
    pred_passward = pred_forest_passward
elif pred_passward_list_max == 2:
    pred_passward = pred_clf_passward
elif pred_passward_list_max == 3:
    pred_passward = pred_SVM_passward
elif pred_passward_list_max == 4:
    pred_passward = pred_KNN_passward
else:
    pred_passward = pred_neural_network_passward

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [26]:
data_collection_train_pred = []
lst_temp = []
par = [pred_personal, pred_media, pred_browser, pred_voice, pred_payments, pred_location, pred_IP, pred_passward]

index = 1
for i in range(0,15):
    for j in par:
        if j[i]==1:
            lst_temp.append(index)
        index += 1
    index =1
    data_collection_train_pred.append(lst_temp)
    lst_temp = []
    
data_collection_train_pred

[[1, 3, 4, 5, 6, 7],
 [1, 3, 5, 6, 7],
 [1, 3, 5, 6, 7],
 [1, 3, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 5, 6, 7],
 [1, 3, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 3, 5, 6, 7],
 [1, 3, 5, 6, 7, 8],
 [1, 3, 5, 6, 7],
 [1, 2, 3, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 3, 4, 5, 6, 7],
 [1, 3, 5, 6, 7]]

## Privacy Policy DB

In [38]:
##Privacy Policy DB:

Phrases = ['Privacy Policy'
           ,'Privacy Policy Change'
           ,'First Party Use'
           ,'Third Party Use'
           ,'Data Security'
           ,'Data Collection'
           ,'Access, Edit, & Deletion Data']

##Matrix creation:
final_matrix = pd.DataFrame(columns = Phrases)
final_matrix['Privacy Policy'] = websites
final_matrix.set_index('Privacy Policy', inplace=True)

#Filling the matrix:
line = 0
index = 0
index_2 = 0
for j in range(0,49):
    if index <= 35:
        final_matrix.loc[websites[line]]['Privacy Policy Change'] = policy_change_train[index] ##policy_change_train_pred_Comparison[index]
        final_matrix.loc[websites[line]]['First Party Use'] = first_party_use_train[index]
        final_matrix.loc[websites[line]]['Third Party Use'] = third_party_train[index]
        final_matrix.loc[websites[line]]['Access, Edit, & Deletion Data'] = user_edit_data_train[index]
        final_matrix.loc[websites[line]]['Data Security'] = data_security_train[index]
        final_matrix.loc[websites[line]]['Data Collection'] = data_collection_train[index]
    else:
        final_matrix.loc[websites[line]]['Privacy Policy Change'] = policy_change_ML[index_2] ##policy_change_train_pred_Comparison[index]
        final_matrix.loc[websites[line]]['First Party Use'] = first_party_ML[index_2]
        final_matrix.loc[websites[line]]['Third Party Use'] = third_party_ML[index_2]
        final_matrix.loc[websites[line]]['Access, Edit, & Deletion Data'] = edit_delete_ML[index_2]
        final_matrix.loc[websites[line]]['Data Security'] = data_security_train_pred[index_2]
        final_matrix.loc[websites[line]]['Data Collection'] = data_collection_train_pred[index_2]
        index_2 += 1
    line += 1
    index += 1
final_matrix

Unnamed: 0_level_0,Privacy Policy Change,First Party Use,Third Party Use,Data Security,Data Collection,"Access, Edit, & Deletion Data"
Privacy Policy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Google,1,1,1,"[1, 3]","[1, 2, 3, 4, 5, 6, 7, 8]",1
Aws,1,1,1,"[1, 3, 4]","[1, 2, 3, 4, 5, 6]",1
AliExpress,1,1,1,[3],"[1, 3, 5, 6, 7]",1
Meta,1,1,1,[2],"[1, 3, 7]",1
TikTok,1,1,1,[3],"[1, 2, 3, 4, 6, 7]",1
YouTube,1,1,1,[3],"[1, 2, 3, 4, 5, 6, 7]",1
Waze,1,1,1,[3],"[1, 4, 5, 6, 7]",1
Wix,1,1,1,"[3, 4]","[1, 3, 5, 6, 7]",1
Bookings,1,1,1,[1],"[1, 2, 3, 5, 6, 7]",1
whatsapp,1,1,1,[3],"[1, 2, 3, 4, 5, 6, 7]",1


### Measures

In [28]:
# #Privacy Policy Parameter:
# index = 0
# TP = 0
# FP = 0
# FN = 0
# TN = 0
# for i in policy_change_train:
#     if policy_change_train_pred_Comparison[index]==1 and policy_change_train[index]==1:
#         TP +=1
#     elif policy_change_train_pred_Comparison[index]==0 and policy_change_train[index]==0:
#         TN +=1
#     elif policy_change_train_pred_Comparison[index]==0 and policy_change_train[index]==1:
#         FN +=1
#     else:
#         FP +=1
#     index +=1

# recall = TP/(TP+FN)
# Precision = TP/(TP+FP)
# Accuracy = (TP+TN)/(TP+FP+FN+TN) 
# F1 = (TP)/(TP+(0.5*(FP+FN)))

# print("recall:" , recall)
# print("Precision:" , Precision)
# print("Accuracy:" , Accuracy)
# print("F1 Score:" , F1)

In [29]:
#Privacy Policy Parameter:
X_test = policy_change_ML[34:]
y_test = policy_change_train[34:]
print(classification_report(y_test, policy_change_ML))

              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.83      1.00      0.91        10

    accuracy                           0.87        15
   macro avg       0.92      0.80      0.83        15
weighted avg       0.89      0.87      0.86        15



In [30]:
#First Party Use Parameter:
X_test = first_party_ML[34:]
y_test = first_party_use_train[34:]
print(classification_report(y_test, first_party_ML))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



In [31]:
#Third Party Transfer Parameter:
X_test = third_party_ML[34:]
y_test = third_party_train[34:]
print(classification_report(y_test, third_party_ML))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.87      1.00      0.93        13

    accuracy                           0.87        15
   macro avg       0.43      0.50      0.46        15
weighted avg       0.75      0.87      0.80        15



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
#Edit & Delete Data Parameter:
X_test = edit_delete_ML[34:]
y_test = user_edit_data_train[34:]
print(classification_report(y_test, edit_delete_ML))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.86      1.00      0.92        12

    accuracy                           0.87        15
   macro avg       0.93      0.67      0.71        15
weighted avg       0.89      0.87      0.84        15



In [54]:
#Data Security Parameter:
index = 0
TP = 0
FP = 0
FN = 0
TN = 0
for i in data_security_train_2FA[34:]:
    if pred_2FA[index]==1 and data_security_train_2FA[index+34]==1:
        TP +=1
    elif pred_2FA[index]==0 and data_security_train_2FA[index+34]==0:
        TN +=1
    elif pred_2FA[index]==0 and data_security_train_2FA[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1

index = 0
for i in data_security_train_GDPR[34:]:
    if pred_GDPR[index]==1 and data_security_train_GDPR[index+34]==1:
        TP +=1
    elif pred_GDPR[index]==0 and data_security_train_GDPR[index+34]==0:
        TN +=1
    elif pred_GDPR[index]==0 and data_security_train_GDPR[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0
for i in data_security_train_other[34:]:
    if pred_other[index]==1 and data_security_train_other[index+34]==1:
        TP +=1
    elif pred_other[index]==0 and data_security_train_other[index+34]==0:
        TN +=1
    elif pred_other[index]==0 and data_security_train_other[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
# index = 0    
# for i in data_security_train_No[34:]:
#     if pred_No[index]==1 and data_security_train_No[index+34]==1:
#         TP +=1
#     elif pred_No[index]==0 and data_security_train_No[index+34]==0:
#         TN +=1
#     elif pred_No[index]==0 and data_security_train_No[index+34]==1:
#         FN +=1
#     else:
#         FP +=1
#     index +=1
    
index = 0    
for i in data_security_train_PCI_DSS[34:]:
    if pred_PCI_DSS[index]==1 and data_security_train_PCI_DSS[index+34]==1:
        TP +=1
    elif pred_PCI_DSS[index]==0 and data_security_train_PCI_DSS[index+34]==0:
        TN +=1
    elif pred_PCI_DSS[index]==0 and data_security_train_PCI_DSS[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
recall = TP/(TP+FN)
Precision = TP/(TP+FP)
Accuracy = (TP+TN)/(TP+FP+FN+TN) 
F1 = (TP)/(TP+(0.5*(FP+FN)))

print("recall:" , recall)
print("Precision:" , Precision)
print("Accuracy:" , Accuracy)
print("F1 Score:" , F1)

recall: 0.5294117647058824
Precision: 0.5294117647058824
Accuracy: 0.7333333333333333
F1 Score: 0.5294117647058824


In [56]:
#Data Collection Parameter:
index = 0
TP = 0
FP = 0
FN = 0
TN = 0
for i in data_collection_train_personal[34:]:
    if pred_personal[index]==1 and data_collection_train_personal[index+34]==1:
        TP +=1
    elif pred_personal[index]==0 and data_collection_train_personal[index+34]==0:
        TN +=1
    elif pred_personal[index]==0 and data_collection_train_personal[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0
for i in data_collection_train_media[34:]:
    if pred_media[index]==1 and data_collection_train_media[index+34]==1:
        TP +=1
    elif pred_media[index]==0 and data_collection_train_media[index+34]==0:
        TN +=1
    elif pred_media[index]==0 and data_collection_train_media[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0
for i in data_collection_train_browser[34:]:
    if pred_browser[index]==1 and data_collection_train_browser[index+34]==1:
        TP +=1
    elif pred_browser[index]==0 and data_collection_train_browser[index+34]==0:
        TN +=1
    elif pred_browser[index]==0 and data_collection_train_browser[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0    
for i in data_collection_train_voice[34:]:
    if pred_voice[index]==1 and data_collection_train_voice[index+34]==1:
        TP +=1
    elif pred_voice[index]==0 and data_collection_train_voice[index+34]==0:
        TN +=1
    elif pred_voice[index]==0 and data_collection_train_voice[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0   
for i in data_collection_train_payments[34:]:
    if pred_payments[index]==1 and data_collection_train_payments[index+34]==1:
        TP +=1
    elif pred_payments[index]==0 and data_collection_train_payments[index+34]==0:
        TN +=1
    elif pred_payments[index]==0 and data_collection_train_payments[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0    
for i in data_collection_train_location[34:]:
    if pred_location[index]==1 and data_collection_train_location[index+34]==1:
        TP +=1
    elif pred_location[index]==0 and data_collection_train_location[index+34]==0:
        TN +=1
    elif pred_location[index]==0 and data_collection_train_location[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0    
for i in data_collection_train_IP[34:]:
    if pred_IP[index]==1 and data_collection_train_IP[index+34]==1:
        TP +=1
    elif pred_IP[index]==0 and data_collection_train_IP[index+34]==0:
        TN +=1
    elif pred_IP[index]==0 and data_collection_train_IP[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
index = 0    
for i in data_collection_train_passward[34:]:
    if pred_passward[index]==1 and data_collection_train_passward[index+34]==1:
        TP +=1
    elif pred_passward[index]==0 and data_collection_train_passward[index+34]==0:
        TN +=1
    elif pred_passward[index]==0 and data_collection_train_passward[index+34]==1:
        FN +=1
    else:
        FP +=1
    index +=1
    
recall = TP/(TP+FN)
Precision = TP/(TP+FP)
Accuracy = (TP+TN)/(TP+FP+FN+TN) 
F1 = (TP)/(TP+(0.5*(FP+FN)))

print("recall:" , recall)
print("Precision:" , Precision)
print("Accuracy:" , Accuracy)
print("F1 Score:" , F1)

recall: 0.8705882352941177
Precision: 0.8409090909090909
Accuracy: 0.7916666666666666
F1 Score: 0.8554913294797688
