In [1]:
import numpy as np
import pandas as pd
import math
import statistics
import matplotlib.pyplot as plt
from transformers import pipeline
from sklearn.model_selection import train_test_split, cross_val_score,KFold, cross_val_predict, GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier


In [2]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

## Download All Policies - Train Set

In [3]:
##Privacy Policies download:
websites = ['Google','Aws','AliExpress','Meta','TikTok','YouTube','Waze','Wix','Bookings','whatsapp'
            ,'apple','wolt','Visa','Mastercard','AirBNB','uber','Spotify','samsung','Wordpress','instagram'
            ,'MacDonalds','FDA','Oracle','Zara','cocacola','Xiaomi','Nasdaq','Walmart'
           ,'AirCanada','Lufthansa','shopify','Netflix','adobe','Starbucks','Shoppers','Decathlon','waltdisney'
            ,'AmericanEagle','lululemon','SAP','JetBrains','MySQLCode','Cadens','EpicGames'
            ,'unitedHealthGroup','Slack','SalesForce','JPMorgan','JohnsonAndJohnson']

sequence_to_classify = []

for i in websites:
    try:
        with open((i+".txt"), "r") as f:
            i = f.readlines()
            sequence_to_classify.append(i)
    except:
        with open((i+".txt"), "r", encoding='cp1252') as f:
            i = f.readlines()
            sequence_to_classify.append(i)

### Train Set Vectors:

In [4]:
first_party_use_train = [1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]


### First Party Transfer

In [5]:
##first_party_use Matrix:

Phrases = ['Privacy Policy'
           ,'the user information is being saved'
           ,'We use your personal information'
           ,'We collect and use your personal information'
           ,'We will use the information we collect'
           ,'We will use the information you provide']

##Matrix creation:
matrix_first_party_ML = pd.DataFrame(columns = Phrases)
matrix_first_party_ML['Privacy Policy'] = websites
matrix_first_party_ML.set_index('Privacy Policy', inplace=True)

##Filling the matrix:
line = 0
for j in sequence_to_classify:
    for i in matrix_first_party_ML:
        matrix_first_party_ML.loc[websites[line],i] = classifier(str(j), str(i))["scores"][0]
    line += 1
    
matrix_first_party_ML

Unnamed: 0_level_0,the user information is being saved,We use your personal information,We collect and use your personal information,We will use the information we collect,We will use the information you provide
Privacy Policy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Google,0.973002,0.98327,0.991912,0.982043,0.97709
Aws,0.929998,0.95334,0.966737,0.938838,0.940077
AliExpress,0.7962,0.859369,0.950822,0.904863,0.898119
Meta,0.969938,0.982271,0.989877,0.980696,0.978814
TikTok,0.877115,0.913555,0.951839,0.92532,0.918506
YouTube,0.951257,0.972655,0.972255,0.947119,0.966858
Waze,0.911959,0.923871,0.960203,0.947474,0.956882
Wix,0.919279,0.978406,0.987691,0.962101,0.946619
Bookings,0.981227,0.989663,0.995268,0.991088,0.988813
whatsapp,0.881537,0.887173,0.866845,0.809817,0.844999


### Test The Model - ML

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(matrix_policy_change, policy_change_train
#                                                     , test_size=0.5, random_state=42)

# Manual division so we will have all options (1/0) at both groups
X_train = matrix_first_party_ML.iloc[:35,:] 
X_test = matrix_first_party_ML.iloc[34:,:]
y_train = first_party_use_train[:35]
y_test = first_party_use_train[34:]

##Decission Tree Classifaier
regressor_tree = DecisionTreeClassifier()
regressor_tree = regressor_tree.fit(X_train, y_train)
pred_tree = regressor_tree.predict(X_test)
print('Decission Tree Classifaier:', pred_tree)
print(classification_report(y_test, pred_tree))

##Random Forest Classifaier
regressor_forest = RandomForestClassifier()
regressor_forest = regressor_forest.fit(X_train, y_train)
pred_forest = regressor_forest.predict(X_test).round()
print('Random Forest Classifaier:', pred_forest)
print(classification_report(y_test, pred_forest))

##Logistic Reg Classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
print('Logistic Reg Classifier:', pred_clf)
print(classification_report(y_test, pred_clf))

##SVM
SVM = SVC(kernel='linear')
SVM = SVM.fit(X_train, y_train)
pred_SVM = SVM.predict(X_test)
print('SVM:', pred_SVM)
print(classification_report(y_test, pred_SVM))

##KNN
KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
KNN = KNN.fit(X_train, y_train)
pred_KNN = KNN.predict(X_test)
print('KNN:', pred_KNN)
print(classification_report(y_test, pred_KNN))

##neural_network
neural_network = MLPClassifier(random_state=1, max_iter=300)
neural_network = neural_network.fit(X_train, y_train)
pred_neural_network = neural_network.predict(X_test)
print('neural_network:', pred_neural_network)
print(classification_report(y_test, pred_neural_network))

Decission Tree Classifaier: [1 1 1 0 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.93      0.97        15

    accuracy                           0.93        15
   macro avg       0.50      0.47      0.48        15
weighted avg       1.00      0.93      0.97        15

Random Forest Classifaier: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Logistic Reg Classifier: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weight

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

neural_network: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



In [7]:
##first_party use Matrix:

##Matrix creation:
matrix_first_party_Comparison = pd.DataFrame(columns = ['Phrase',
                                               'Google','Aws','AliExpress','Meta','TikTok','YouTube','Waze','Wix','Bookings','whatsapp'
            ,'apple','wolt','Visa','Mastercard','AirBNB','uber','Spotify','samsung','Wordpress','instagram'
            ,'MacDonalds','FDA','Oracle','Zara','cocacola','Xiaomi','Nasdaq','Walmart'
           ,'AirCanada','Lufthansa','shopify','Netflix','adobe','Starbucks','Shoppers','Decathlon','waltdisney'
            ,'AmericanEagle','lululemon','SAP','JetBrains','MySQLCode','Cadens','EpicGames'
            ,'unitedHealthGroup','Slack','SalesForce','JPMorgan','JohnsonAndJohnson'])
matrix_first_party_Comparison['Phrase'] = ['First Party Use']

##Filling the matrix:
websites_index = 0
line = 0
pred = 0
first_party_train_pred_Comparison = []
c = ['We use your personal information','We are not using your personal information']

for j in sequence_to_classify:
    a = classifier(str(j),c)
    if a["labels"][0]==c[0] :#and a["scores"][0]>0.6:
        pred = 1
    else:
        pred = 0  
    matrix_first_party_Comparison[websites[websites_index]].loc[line] = pred
    first_party_train_pred_Comparison.append(pred)
    websites_index += 1
    pred = 0

first_party_train_pred_Comparison

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [8]:
index = 0
TP = 0
FP = 0
FN = 0
TN = 0
for i in first_party_use_train:
    if first_party_train_pred_Comparison[index]==1 and first_party_use_train[index]==1:
        TP +=1
    elif first_party_train_pred_Comparison[index]==0 and first_party_use_train[index]==0:
        TN +=1
    elif first_party_train_pred_Comparison[index]==0 and first_party_use_train[index]==1:
        FN +=1
    else:
        FP +=1
    index +=1

recall = TP/(TP+FN)
specificity = TN/(TN+FP)
Precision = TP/(TP+FP)
Accuracy = (TP+TN)/(TP+FP+FN+TN) 
F1 = (TP)/(TP+(0.5*(FP+FN)))

print("recall:" , recall)
print("specificity:" , specificity)
print("Precision:" , Precision)
print("Accuracy:" , Accuracy)
print("F1 Score:" , F1)

recall: 1.0
specificity: 0.0
Precision: 0.9795918367346939
Accuracy: 0.9795918367346939
F1 Score: 0.9896907216494846


In [9]:
##Privacy Policy Change Matrix:

##Matrix creation:
matrix_first_party_Median = matrix_first_party_ML.transpose()

#add the avg and median rows:
matrix_first_party_Median.loc['Median'] =  matrix_first_party_Median.median()
matrix_first_party_Median.loc['Median']['Privacy Policy'] = 'Median'

## Print the matrix:
matrix_first_party_Median

first_party_train_pred_Median = []
websites_index = 0

for i in matrix_first_party_Median:
    num = matrix_first_party_Median[websites[websites_index]].loc['Median']
    if num >0.65:
        first_party_train_pred_Median.append(1)
    else:
        first_party_train_pred_Median.append(0)
    websites_index += 1
    
first_party_train_pred_Median

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [10]:
index = 0
TP = 0
FP = 0
FN = 0
TN = 0

for i in first_party_use_train:
    if first_party_train_pred_Median[index]==1 and first_party_use_train[index]==1:
        TP +=1
    elif first_party_train_pred_Median[index]==0 and first_party_use_train[index]==0:
        TN +=1
    elif first_party_train_pred_Median[index]==0 and first_party_use_train[index]==1:
        FN +=1
    else:
        FP +=1
    index +=1

recall = TP/(TP+FN)
Precision = TP/(TP+FP)
Accuracy = (TP+TN)/(TP+FP+FN+TN) 
F1 = (TP)/(TP+(0.5*(FP+FN)))

print("recall:" , recall)
print("Precision:" , Precision)
print("Accuracy:" , Accuracy)
print("F1 Score:" , F1)

recall: 1.0
Precision: 0.9795918367346939
Accuracy: 0.9795918367346939
F1 Score: 0.9896907216494846
