In [2]:
import pandas as pd
from pandas import json_normalize
import yaml
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats
from scipy.stats import norm
import statsmodels.api as sm

import nltk

import sys
from collections import defaultdict
from collections import Counter

import ds_utils_callum
import priv_policy_manipulation_functions as priv_pol_funcs

In [3]:
# Load the modelling table; the file is delimited by "ª" and parsed with the Python engine.
modelling_df = pd.read_csv(
    'modelling_df.csv',
    sep="ª",
    engine='python',
)

In [4]:
modelling_df.head()

Unnamed: 0,source_policy_number,policy_type,contains_synthetic,policy_segment_id,sentence_text,Contact_1stParty,Contact_3rdParty,Contact_Address_Book_1stParty,Contact_Address_Book_3rdParty,Contact_City_1stParty,...,Location_Bluetooth_1stParty,Location_Bluetooth_3rdParty,Location_Cell_Tower_1stParty,Location_Cell_Tower_3rdParty,Location_GPS_1stParty,Location_GPS_3rdParty,Location_IP_Address_1stParty,Location_IP_Address_3rdParty,Location_WiFi_1stParty,Location_WiFi_3rdParty
0,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,TEST,False,2,"IP addresses will be collected, along with inf...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,TEST,False,2,The information that our products collect incl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,TEST,False,2,"When you visit our products, our servers autom...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
modelling_df['contains_synthetic'].value_counts()

False    4479
True     3919
Name: contains_synthetic, dtype: int64

# Preprocessing

### Baseline train/test split

In [6]:
modelling_df['policy_type'].value_counts()

TRAINING      4264
TEST          2400
VALIDATION    1734
Name: policy_type, dtype: int64

In [75]:
# Relative frequency of each split instead of raw counts.
modelling_df['policy_type'].value_counts(normalize=True)

TRAINING      0.507740
TEST          0.285782
VALIDATION    0.206478
Name: policy_type, dtype: float64

In [7]:
modelling_df[modelling_df['policy_type'] == "TEST"]

Unnamed: 0,source_policy_number,policy_type,contains_synthetic,policy_segment_id,sentence_text,Contact_1stParty,Contact_3rdParty,Contact_Address_Book_1stParty,Contact_Address_Book_3rdParty,Contact_City_1stParty,...,Location_Bluetooth_1stParty,Location_Bluetooth_3rdParty,Location_Cell_Tower_1stParty,Location_Cell_Tower_3rdParty,Location_GPS_1stParty,Location_GPS_3rdParty,Location_IP_Address_1stParty,Location_IP_Address_3rdParty,Location_WiFi_1stParty,Location_WiFi_3rdParty
0,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,TEST,False,2,"IP addresses will be collected, along with inf...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,TEST,False,2,The information that our products collect incl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,TEST,False,2,"When you visit our products, our servers autom...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,100,TEST,False,53,You can disable pixel tags by changing your br...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2396,100,TEST,False,56,Location Based Services.,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2397,100,TEST,False,56,You may opt-out of location-based services at ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2398,100,TEST,False,59,You represent and warrant that you are the own...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Held-out evaluation rows: every sentence whose policy_type is TEST (copied so later edits do not touch modelling_df).
test_df = modelling_df.loc[modelling_df['policy_type'].eq("TEST")].copy()

In [9]:
# Baseline training rows: the TRAINING and VALIDATION splits combined into one frame.
baseline_train_df = modelling_df.loc[
    modelling_df['policy_type'].isin(["TRAINING", "VALIDATION"])
].copy()

In [10]:
baseline_train_df['policy_type'].value_counts()

TRAINING      4264
VALIDATION    1734
Name: policy_type, dtype: int64

In [11]:
baseline_train_df.shape

(5998, 63)

In [50]:
# Target columns (the practices currently of interest), one column per practice.
baseline_train_target = baseline_train_df.loc[:, [
    'Contact_E_Mail_Address_1stParty',
    'Contact_E_Mail_Address_3rdParty',
    'Contact_Address_Book_1stParty',
    'Contact_Postal_Address_3rdParty',
    'Location_1stParty',
]].copy()

In [51]:
baseline_train_target.shape

(5998, 5)

In [52]:
baseline_train_target.head(3)

Unnamed: 0,Contact_E_Mail_Address_1stParty,Contact_E_Mail_Address_3rdParty,Contact_Address_Book_1stParty,Contact_Postal_Address_3rdParty,Location_1stParty
2400,0,0,0,0,0
2401,1,0,0,0,0
2402,0,0,0,0,0


In [53]:
# First target column, selected by name rather than by position.
y_train_Contact_E_Mail_Address_1stParty = baseline_train_target['Contact_E_Mail_Address_1stParty']

In [54]:
# Second target column, selected by name rather than by position.
y_train_Contact_E_Mail_Address_3rdParty = baseline_train_target['Contact_E_Mail_Address_3rdParty']

In [55]:
# Third target column, selected by name rather than by position.
y_train_Contact_Address_Book_1stParty = baseline_train_target['Contact_Address_Book_1stParty']

In [56]:
# Fourth target column, selected by name rather than by position.
y_train_Contact_Postal_Address_3rdParty = baseline_train_target['Contact_Postal_Address_3rdParty']

In [57]:
# Fifth target column, selected by name rather than by position.
y_train_Location_1stParty = baseline_train_target['Location_1stParty']

In [95]:
# Training label Series, in the same order as the columns of baseline_train_target.
list_of_train_targets = [
    y_train_Contact_E_Mail_Address_1stParty,
    y_train_Contact_E_Mail_Address_3rdParty,
    y_train_Contact_Address_Book_1stParty,
    y_train_Contact_Postal_Address_3rdParty,
    y_train_Location_1stParty,
]

In [58]:
# Same target columns as the training frame, taken from the TEST rows.
test_target = test_df.loc[:, [
    'Contact_E_Mail_Address_1stParty',
    'Contact_E_Mail_Address_3rdParty',
    'Contact_Address_Book_1stParty',
    'Contact_Postal_Address_3rdParty',
    'Location_1stParty',
]].copy()

In [61]:
# One test label Series per practice, selected by column name.
y_test_Contact_E_Mail_Address_1stParty = test_target['Contact_E_Mail_Address_1stParty']
y_test_Contact_E_Mail_Address_3rdParty = test_target['Contact_E_Mail_Address_3rdParty']
y_test_Contact_Address_Book_1stParty = test_target['Contact_Address_Book_1stParty']
y_test_Contact_Postal_Address_3rdParty = test_target['Contact_Postal_Address_3rdParty']
y_test_Location_1stParty = test_target['Location_1stParty']

# This list holds the variable *names* as strings, not the Series themselves.
list_of_test_targets = [
    'y_test_Contact_E_Mail_Address_1stParty',
    'y_test_Contact_E_Mail_Address_3rdParty',
    'y_test_Contact_Address_Book_1stParty',
    'y_test_Contact_Postal_Address_3rdParty',
    'y_test_Location_1stParty',
]

In [98]:
# Test label Series, in the same order as list_of_train_targets.
list_of_target_variables = [
    y_test_Contact_E_Mail_Address_1stParty,
    y_test_Contact_E_Mail_Address_3rdParty,
    y_test_Contact_Address_Book_1stParty,
    y_test_Contact_Postal_Address_3rdParty,
    y_test_Location_1stParty,
]

### Tokenize & Vectorize

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# The NLTK stop-word corpus must be available locally; uncomment the download
# line below the first time this notebook is run on a machine.
# nltk.download('stopwords') # must download the stopwords if not already done so
from nltk.corpus import stopwords 
# English stop-word list, later passed to the vectorizer as stop_words.
nltkstopwords = stopwords.words('english')

In [17]:
# Porter stemmer; the vectorizer's tokenizer calls stemmer.stem on every token.
stemmer = nltk.stem.PorterStemmer()

In [18]:
def _stem_tokens(sentence):
    """Split ``sentence`` on single spaces and return the Porter stem of each piece."""
    return [stemmer.stem(word) for word in sentence.split(' ')]


# Bag-of-words counter using the custom stemming tokenizer and the NLTK stop-word list.
# NOTE(review): the tokens are stemmed but the stop-word list is not, so some stop
# words may slip through in stemmed form — confirm whether that matters here.
bagofwords = CountVectorizer(tokenizer=_stem_tokens, stop_words=nltkstopwords)

In [66]:
# Learn the vocabulary from the training sentences only, then encode both splits with it.
baseline_train_tokens = bagofwords.fit_transform(baseline_train_df['sentence_text'])
test_tokens = bagofwords.transform(test_df['sentence_text'])

# Sanity check: both matrices must have the same number of columns (vocabulary size).
print(baseline_train_tokens.shape, type(baseline_train_tokens), test_tokens.shape, sep='\n')



(5998, 6965)
<class 'scipy.sparse.csr.csr_matrix'>
(2400, 6965)


### Fit logistic regression

In [20]:
from sklearn.linear_model import LogisticRegression 

In [21]:
# Single estimator instance that the following cells refit for each target
# (max_iter set to 2000, multi_class set to 'ovr').
logistic_regression_model = LogisticRegression(max_iter = 2000, multi_class='ovr')

In [32]:
# Fit on the bag-of-words counts against the first target (e-mail address, 1st party).
logistic_regression_model.fit(baseline_train_tokens, y_train_Contact_E_Mail_Address_1stParty) # class 1

LogisticRegression(max_iter=2000, multi_class='ovr')

In [33]:
logistic_regression_model.score(baseline_train_tokens, y_train_Contact_E_Mail_Address_1stParty) # class 1
# logistic_regression_model.score(X_test, y_test)

0.9514838279426475

In [37]:
# Confusion matrix of the fitted model on the *training* sentences (not the test split).
y_predictions = logistic_regression_model.predict(baseline_train_tokens)
# NOTE(review): plot_confusion_matrix is not used in this cell, and I believe it was
# removed from scikit-learn in 1.2, which would make this import fail — confirm the
# installed version and whether any later cell relies on the name.
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
cf_matrix = confusion_matrix(y_train_Contact_E_Mail_Address_1stParty, y_predictions)
# Rows are the actual classes and columns the predicted classes, so the row labels
# "True Negative"/"True Positive" mean "actually negative/positive", not TN/TP counts.
cf_df = pd.DataFrame(
    cf_matrix, columns=["Predicted Negative", "Predicted Positive"], index=["True Negative", "True Positive"])
cf_df

Unnamed: 0,Predicted Negative,Predicted Positive
True Negative,4412,48
True Positive,243,1295


In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_train_Contact_E_Mail_Address_1stParty, y_predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4460
           1       0.96      0.84      0.90      1538

    accuracy                           0.95      5998
   macro avg       0.96      0.92      0.93      5998
weighted avg       0.95      0.95      0.95      5998



In [36]:
logistic_regression_model.coef_

array([[ 0.93831349, -0.18798157, -0.09399079, ...,  0.51225195,
        -0.15180708, -0.10868898]])

**All of the classes are imbalanced, so I'm especially interested in recall, since it is likely to be the weaker metric. Precision matters too, because I don't want the model to report a practice when there isn't one!**

In [69]:
# Predictions for the TEST sentences.
# NOTE(review): this relies on logistic_regression_model still being fit on the
# e-mail-address (1st party) target — confirm no other fit runs before this cell.
y_test_email_1st_preds = logistic_regression_model.predict(test_tokens)

In [70]:
logistic_regression_model.score(test_tokens, y_test_Contact_E_Mail_Address_1stParty)

0.87375

In [71]:
print(classification_report(y_test_Contact_E_Mail_Address_1stParty, y_test_email_1st_preds))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1825
           1       0.80      0.63      0.70       575

    accuracy                           0.87      2400
   macro avg       0.85      0.79      0.81      2400
weighted avg       0.87      0.87      0.87      2400



In [79]:
from sklearn.metrics import recall_score, precision_score
print(f"Email Address Recall: {recall_score(y_test_Contact_E_Mail_Address_1stParty, y_test_email_1st_preds)}")
print(precision_score(y_test_Contact_E_Mail_Address_1stParty, y_test_email_1st_preds))

Email Address Recall: 0.6278260869565218
0.8022222222222222


In [None]:
def run_model_and_evaluate(train_targets=None, test_targets=None, model=None):
    """Fit ``model`` once per target and report test-set precision and recall.

    Each training target is paired with the test target at the same position.
    The model is fit on ``baseline_train_tokens`` and scored on ``test_tokens``.

    Parameters
    ----------
    train_targets : sequence of label Series, optional
        Training labels, one entry per target. Defaults to the notebook's
        ``list_of_train_targets``.
    test_targets : sequence of label Series, optional
        Test labels in the same order as ``train_targets``. Defaults to the
        notebook's ``list_of_target_variables``.
    model : estimator with ``fit``/``predict``, optional
        Defaults to the notebook's ``logistic_regression_model``. The model is
        refit in place, so afterwards it holds the fit for the *last* target.

    Returns
    -------
    pandas.DataFrame
        One row per target, indexed by the test Series' ``name``, with
        ``Precision`` and ``Recall`` columns (unrounded).

    Raises
    ------
    ValueError
        If the two target sequences differ in length.
    """
    if train_targets is None:
        train_targets = list_of_train_targets
    if test_targets is None:
        test_targets = list_of_target_variables
    if model is None:
        model = logistic_regression_model
    if len(train_targets) != len(test_targets):
        raise ValueError("train_targets and test_targets must have the same length")

    names = []
    records = []
    for y_train, y_test in zip(train_targets, test_targets):
        model.fit(baseline_train_tokens, y_train)
        predictions = model.predict(test_tokens)
        names.append(y_test.name)
        records.append({
            'Precision': precision_score(y_test, predictions),
            'Recall': recall_score(y_test, predictions),
        })
    return pd.DataFrame(records, index=names, columns=['Precision', 'Recall'])

In [93]:
list_of_test_targets

['y_test_Contact_E_Mail_Address_1stParty',
 'y_test_Contact_E_Mail_Address_3rdParty',
 'y_test_Contact_Address_Book_1stParty',
 'y_test_Contact_Postal_Address_3rdParty',
 'y_test_Location_1stParty']

In [None]:
def get_predictions(target):
    """Fit the shared logistic regression on ``target`` and predict the test set.

    Parameters
    ----------
    target : array-like of labels
        Training labels aligned with the rows of ``baseline_train_tokens``.

    Returns
    -------
    Whatever ``logistic_regression_model.predict`` returns for ``test_tokens``.

    Notes
    -----
    ``logistic_regression_model`` is refit in place, so any earlier fit on a
    different target is overwritten.
    """
    # Fit on the labels passed in, not on a hard-coded target column.
    logistic_regression_model.fit(baseline_train_tokens, target)

    return logistic_regression_model.predict(test_tokens)

In [97]:
list_of_train_targets[1]

2400    0
2401    0
2402    0
2403    0
2404    0
       ..
8393    0
8394    0
8395    0
8396    0
8397    0
Name: Contact_E_Mail_Address_3rdParty, Length: 5998, dtype: int64

In [108]:
# For each of the five targets: refit on the training labels, predict the test
# sentences, and record test-set precision and recall.
precision_list, recall_list = [], []
for i in range(5):
    y_train_i = list_of_train_targets[i]
    y_test_i = list_of_target_variables[i]
    predics = logistic_regression_model.fit(baseline_train_tokens, y_train_i).predict(test_tokens)
    prec, recs = precision_score(y_test_i, predics), recall_score(y_test_i, predics)
    precision_list.append(prec)
    recall_list.append(recs)

In [121]:
# Round every precision score to three decimals for display.
precision_list = list(map(lambda score: round(score, 3), precision_list))
precision_list

[0.802, 0.706, 0.66, 0.857, 0.796]

In [109]:
# Round every recall score to three decimals for display.
recall_list = list(map(lambda score: round(score, 3), recall_list))
recall_list

[0.628, 0.197, 0.376, 0.162, 0.558]

In [None]:
# Recompute test-set precision for every target.
# precision_list is rebuilt from scratch and recall_list is left untouched, so
# the two lists keep the same length however often this cell is run. Previously
# this cell emptied recall_list and appended to precision_list, which broke the
# precision/recall table built from them.
precision_list = []
for i in range(len(list_of_train_targets)):
    logistic_regression_model.fit(baseline_train_tokens, list_of_train_targets[i])
    predics = logistic_regression_model.predict(test_tokens)
    prec = precision_score(list_of_target_variables[i], predics)
    # Rounded to three decimals to match the rounding applied to the score lists above.
    precision_list.append(round(prec, 3))

In [122]:
# Stack the two score lists into a 2 x n_targets array (row 0 = precision, row 1 = recall).
prec_rec = np.vstack([precision_list, recall_list])

In [123]:
prec_rec.T

array([[0.802, 0.628],
       [0.706, 0.197],
       [0.66 , 0.376],
       [0.857, 0.162],
       [0.796, 0.558]])

In [124]:
# One row per target, labelled with the target column names.
# The labels come from baseline_train_target, whose column order matches the order
# in which the scores were computed; the previously used `list_of_targets_again`
# is not defined in any cell shown here and fails on a fresh kernel.
eval_scores = pd.DataFrame(
    data=prec_rec.T,
    index=list(baseline_train_target.columns),
    columns=['Precision', 'Recall'],
)

In [125]:
eval_scores

Unnamed: 0,Precision,Recall
Contact_E_Mail_Address_1stParty,0.802,0.628
Contact_E_Mail_Address_3rdParty,0.706,0.197
Contact_Address_Book_1stParty,0.66,0.376
Contact_Postal_Address_3rdParty,0.857,0.162
Location_1stParty,0.796,0.558
