In [1]:
import pandas as pd
from pandas import json_normalize
import yaml
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats
from scipy.stats import norm
import statsmodels.api as sm

import nltk

import sys
from collections import defaultdict
from collections import Counter

import ds_utils_callum
import priv_policy_manipulation_functions as priv_pol_funcs

In [2]:
modelling_df = pd.read_csv('modelling_df.csv', sep = "ª", engine='python')

In [3]:
modelling_df.head()

Unnamed: 0,source_policy_number,policy_type,contains_synthetic,policy_segment_id,sentence_text,Contact_1stParty,Contact_3rdParty,Contact_Address_Book_1stParty,Contact_Address_Book_3rdParty,Contact_City_1stParty,...,Location_Bluetooth_1stParty,Location_Bluetooth_3rdParty,Location_Cell_Tower_1stParty,Location_Cell_Tower_3rdParty,Location_GPS_1stParty,Location_GPS_3rdParty,Location_IP_Address_1stParty,Location_IP_Address_3rdParty,Location_WiFi_1stParty,Location_WiFi_3rdParty
0,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,TEST,False,2,"IP addresses will be collected, along with inf...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,TEST,False,2,The information that our products collect incl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,TEST,False,2,"When you visit our products, our servers autom...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,TEST,False,2,"IP ADDRESS, COOKIES, AND WEB BEACONS",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
modelling_df['contains_synthetic'].value_counts()

False    4479
True     3919
Name: contains_synthetic, dtype: int64

# Preprocessing

### Baseline train/test split

In [29]:
modelling_df['policy_type'].value_counts()

TRAINING      4264
TEST          2400
VALIDATION    1734
Name: policy_type, dtype: int64

In [15]:
modelling_df[modelling_df['policy_type'] == "TEST"]

(2400, 63)

In [18]:
test_df = modelling_df[modelling_df['policy_type'] == "TEST"].copy()

In [34]:
baseline_train_df = modelling_df[
    (modelling_df['policy_type'] == "TRAINING") | (modelling_df['policy_type'] == "VALIDATION")].copy()

In [35]:
baseline_train_df['policy_type'].value_counts()

TRAINING      4264
VALIDATION    1734
Name: policy_type, dtype: int64

In [44]:
baseline_train_df.shape

(5998, 63)

In [41]:
baseline_target = baseline_train_df[
    ['Contact_E_Mail_Address_1stParty', 'Contact_E_Mail_Address_3rdParty', 'Contact_Address_Book_1stParty', 
     'Contact_Postal_Address_3rdParty', 'Location_1stParty']
].copy() # selecting the current target columns, aka, practices, of interest

In [42]:
baseline_target.shape

(5998, 5)

In [43]:
baseline_target.head(3)

Unnamed: 0,Contact_E_Mail_Address_1stParty,Contact_E_Mail_Address_3rdParty,Contact_Address_Book_1stParty,Contact_Postal_Address_3rdParty,Location_1stParty
2400,0,0,0,0,0
2401,1,0,0,0,0
2402,0,0,0,0,0


### Tokenize & Vectorize

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# nltk.download('stopwords') # must download the stopwords if not already done so
from nltk.corpus import stopwords 
nltkstopwords = stopwords.words('english')

In [9]:
stemmer = nltk.stem.PorterStemmer()

In [12]:
bagofwords = CountVectorizer(stop_words=nltkstopwords,
                            tokenizer=lambda sentence: [ stemmer.stem(word) for word in sentence.split(' ') ])

In [36]:
bagofwords.fit(baseline_train_df['sentence_text'])
baseline_train_tokens = bagofwords.transform(baseline_train_df['sentence_text']) 
print(baseline_train_tokens.shape)
print(type(baseline_train_tokens))



(5998, 6965)
<class 'scipy.sparse.csr.csr_matrix'>


### Fit logistic regression

In [37]:
from sklearn.linear_model import LogisticRegression 

In [46]:
logistic_regression_model = LogisticRegression(max_iter = 2000, multi_class='ovr')

In [51]:
logistic_regression_model.fit(baseline_train_tokens, baseline_target.iloc[:,0]) # class 1

LogisticRegression(max_iter=2000, multi_class='ovr')

In [52]:
logistic_regression_model.score(baseline_train_tokens, baseline_target.iloc[:,0]) # class 1
# logistic_regression_model.score(X_test, y_test)

0.9514838279426475

In [56]:
y_predictions = logistic_regression_model.predict(baseline_train_tokens)
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
cf_matrix = confusion_matrix(baseline_target.iloc[:,0], y_predictions)
cf_df = pd.DataFrame(
    cf_matrix, columns=["Predicted Positive", "Predicted Negative"], index=["True Positive", "True Negative"])
cf_df

Unnamed: 0,Predicted Positive,Predicted Negative
True Positive,4412,48
True Negative,243,1295


In [57]:
from sklearn.metrics import classification_report
print(classification_report(baseline_target.iloc[:,0], y_predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4460
           1       0.96      0.84      0.90      1538

    accuracy                           0.95      5998
   macro avg       0.96      0.92      0.93      5998
weighted avg       0.95      0.95      0.95      5998



In [58]:
logistic_regression_model.coef_

array([[ 0.93831349, -0.18798157, -0.09399079, ...,  0.51225195,
        -0.15180708, -0.10868898]])