In [381]:
import pandas as pd
from pandas import json_normalize
import yaml
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats
from scipy.stats import norm

import sys
from collections import defaultdict
from collections import Counter
import re

import ds_utils_callum
import priv_policy_manipulation_functions as priv_pol_funcs

Story et al. (2019) "Natural Language Processing for Mobile App Privacy Compliance", available from https://usableprivacy.org/publications

# Confirm all cells are strings

In [382]:
# Load the segment-level dataframe that the cleaning steps below operate on.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
clean_segment_annots_df = pd.read_pickle('segment_annots_df.pkl')

In [383]:
def column_all_dtype(dataframe_column, dtype):
    """
    Return True if every element of `dataframe_column` has the type described by `dtype`.

    The check compares str(type(element)) with str(dtype), so `dtype` may be either
    a type object or the string form of one.
    Example inputs for dtype: str, "<class 'numpy.int64'>", "<class 'numpy.bool_'>"

    Elements are read by position, so the result does not depend on the column's
    index labels (e.g. after filtering or sorting). An empty column returns True.
    """
    expected = str(dtype)
    # Prefer .iloc: it is positional and hands back numpy scalars unchanged, whereas
    # plain iteration over a numeric Series yields Python scalars instead.
    elements = dataframe_column.iloc if hasattr(dataframe_column, "iloc") else dataframe_column
    return all(
        str(type(elements[position])) == expected
        for position in range(len(dataframe_column))
    )
# Sanity-check the helper against columns whose element type is already known.
_segment_ids = clean_segment_annots_df['policy_segment_id']
print(column_all_dtype(_segment_ids, "<class 'numpy.int64'>"))  # expected: True
print(column_all_dtype(_segment_ids, str))  # expected: False
print(column_all_dtype(clean_segment_annots_df['contains_synthetic'], "<class 'numpy.bool_'>"))  # expected: True

True
False
True


Confirming all segment text is str:

In [384]:
# True only if every value in the segment_text column is a Python str.
column_all_dtype(clean_segment_annots_df['segment_text'], str)

True

# Normalize Whitespace

Using `new_string = " ".join(old_string.split())`.  The `.split()` function considers a range of forms of whitespace.

I want to check the length of all the text before and after to see any difference made.

In [385]:
# check total length
def total_length(dataframe):
    """
    Report the combined character count of the "segment_text" column.

    Input: Dataframe with a column called "segment_text" whose cells support len().
    Returns: the string "Total length of all segments is <n>".

    The column is iterated by value rather than looked up through the labels
    0..len-1, so the frame may carry any index (filtered, sorted, non-integer).
    """
    total_length_all_segments = sum(len(segment) for segment in dataframe["segment_text"])
    return f"Total length of all segments is {total_length_all_segments}"

In [386]:
def normalize_whitespace(dataframe):
    """
    Collapse whitespace in every cell of the "segment_text" column, in place.

    Each run of whitespace (spaces, tabs, newlines, ...) becomes a single space and
    leading/trailing whitespace is dropped, via " ".join(text.split()).
    For verification, prints the total segment length before and after.

    Input: Dataframe with a column called "segment_text" that contains strings.
    Returns: None; `dataframe` itself is modified.

    The whole column is rewritten in one assignment instead of addressing rows by
    the labels 0..len-1, so the frame's index labels do not matter for this step.
    """

    print(total_length(dataframe))

    # normalize whitespace
    dataframe["segment_text"] = dataframe["segment_text"].map(
        lambda text: " ".join(text.split())
    )

    # verify change
    print(total_length(dataframe))

In [387]:
# Modifies clean_segment_annots_df in place and prints the total length before and after.
normalize_whitespace(clean_segment_annots_df)

Total length of all segments is 5917918
Total length of all segments is 5917918


Suspiciously, nothing changed, so I will verify my function before concluding that the privacy policy segments contain no redundant whitespace.

Verifying the function by testing it on some whitespace.
- create  a dataframe where I have intentionally added whitespace 
- calling the function on this dataframe
- confirming the whitespace is removed

In [388]:
# Work on a copy so the real data is untouched: plant a long run of spaces in the row
# labelled 0, then check that the printed total length drops after normalisation.
text_with_space = 'PRIVACY                                          This.'
verify_whitespace_df = clean_segment_annots_df.copy()
verify_whitespace_df.loc[0, 'segment_text'] = text_with_space
normalize_whitespace(verify_whitespace_df)

Total length of all segments is 5917320
Total length of all segments is 5917279


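I have verified that my function works, so I can conclude that the privacy policy segments contain no redundant whitespace (no leading, trailing, or repeated whitespace characters). Note that comparing total lengths cannot detect a single tab or newline being replaced by a single space, because that substitution leaves the length unchanged.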

# Normalize punctuation

First, let's investigate whether there are any unusual characters that could cause problems in later processing.

I took the below function from this towardsdatascience article [here](https://towardsdatascience.com/text-normalization-7ecc8e084e31).

In [389]:
def _simplify_punctuation(text):
    """
    This function simplifies doubled or more complex punctuation. The exception is '...'.
    """
    
    corrected = str(text)
    corrected = re.sub(r'([!?,;])\1+', r'\1', corrected)
    corrected = re.sub(r'\.{2,}', r'...', corrected)
    return corrected

In [390]:
def remove_duplicate_punctuation(dataframe):
    """
    Apply _simplify_punctuation to every cell of the "segment_text" column, in place.

    For verification, prints the total segment length before and after.
    Input: Dataframe with a column called "segment_text".
    Returns: None; `dataframe` itself is modified.

    The column is rewritten in one assignment rather than row by row through the
    labels 0..len-1, so the frame's index labels do not matter for this step.
    """
    print(total_length(dataframe))
    dataframe["segment_text"] = dataframe["segment_text"].map(_simplify_punctuation)
    print(total_length(dataframe))

In [391]:
# Modifies clean_segment_annots_df in place and prints the total length before and after.
remove_duplicate_punctuation(clean_segment_annots_df)

Total length of all segments is 5917918
Total length of all segments is 5917879


Only a small number of characters were removed as expected.

Further punctuation normalization, such as converting other characters to their standardized English versions (e.g. the opening speech mark “ to ", or the ellipsis … to ...), would be ideal, but the omission of this should not affect the sentence filtering much, and won't have any effect on the tf-idf matrix, because it ignores punctuation.

# Remove non-ASCII characters

This can be done by checking whether each character has a Unicode code point below 128: ASCII characters occupy code points 0–127, while non-ASCII characters are coded at 128 and above.  Checking the Unicode 'code point' is done with `ord(char)`.

In [392]:
def remove_non_ascii(string):
    """
    Return `string` with every character whose code point is 128 or higher dropped.

    Adapted from: https://bobbyhadz.com/blog/python-remove-non-ascii-characters-from-string
    """
    kept_characters = []
    for char in string:
        if ord(char) >= 128:
            continue  # outside the ASCII range
        kept_characters.append(char)
    return ''.join(kept_characters)

# demonstrate function:
for _example in ('a€bñcá', 'a_b^0'):  # expected output: abc, then a_b^0
    print(remove_non_ascii(_example))

abc
a_b^0


In [393]:
def remove_nonASCII_chars(dataframe):
    """
    Apply remove_non_ascii to every cell of the "segment_text" column, in place.

    For verification, prints the total segment length before and after.
    Input: Dataframe with a column called "segment_text" that contains strings.
    Returns: None; `dataframe` itself is modified.

    The column is rewritten in one assignment rather than row by row through the
    labels 0..len-1, so the frame's index labels do not matter for this step.
    """
    print(total_length(dataframe))
    dataframe["segment_text"] = dataframe["segment_text"].map(remove_non_ascii)
    print(total_length(dataframe))

In [394]:
# Modifies clean_segment_annots_df in place and prints the total length before and after.
remove_nonASCII_chars(clean_segment_annots_df)

Total length of all segments is 5917879
Total length of all segments is 5901658


Roughly 16,000 characters were removed, representing nearly 0.3% of all characters.

# Make all policy text lowercase

In [395]:
def convert_to_lowercase(dataframe):
    """
    Lowercase every cell of the "segment_text" column, in place.

    Input: Dataframe with a column called "segment_text" that contains strings.
    Returns: None; `dataframe` itself is modified.

    Uses the vectorised string accessor on the whole column, so rows are not
    addressed through the labels 0..len-1 and any index is supported.
    """
    dataframe["segment_text"] = dataframe["segment_text"].str.lower()

# verify: a one-row frame with mixed-case text should come back fully lowercase
sample_df = pd.DataFrame(["ifiUFIWUNFIijnf"], columns=["segment_text"])
convert_to_lowercase(sample_df)
display(sample_df)

Unnamed: 0,segment_text
0,ifiufiwunfiijnf


In [396]:
# Lowercases clean_segment_annots_df in place, then previews the first three segments.
convert_to_lowercase(clean_segment_annots_df)
clean_segment_annots_df['segment_text'].head(3) # verify

0    privacy policy this privacy policy (hereafter ...
1    1. about our products 1.1 our products offer a...
2    2. the information we collect the information ...
Name: segment_text, dtype: object

It can be seen that the text has been changed to lowercase.

**Save the above result**

In [397]:
# Persist the cleaned frame; it is read back from this file later in the notebook.
clean_segment_annots_df.to_pickle('clean_segment_annots_df.pkl')

# Same pre-processing steps for Annotation Features

When populating the dataframe with crafted features, the dataframe of `annotation features` will be referred to, but it is possible that it is not formatted in the same way, so I will check the format of that too.

I doubt it will have any non-ASCII characters so I will just check which non-lowercase letters there are.

In [398]:
# Load the list of all the crafted features
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
annotation_features = pd.read_pickle('annotation_features.pkl')
# Flatten the per-row feature lists into a single list (repeated features are kept).
list_all_crafted_features = [feature for row in annotation_features['features'] for feature in row]
len(list_all_crafted_features) # verify – should be 579 crafted features

579

In [399]:
# Collect, in first-seen order, every character that is not a lowercase ASCII letter.
lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
list_of_chars = []
for ft in list_all_crafted_features:
    for char in ft:
        if char in lowercase_letters or char in list_of_chars:
            continue  # expected letter, or already recorded
        list_of_chars.append(char)
print(list_of_chars)

[' ', '.', ',', '-', '\xa0', '/', 'S', 'N', 'U', 'T', 'P', 'A', '(', ')', 'I', "'"]


By inspecting this list I can see that the only characters that I don't expect are the uppercase letters and "\xa0", which represents a type of whitespace.

I also noticed while manually browsing the features that Bluetooth was not listed because it had been incorrectly entered as 'bluethooth', so I will correct that now too.

In [400]:
# Re-flatten the feature lists and confirm the misspelt feature is present before fixing it.
list_all_crafted_features = [feature for row in annotation_features['features'] for feature in row]
"bluethooth" in list_all_crafted_features

True

In [401]:
def _clean_crafted_feature(feature):
    """Lowercase a feature, collapse its whitespace (str.split also splits on non-breaking spaces), then fix the 'bluethooth' typo."""
    normalised = " ".join(feature.lower().split())
    # Correct the typo after normalising so capitalised or padded variants are caught too.
    return "bluetooth" if normalised == "bluethooth" else normalised


# Rewrite every row's feature list in one pass. The column is replaced as a whole,
# so rows are not addressed through the labels 0..len-1 and any index is supported.
annotation_features['features'] = annotation_features['features'].map(
    lambda crafted_feature_list: [_clean_crafted_feature(feature) for feature in crafted_feature_list]
)

In [402]:
# Re-flatten after cleaning; the misspelt feature should no longer be present.
list_all_crafted_features = [feature for row in annotation_features['features'] for feature in row]
"bluethooth" in list_all_crafted_features

False

In [403]:
print(len(list_all_crafted_features)) # verify – should be 579 crafted features
# dict.fromkeys keeps one entry per character in first-seen order.
list_of_chars = list(dict.fromkeys(
    character
    for ft in list_all_crafted_features
    for character in ft
    if character not in "abcdefghijklmnopqrstuvwxyz"
))
list_of_chars

579


[' ', '.', ',', '-', '/', '(', ')', "'"]

All problematic characters are now removed so this list of features can be used for modelling.

In [404]:
# Save the cleaned features, then read the file straight back to confirm the round trip:
# first the shapes are compared, then the full contents with DataFrame.equals.
annotation_features.to_pickle('clean_annotation_features.pkl')
confirm_save_0 = pd.read_pickle('clean_annotation_features.pkl')
print(annotation_features.shape == confirm_save_0.shape)
print(confirm_save_0.equals(annotation_features))

True
True


---

# Append annotation features to dataframe

The next steps are to:
- 1. Append each feature as a column to the dataframe with the annotation as a prefix
- 2. Populate the columns using the segment

Then I can move to modelling.

I already have a function to help with 1 called `add_empty_annotation_columns`.  I just need to put the new features into a list.

First I want to check whether any of the features are the same.

In [405]:
# Reload both cleaned frames from disk so this section can run without re-running the cleaning above.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
clean_annotation_features = pd.read_pickle('clean_annotation_features.pkl')
clean_segment_annots_df = pd.read_pickle('clean_segment_annots_df.pkl')

In [406]:
# Flatten the per-row feature lists into a single list (repeated features are kept).
list_all_crafted_features = [feature for row in clean_annotation_features['features'] for feature in row]

In [407]:
# Record every repeat occurrence of a feature (each appearance after its first).
all_features = []
duplicate_features = []
seen_features = set()  # constant-time membership test instead of rescanning the all_features list
for feature in list_all_crafted_features:
    if feature in seen_features:
        duplicate_features.append(feature)
    seen_features.add(feature)
    all_features.append(feature)
len(duplicate_features)

103

Hmm, a lot of the features are exactly the same.  I'm not sure that this will be an issue though – the only step performed while the dataframe has duplicate column names is populating it, which I can do by looping through the columns one at a time – but it is worth keeping in mind.  Ideally I would clean this up, as some pandas functions are unavailable if the dataframe has several columns with the same name.

## Add crafted features columns to df

In [408]:
# Print the feature count and the frame's shape before the columns are added, for comparison
# with the shape reported after the call.
print(len(list_all_crafted_features))
print(clean_segment_annots_df.shape)
# NOTE(review): add_empty_annotation_columns is defined in priv_policy_manipulation_functions;
# presumably it returns the frame with one zero-filled column per feature — confirm there.
crafted_features_df = priv_pol_funcs.add_empty_annotation_columns(clean_segment_annots_df, list_all_crafted_features) 

579
(15543, 41)
The shape of the returned dataframe is (15543, 620)


In [409]:
# Preview from position 40 onwards: the input frame had 41 columns, so position 40 is its
# last column and the newly added feature columns start at position 41.
crafted_features_df.iloc[:,40:].head(2)

Unnamed: 0,NOT_PERFORMED,contact info,contact details,contact data,"e.g., your name",contact you,your contact,"identify, contact",identifying information,"your name, address, and e-mail address",...,never be acquired,never be viewed,never be located,never be asked,never be utilized,never be requested,never be transmitted,never be communicated,nor do we collect,does not tell us
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can see that the crafted features are all columns from column 41 to the end.  Now let's remove the duplicate feature columns before populating them all. I expect 103 columns to be removed to bring the `crafted_features_df` down to 517 columns.

In [410]:
# columns.duplicated() flags every repeat of a name after its first occurrence, so the first
# column of each name is the one that is kept.
crafted_features_df = crafted_features_df.loc[:,~crafted_features_df.columns.duplicated()] # remove columns with duplicate names
crafted_features_df.shape

(15543, 517)

To populate the crafted features columns, I will:

- Take the column name for each crafted feature
- take the segment text for each row
- if column name in segment text: put 1.

**This may take some time to run!!**

In [411]:
%%time

all_rows = range(len(crafted_features_df)) # index of rows to loop through

for column_number in range(41, 517): # Looping through each column with a feature

    column_name = crafted_features_df.columns[column_number] # for that column feature

    for row in all_rows: # and for every row
        if column_name in crafted_features_df.at[row, "segment_text"]: # if the segment has that feature
            crafted_features_df.at[row, column_name] = 1 # make the value for that feature on that row equal 1
    
    print(f"Processing {column_number}/517", end="\r")

CPU times: user 28.1 s, sys: 188 ms, total: 28.3 s
Wall time: 28.5 s


In [412]:
# looking at some of the results to verify
# Column-wise sums over the feature columns (position 41 onwards); a sum of 0 means the
# feature was never marked on any row.
summations = crafted_features_df.iloc[:,41:].sum()
print(f"{(summations==0).sum()} features have not been populated")

133 features have not been populated


This seems like a lot of empty columns, so I manually looked through the results, as well as checking the source text, and found that most of the crafted feature columns that haven't been populated are generally:
- unusual ways of typing a phrase (example: 'post code' instead of postcode)
- specific phrases for uncommon data practices (example: 'exact device location')
- negative phrases (example: never be requested)

Overall this looks roughly correct so I will use it for modelling.

## Saving the df

As before, to make it faster to load this dataframe in this notebook and others, I will save this dataframe as a pickle file.  This allows the code below to be run without waiting for the code above.

In [413]:
# Persist the populated feature frame so it can be reloaded without repeating the population step.
crafted_features_df.to_pickle('crafted_features_df.pkl')

Verifying that the file was correctly saved and can be imported properly:

In [414]:
# Read the file straight back and compare with the in-memory frame: shapes first, then
# full contents with DataFrame.equals.
confirm_save_5 = pd.read_pickle('crafted_features_df.pkl')
print(crafted_features_df.shape == confirm_save_5.shape)
print(confirm_save_5.equals(crafted_features_df))

True
True
