In [162]:
import pandas as pd
from pandas import json_normalize
import yaml
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats
from scipy.stats import norm

import sys
from collections import defaultdict
from collections import Counter
import re

import ds_utils_callum
import priv_policy_manipulation_functions as priv_pol_funcs

Story et al. (2019) "Natural Language Processing for Mobile App Privacy Compliance", available from https://usableprivacy.org/publications

# Confirm all cells are strings

# Read in a different df and name it something else

In [163]:
# Load the pickled dataframe of policy segments (the cells below use its
# 'policy_segment_id', 'contains_synthetic' and 'segment_text' columns).
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
crafted_features_df = pd.read_pickle('crafted_features_df.pkl')

In [164]:
def column_all_dtype(dataframe_column, dtype):
    """
    Check whether every value in a column has exactly the given type.

    The comparison is made on the string form of each value's type, so `dtype`
    may be either a type object or the string that `str(type(value))` produces.
    Example inputs for dtype: str, "<class 'numpy.int64'>", "<class 'numpy.bool_'>"

    Input: a pandas Series (any index) or a plain indexable sequence such as a list.
    Returns: True if every value matches (also for an empty input), otherwise False.
    """
    expected_type_name = str(dtype)
    # Read values by position: indexing a Series with [] looks up *labels*, which
    # raises KeyError as soon as the index is not exactly 0..n-1 (e.g. after
    # filtering or sorting). Plain sequences have no .iloc and are indexed directly.
    by_position = getattr(dataframe_column, "iloc", dataframe_column)
    return all(
        str(type(by_position[position])) == expected_type_name
        for position in range(len(dataframe_column))
    )
# Validation on columns of known type: each call's expected result is noted beside it.
# NOTE(review): the type is matched by its string form, and the printed name of
# numpy's bool scalar type may differ between numpy versions — confirm if the
# last check prints False after an upgrade.
print(column_all_dtype(crafted_features_df['policy_segment_id'], "<class 'numpy.int64'>") ) # should return True
print(column_all_dtype(crafted_features_df['policy_segment_id'], str) ) # should return False
print(column_all_dtype(crafted_features_df['contains_synthetic'], "<class 'numpy.bool_'>") ) # should return True

True
False
True


Confirming all segment text is str:

In [165]:
# The cleaning steps below call str methods (.split(), .lower()) on every cell
# of this column, so check that no other type is present.
column_all_dtype(crafted_features_df['segment_text'], str)

True

# Normalize Whitespace

Using `new_string = " ".join(old_string.split())`.  The `.split()` function considers a range of forms of whitespace.

I want to check the length of all the text before and after to see any difference made.

In [166]:
# check total length
def total_length(dataframe):
    """
    Report the combined length of every entry in the "segment_text" column.

    Input: Dataframe with a column called "segment_text" whose cells support len()
    (strings in this notebook).
    Returns: the message "Total length of all segments is <n>" as a string.
    """
    # Iterate over the column's values rather than looking rows up by the labels
    # 0..n-1, so the dataframe may carry any index (e.g. after filtering rows).
    total_length_all_segments = sum(len(segment) for segment in dataframe["segment_text"])
    return f"Total length of all segments is {total_length_all_segments}"

In [167]:
def normalize_whitespace(dataframe):
    """
    Collapse whitespace in every cell of the "segment_text" column, in place.

    Each run of whitespace (spaces, tabs, newlines, ...) is replaced by a single
    space, and leading/trailing whitespace is dropped. Whitespace is normalized,
    not removed entirely.
    For verification, prints the total segment length before and after.

    Input: Dataframe with a column called "segment_text" that contains the strings to normalize.
    Returns: None; the dataframe passed in is modified.
    """

    print(total_length(dataframe))

    # str.split() with no argument splits on any run of whitespace and discards
    # empty pieces, so re-joining with single spaces normalizes the text.
    # Mapping over the column, rather than addressing rows by the labels 0..n-1,
    # keeps this step independent of the dataframe's index.
    dataframe["segment_text"] = dataframe["segment_text"].map(
        lambda segment: " ".join(segment.split())
    )

    # verify change
    print(total_length(dataframe))

In [168]:
# Modifies crafted_features_df in place; the two printed totals are the text
# length before and after normalization.
normalize_whitespace(crafted_features_df)

Total length of all segments is 5917918
Total length of all segments is 5917918


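Suspiciously, nothing changed, so I will verify my function before concluding that the privacy policy segments contain no leading, trailing, or repeated whitespace. (Note that a length comparison cannot reveal a single tab or newline being replaced by a space, because that swap leaves the length unchanged.)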

Verifying the function by testing it on some whitespace.
- create a dataframe to which I have intentionally added extra whitespace
- call the function on this dataframe
- confirm that the extra whitespace is removed

In [169]:
# Test case: a segment containing a long run of spaces between two words.
text_with_space = 'PRIVACY                                          This.'
# Work on a copy so the real data is left untouched by the test.
verify_whitespace_df = crafted_features_df.copy()
verify_whitespace_df.loc[0, 'segment_text'] = text_with_space
# If the function works, the second printed total is smaller than the first.
normalize_whitespace(verify_whitespace_df)

Total length of all segments is 5917320
Total length of all segments is 5917279


I have verified that my function works, and so I can conclude that the privacy policy segments contained no leading, trailing, or repeated whitespace to begin with.

# Normalize punctuation

First, let's investigate whether there are any unusual punctuation patterns that could cause problems later.

I took the below function from this towardsdatascience article [here](https://towardsdatascience.com/text-normalization-7ecc8e084e31).

In [171]:
def _simplify_punctuation(text):
    """
    This function simplifies doubled or more complex punctuation. The exception is '...'.
    """
    
    corrected = str(text)
    corrected = re.sub(r'([!?,;])\1+', r'\1', corrected)
    corrected = re.sub(r'\.{2,}', r'...', corrected)
    return corrected

In [172]:
def remove_duplicate_punctuation(dataframe):
    """
    Apply _simplify_punctuation to every cell of the "segment_text" column, in place.
    For verification, prints the total segment length before and after.

    Input: Dataframe with a column called "segment_text".
    Returns: None; the dataframe passed in is modified.
    """
    print(total_length(dataframe))
    # Mapping over the column, rather than addressing rows by the labels 0..n-1,
    # keeps this step independent of the dataframe's index.
    dataframe["segment_text"] = dataframe["segment_text"].map(_simplify_punctuation)
    print(total_length(dataframe))

In [173]:
# Modifies crafted_features_df in place; the two printed totals are the text
# length before and after simplifying repeated punctuation.
remove_duplicate_punctuation(crafted_features_df)

Total length of all segments is 5917918
Total length of all segments is 5917879


Only a small number of characters were removed as expected.

Further punctuation normalization, such as converting other characters to their standard English equivalents (e.g. the opening speech mark “ to ", or the ellipsis … to ...), would be ideal, but the omission of this should not affect the sentence filtering much, and won't have any effect on the tf-idf matrix, because it ignores punctuation.

# Remove non-ASCII characters

This can be done by checking whether each character has a Unicode code point below 128: ASCII characters occupy code points 0–127, so anything at 128 or above is non-ASCII. The code point of a character is obtained with `ord(char)`.

In [174]:
def remove_non_ascii(string):
    """
    Return `string` with every character whose code point is 128 or above removed,
    i.e. keep only ASCII characters (code points 0-127).

    I found this function on this website: https://bobbyhadz.com/blog/python-remove-non-ascii-characters-from-string
    """
    ascii_only = []
    for character in string:
        if ord(character) >= 128:
            continue  # outside the ASCII range, so drop it
        ascii_only.append(character)
    return ''.join(ascii_only)

# Demonstrate the function: non-ASCII characters are dropped, while ASCII
# symbols and digits are kept.
print(remove_non_ascii('a€bñcá')) # >> abc
print(remove_non_ascii('a_b^0')) # >> a_b^0

abc
a_b^0


In [175]:
def remove_nonASCII_chars(dataframe):
    """
    Apply remove_non_ascii to every cell of the "segment_text" column, in place.
    For verification, prints the total segment length before and after.

    Input: Dataframe with a column called "segment_text" that contains strings.
    Returns: None; the dataframe passed in is modified.
    """
    print(total_length(dataframe))
    # Mapping over the column, rather than addressing rows by the labels 0..n-1,
    # keeps this step independent of the dataframe's index.
    dataframe["segment_text"] = dataframe["segment_text"].map(remove_non_ascii)
    print(total_length(dataframe))

In [176]:
# Modifies crafted_features_df in place; the two printed totals are the text
# length before and after dropping non-ASCII characters.
remove_nonASCII_chars(crafted_features_df)

Total length of all segments is 5917879
Total length of all segments is 5901658


16,221 characters were removed (5,917,879 → 5,901,658), which is about 0.27% of all characters.

# Make all policy text lowercase

In [177]:
def convert_to_lowercase(dataframe):
    """
    Lower-case every cell of the "segment_text" column, in place.

    Input: Dataframe with a column called "segment_text" that contains strings.
    Returns: None; the dataframe passed in is modified.
    """
    # Mapping over the column, rather than addressing rows by the labels 0..n-1,
    # makes this work for any index. str.lower raises on a non-string cell, so
    # unexpected values are not silently passed through.
    dataframe["segment_text"] = dataframe["segment_text"].map(str.lower)

# verify on a one-row dataframe containing mixed-case text
sample_df = pd.DataFrame(["ifiUFIWUNFIijnf"], columns=["segment_text"])
convert_to_lowercase(sample_df)  # modifies sample_df in place
display(sample_df)

Unnamed: 0,segment_text
0,ifiufiwunfiijnf


In [178]:
# Lower-case the real data in place, then show the first three segments.
convert_to_lowercase(crafted_features_df)
crafted_features_df['segment_text'].head(3) # verify

0    privacy policy this privacy policy (hereafter ...
1    1. about our products 1.1 our products offer a...
2    2. the information we collect the information ...
Name: segment_text, dtype: object

It can be seen that the text has been changed to lowercase.

# Same pre-processing steps for Annotation Features

When populating the dataframe with crafted features, the dataframe of `annotation features` will be referred to, but it is possible that it is not formatted in the same way, so I will check the format of that too.

I doubt it will have many non-ASCII characters, so instead of a dedicated check I will list every character that is not a lowercase letter (a–z) and inspect the result.

In [179]:
# Load the annotation features; each row of the 'features' column holds a
# collection of crafted features, which is flattened into one list here.
annotation_features = pd.read_pickle('annotation_features.pkl')
list_all_crafted_features = []
for row in annotation_features['features']:
    list_all_crafted_features.extend(row)
len(list_all_crafted_features) # verify – should be 579 crafted features

579

In [180]:
# Collect, in order of first appearance, every distinct character in the crafted
# features that is not a lowercase letter a-z (dict keys keep insertion order
# and drop repeats).
list_of_chars = list(dict.fromkeys(
    char
    for ft in list_all_crafted_features
    for char in ft
    if char not in "abcdefghijklmnopqrstuvwxyz"
))
print(list_of_chars)

[' ', '.', ',', '-', '\xa0', '/', 'S', 'N', 'U', 'T', 'P', 'A', '(', ')', 'I', "'"]


By inspecting this list I can see that the only characters that I don't expect are the uppercase letters and "\xa0", which is the non-breaking space (U+00A0), a non-ASCII whitespace character.

In [181]:
# Lower-case every crafted feature and collapse each run of whitespace to a
# single space. str.split() with no argument also treats the non-breaking space
# '\xa0' as whitespace, so it is replaced by an ordinary space.
# Mapping over the column, rather than addressing rows by the labels 0..n-1,
# makes this work whatever index annotation_features carries.
annotation_features['features'] = annotation_features['features'].map(
    lambda crafted_feature_list: [
        " ".join(feature.lower().split()) for feature in crafted_feature_list
    ]
)

In [182]:
# Re-flatten the cleaned features and repeat the character scan.
list_all_crafted_features = []
for row in annotation_features['features']:
    list_all_crafted_features.extend(row)
print(len(list_all_crafted_features)) # verify – should be 579 crafted features

# Distinct characters other than lowercase a-z, in order of first appearance
# (dict keys keep insertion order and drop repeats).
list_of_chars = list(dict.fromkeys(
    character
    for ft in list_all_crafted_features
    for character in ft
    if character not in "abcdefghijklmnopqrstuvwxyz"
))
list_of_chars

579


[' ', '.', ',', '-', '/', '(', ')', "'"]

All problematic characters are now removed so this list of features can be used for modelling.

In [183]:
# Save the cleaned annotation features, then read the file back to confirm that
# the round trip preserved both the shape and the contents.
annotation_features.to_pickle('clean_annotation_features.pkl')
confirm_save_0 = pd.read_pickle('clean_annotation_features.pkl')
print(annotation_features.shape == confirm_save_0.shape)
print(confirm_save_0.equals(annotation_features))

True
True
