EDA should be in the context of my data.  I should state what I expect BEFORE checking for it in the data.

Also at the top I can include links, context, goals

In [5]:
import pandas as pd
from pandas import json_normalize
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats
from scipy.stats import norm
import statsmodels.api as sm
import yaml
import sys
from collections import defaultdict
from collections import Counter

import ds_utils_callum
import priv_policy_manipulation_functions

Put the first policy into a df

## Populating top-level df

In [17]:
def load_all_policies():
    
    # Load the first policy into a dataframe
    with open("APP_350_v1_1/annotations/policy_1.yml", "r") as stream:
        try:
            all_policies_df = (json_normalize(yaml.safe_load(stream)))
        except yaml.YAMLError as exc:
            print(exc)
        
    # Get the locations of all policy files
    full_policy_list = [f"APP_350_v1_1/annotations/policy_{num}.yml" for num in range(2,351)]
    
    #Loop through all the policy file addresses, normalise it and add it to the bottom of all_policies_df
    for document in full_policy_list:
        with open(document, "r") as stream:
            try:
                current_df_policy_info = (json_normalize(yaml.safe_load(stream)))
                all_policies_df = pd.concat([all_policies_df, current_df_policy_info], axis=0)
            except yaml.YAMLError as exc:
                print(exc)
    
    all_policies_df.reset_index(drop=True, inplace=True)
    
    return all_policies_df

In [18]:
all_policies_df = load_all_policies()
all_policies_df.head(5)

Unnamed: 0,policy_id,policy_name,policy_type,contains_synthetic,segments
0,1,6677G,TEST,False,"[{'segment_id': 0, 'segment_text': 'PRIVACY PO..."
1,2,AIFactory,TEST,False,"[{'segment_id': 0, 'segment_text': 'AI Factory..."
2,3,AppliqatoSoftware,TEST,False,"[{'segment_id': 0, 'segment_text': 'Automatic ..."
3,4,BandaiNamco,TEST,False,"[{'segment_id': 0, 'segment_text': 'MOBILE APP..."
4,5,BarcodeScanner,TEST,False,"[{'segment_id': 0, 'segment_text': 'Skip to co..."


In [19]:
def add_metadata_to_policy_df(input_policies_df):
    
    all_policies_df = input_policies_df.copy()
    
    # New columns to populate
    all_policies_df["num_segments"] = 0
    all_policies_df["num_annotated_segs"] = 0
    all_policies_df["total_characters"] = 0
    
    # Loop through each policy
    
    for i in range(len(all_policies_df["segments"])):
        segment = all_policies_df.loc[i, "segments"] # grab the policy
        
        policy_segment_df = json_normalize(segment) # apply json_normalize
        
        policy_segment_df.set_index('segment_id', inplace=True)
        
        all_policies_df.loc[i, "num_segments"] = policy_segment_df["segment_text"].count() # count the sentences and add to the main df

        policy_segment_df.loc[ policy_segment_df["annotations"].str.len() == 0 , "annotations"] = None # clean the annotations column
        
        all_policies_df.loc[i, "num_annotated_segs"] = policy_segment_df["annotations"].count() # count the annotated segments

        all_policies_df.loc[i, "total_characters"] = policy_segment_df["segment_text"].str.len().sum() # count the characters
        
    return all_policies_df

In [20]:
all_policies_df = add_metadata_to_policy_df(all_policies_df)
all_policies_df.head(5)

Unnamed: 0,policy_id,policy_name,policy_type,contains_synthetic,segments,num_segments,num_annotated_segs,total_characters
0,1,6677G,TEST,False,"[{'segment_id': 0, 'segment_text': 'PRIVACY PO...",36,11,12703
1,2,AIFactory,TEST,False,"[{'segment_id': 0, 'segment_text': 'AI Factory...",14,5,5995
2,3,AppliqatoSoftware,TEST,False,"[{'segment_id': 0, 'segment_text': 'Automatic ...",8,1,2450
3,4,BandaiNamco,TEST,False,"[{'segment_id': 0, 'segment_text': 'MOBILE APP...",57,14,32323
4,5,BarcodeScanner,TEST,False,"[{'segment_id': 0, 'segment_text': 'Skip to co...",32,3,6667


# Making segments

Now to make a new dataframe where each row represents a paragraph (segment).

First I will get this to work for a single policy. Then I will loop through all the policies to apply the same manipulation.

In [25]:
def generate_segment_df(all_policies_df):
    
    # First create this for a single policy. Then loop through all the policies to apply the same manipulation.
    initial_segment = all_policies_df.loc[0,"segments"]
    initial_segment_df = json_normalize(initial_segment) # normalise it
    initial_segment_df.set_index('segment_id', inplace=True)
    initial_segment_df["source_policy_number"] = all_policies_df.loc[0,"policy_id"]

    segment_df = initial_segment_df
    
    for i in all_policies_df.index:
        
        this_segment = all_policies_df.loc[i,"segments"]
        this_segment_df = json_normalize(this_segment) # normalise it
        this_segment_df.set_index('segment_id', inplace=True)
        this_segment_df["source_policy_number"] = all_policies_df.loc[i,"policy_id"]

        segment_df = pd.concat( [segment_df, this_segment_df], axis=0 ) 
    
    segment_df['policy_segment_id'] = segment_df.index
    segment_df.reset_index(drop=True, inplace=True)
    segment_df.index.names = ['segment_id']
    segment_df = segment_df[['source_policy_number', 'policy_segment_id', 'segment_text', 'annotations', 'sentences']]
    
    return segment_df

In [24]:
all_segments_df = generate_segment_df(all_policies_df)
all_segments_df.head()

Unnamed: 0_level_0,source_policy_number,policy_segment_id,segment_text,annotations,sentences
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0,PRIVACY POLICY This privacy policy (hereafter ...,[],[]
1,1,1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,[],[]
2,1,2,2. THE INFORMATION WE COLLECT The information ...,[{'practice': 'Identifier_Cookie_or_similar_Te...,"[{'sentence_text': 'IP ADDRESS, COOKIES, AND W..."
3,1,3,"2.2 In addition, we store certain information ...",[{'practice': 'Identifier_Cookie_or_similar_Te...,"[{'sentence_text': '6677g may use cookies, web..."
4,1,4,(c) to remember your preferences and registrat...,[],[]


In [28]:
segment_df.shape

(15543, 5)

In [30]:
segment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15543 entries, 0 to 15542
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   source_policy_number  15543 non-null  int64 
 1   policy_segment_id     15543 non-null  int64 
 2   segment_text          15543 non-null  object
 3   annotations           15543 non-null  object
 4   sentences             15543 non-null  object
dtypes: int64(2), object(3)
memory usage: 607.3+ KB


# Next step of extraction

In [35]:
all_segments_df.loc[2,:]

source_policy_number                                                    1
policy_segment_id                                                       2
segment_text            2. THE INFORMATION WE COLLECT The information ...
annotations             [{'practice': 'Identifier_Cookie_or_similar_Te...
sentences               [{'sentence_text': 'IP ADDRESS, COOKIES, AND W...
Name: 2, dtype: object

In [29]:
all_segments_df.loc[2, 'segment_text']

"2. THE INFORMATION WE COLLECT The information that our products collect includes (among others) the following: A) IP ADDRESS, COOKIES, AND WEB BEACONS 2.1 When you visit our products, our servers automatically save your computer's IP address. IP addresses will be collected, along with information about the actual web pages that you visit on our products. If you arrive at our products via a link from another product, the URL of the linking product and the URL of any product that you link to next will also be collected."

In [36]:
all_segments_df.loc[2, 'annotations']

[{'practice': 'Identifier_Cookie_or_similar_Tech_1stParty',
  'modality': 'PERFORMED'},
 {'practice': 'Identifier_IP_Address_1stParty', 'modality': 'PERFORMED'}]

In [37]:
all_segments_df.loc[2, 'sentences']

[{'sentence_text': 'IP ADDRESS, COOKIES, AND WEB BEACONS',
  'annotations': [{'practice': 'Identifier_Cookie_or_similar_Tech_1stParty',
    'modality': 'PERFORMED'},
   {'practice': 'Identifier_IP_Address_1stParty', 'modality': 'PERFORMED'}]},
 {'sentence_text': 'IP addresses will be collected, along with information about the actual web pages that you visit on our products.',
  'annotations': [{'practice': 'Identifier_IP_Address_1stParty',
    'modality': 'PERFORMED'}]},
 {'sentence_text': 'The information that our products collect includes (among others) the following:',
  'annotations': [{'practice': 'Identifier_Cookie_or_similar_Tech_1stParty',
    'modality': 'PERFORMED'},
   {'practice': 'Identifier_IP_Address_1stParty', 'modality': 'PERFORMED'}]},
 {'sentence_text': "When you visit our products, our servers automatically save your computer's IP address.",
  'annotations': [{'practice': 'Identifier_IP_Address_1stParty',
    'modality': 'PERFORMED'}]}]

In [38]:
all_segments_df.loc[3, 'segment_text']

'2.2 In addition, we store certain information from your browser, using "cookies." A cookie is a piece of data stored on the user\'s computer and is tied to information about the user. 6677g may use cookies, web beacons (web bugs), or similar technologies to enhance and personalize your experience on our products, including the following: (a) to operate and improve offerings on our products; (b) to help authenticate you when you are on our products;'

In [39]:
all_segments_df.loc[4, 'segment_text']

'(c) to remember your preferences and registration information, as applicable;'

In [40]:
all_segments_df.loc[5, 'segment_text']

'(d) to present and help measure and research the effectiveness of 6677g offerings, advertisements, and email communications; and'

In [41]:
all_segments_df.loc[6, 'segment_text']

'(e) to customize the content and advertisements provided to you through our products.'

In [42]:
all_segments_df.loc[7, 'segment_text']

'2.3 6677g may also use ad network providers to help present advertisements on our products. These ad network providers use cookies, web beacons, or similar technologies to help the presenting, better targeting, and measuring of the effectiveness of their advertisements, using data gathered over time and across their networks of web pages to determine or predict the characteristics and preferences of their audiences. 6677g offers some services in connection with other products. Personal information that you provide to those sites may be sent to 6677g in order to deliver these services. 6677g processes such information in accordance with this Privacy Policy.'

I probably want to separate it out to the sentence level, as that is the max granularity of the annotations and some paragraphs are just one sentence anyway.

## Demo EDA