In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

raw_data = pd.read_excel('cookie_compliance.xlsx')

In [121]:
# Making a copy of the raw data to work with
df = raw_data.copy()
df.head(3)

Unnamed: 0,Cookie ID,Cookie Name,Domain,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Secure,HttpOnly,...,SameSite,Path,Consent Required,Creation Date,Last Accessed,Priority,Partitioned,Size (KB),Duration,Host Only
0,1,Chocolate Crinkle_2,spxflow.com,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,False,True,...,Strict,/,True,03/03/2024,08/10/2024,Low,True,7.6,Persistent,False
1,2,Nankhatai_1,hitachi.com,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,False,False,...,Strict,/products,True,01/14/2024,08/02/2024,Low,False,6.3,Persistent,True
2,3,Chocolate Wafer_6,electrolux.com,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,False,True,...,Lax,/user,True,09/07/2024,10/12/2024,Low,False,2.4,Session,False


In [122]:
# checking for duplicates
df.duplicated().sum()

0

In [123]:
# checking for nulls
df.isnull().sum()

Cookie ID                                                         0
Cookie Name                                                       0
Domain                                                            0
Expires / Max-Age (in seconds)                                    0
Origin                                                            0
SameParty (if cookie keeps data locally or sends it outside)      0
Purpose                                                           0
Data Collected                                                    0
Secure                                                            0
HttpOnly                                                          0
Cookie Policy                                                     0
Cookie Banner                                                     0
Cookie Options                                                   66
SameSite                                                        186
Path                                            

In [124]:
# Clean and transform
def clean_and_transform(df):
    # Dropping irrelevant fields
    df.drop(['Cookie Name','Domain','Secure', 'HttpOnly', 'SameSite', 'Path', 'Priority', 'Partitioned','Creation Date', 'Last Accessed', 'Size (KB)', 'Host Only'], axis = 1, inplace = True)
    
    # cookies expiration period converted from seconds to years
    df['expiration (in years)'] = (((df['Expires / Max-Age (in seconds)']/60)/60)/24)/365

    return df

clean_and_transform(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years)
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0


In [125]:
# Check: Cookie retention compliance
def is_retention_compliant(df):
    df['Retention compliant'] = ((df['Duration'] == 'Session')&(df['expiration (in years)'] <1))|((df['Duration'] == 'Persistent')&(df['expiration (in years)'] < 2))
    return df

is_retention_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True


In [126]:
# Split into essential and non-essential based on purpose
essential_purposes = {'Functional', 'Service Improvement', 'Customer Support', 'Operational Efficiency', 'Legal Obligations', 'Compliance', 'Fraud Prevention', 'Security'}
non_essential_purposes = {'Analytics', 'Content Customization', 'Advertising',
                          'Tracking', 'Social Media','Personalization', 'E-commerce', 'Compliance',
                          'Performance Monitoring', 'Market Research', 'User Experience','Customer Feedback'}

def split_essential_and_non_essential(df):
    alist = []
    for i in range(len(df)):       
        if df['Purpose'][i] in essential_purposes:
            alist.append('Essential')
            
        else:
            alist.append('Non-Essential')
    df['Essential/Non-essential'] = alist
    return df

split_essential_and_non_essential(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential


In [128]:
# Key options required in the banner
required_options = {'Decline All', 'Accept All', 'Customize'}

# Check: Cookie banner compliance for non-essential cookies
def is_banner_compliant(df):
    
   alist=[]
   for i in range(len(df)):      
      if (df['Essential/Non-essential'][i]== 'Non-Essential'):
         if (df['Cookie Banner'][i] == True):
            x= df['Cookie Options'][i]
            if required_options.issubset(x):
               alist.append(True)
            else:
               alist.append(False)
         else:
            alist.append(False)
      else:
         alist.append(True)

   df['Banner_compliant'] = alist
   return df

is_banner_compliant(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True


In [129]:
df['Data Collected '][739]

'Search Queries; Referring URL; App Usage; Email Address; JavaScript Enabled'

In [130]:
df['Data Collected '][739].split(';')

['Search Queries',
 ' Referring URL',
 ' App Usage',
 ' Email Address',
 ' JavaScript Enabled']

In [131]:
def collect_data(data, i):
    
    for i in range(len(data)):
        words_list = (df['Data Collected '][i].split(';'))
    
    return words_list

In [132]:
collect_data(df, 2)

['Device Information',
 ' Local Time',
 ' JavaScript Enabled',
 ' Session Data',
 ' Browser']

In [133]:
# Key phrases to check for policy compliance
policy_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin'}
policy_third_party = {'Third-Party Processing'}
policy_non_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin', 'Third-Party Processing', 'Consent','Managing Cookies'}
user_right_policy = {'What Are Your Rights'}
personal_data = {'IP Address', 'Session Data', 'Email Address', 'Phone Number', 'Payment Information',
                 'Geolocation', 'Purchase History', 'Download History'}

In [149]:
set(df['Cookie Policy'][0].split('\n'))

{'',
 'Contact Us',
 'Cookie Policy',
 'For questions, contact us at below email.',
 'Introduction',
 'Managing Cookies',
 'To obtain information on your rights concerning the data we collect through cookies and how you can exercise them, please read our Privacy Policy.',
 'We use cookies to enhance your experience on our website. By using our site, you consent to our use of cookies.',
 'What Are Your Rights',
 'You can control cookies through your browser settings. Blocking cookies may affect website functionality.'}

In [134]:
def is_personal_data(dataframe, collected_data, set_of_data, policy_set):
    
    for item in collected_data:
        if any(item in personal_data for item in collected_data):
            if user_right_policy.issubset(set_of_data):
                return True
            return False
    return True

In [136]:
# Check: Cookie policy compliance
def is_policy_compliant(df): 

    alist=[]
    for i in range(len(df['Cookie Policy'])):
        set_of_data = set(df['Cookie Policy'][i].split('\n'))  # Convert to a set of data
        y = {df['Purpose'][i]}  # Purpose as a set
        collected_data = collect_data(df, i)

        # Check for essential cookies
        if y.issubset(essential_purposes):
            if (df['Origin'][i] == 'First-party') and policy_essential.issubset(set_of_data):
                alist.append(is_personal_data(df['Origin'][i], collected_data, set_of_data, policy_essential))
            elif (df['Origin'][i] == 'Third-party') and (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_essential | policy_third_party).issubset(set_of_data):
                alist.append(is_personal_data(df['Origin'][i], collected_data, set_of_data, policy_essential | policy_third_party))
            else:
                alist.append(False)

        # Check for non-essential cookies
        elif y.issubset(non_essential_purposes):
            if (df['Origin'][i] == 'First-party') and policy_non_essential.issubset(set_of_data):
                alist.append(is_personal_data(df['Origin'][i], collected_data, set_of_data, policy_non_essential))
            elif (df['Origin'][i] == 'Third-party') and (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_non_essential | policy_third_party).issubset(set_of_data):
                alist.append(is_personal_data(df['Origin'][i], collected_data, set_of_data, policy_non_essential | policy_third_party))
            else:
                alist.append(False)

    df['Policy compliant'] = alist
    return df

is_policy_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True


In [137]:
print(df['Data Collected '][0])
print(df['Cookie Policy'][0])

Network Type; IP Address; Referring URL; Advertising Preferences; Operating System
Cookie Policy

Introduction
We use cookies to enhance your experience on our website. By using our site, you consent to our use of cookies.

Managing Cookies
You can control cookies through your browser settings. Blocking cookies may affect website functionality.

What Are Your Rights
To obtain information on your rights concerning the data we collect through cookies and how you can exercise them, please read our Privacy Policy.

Contact Us
For questions, contact us at below email.


In [138]:
# Final Compliance Check

def total_compliance_check(df):
    alist=[]
    for i in range(len(df)):
        if df['Retention compliant'][i]==True:
            if df['Banner_compliant'][i]==True:
                if df['Policy compliant'][i]==True:
                    alist.append(True)
                else:
                    alist.append(False)
            else:
                alist.append(False)
        else:
            alist.append(False)
    df['Is compliant'] = alist

    return df

total_compliance_check(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant,Is compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True,True,True


In [139]:
# Summary of Compliance
compliance_summary = {
    'Overall compliance Rate (%)': (((df['Is compliant']==True).sum())/(df['Is compliant'].count())) * 100,
    'Retention compliance rate (%)': ((df['Retention compliant']==True).sum()/(df['Is compliant'].count()))*100,
    'Banner compliance rate (%)': ((df['Banner_compliant']==True).sum()/(df['Is compliant'].count()))*100,
    'Policy compliance rate (%)': ((df['Policy compliant']==True).sum()/(df['Is compliant'].count()))*100,
    'Total compliant cookies': (df['Is compliant']==True).sum(),
    'Non-compliant cookies': (df['Is compliant']==False).sum(),
    'Compliant essential cookies': df[(df['Essential/Non-essential']=='Essential') & (df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant non-essential cookies': df[(df['Essential/Non-essential']=='Non-Essential') & (df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant session cookies': df[(df['Duration']=='Session')&(df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant persistent cookies': df[(df['Duration']=='Persistent')&(df['Is compliant']==True)]['Is compliant'].count(),
    'Total Cookies': len(df),
    'Session Cookies': (df['Duration']=='Session').sum(),
    'Persistent Cookies': (df['Duration']=='Persistent').sum(),
    'Essential Cookies': (df['Essential/Non-essential']=='Essential').sum(),
    'Non-Essential Cookies': (df['Essential/Non-essential']=='Non-Essential').sum()    
}

compliance_summary

{'Overall compliance Rate (%)': 8.666666666666668,
 'Retention compliance rate (%)': 78.4,
 'Banner compliance rate (%)': 40.53333333333333,
 'Policy compliance rate (%)': 12.533333333333333,
 'Total compliant cookies': 65,
 'Non-compliant cookies': 685,
 'Compliant essential cookies': 65,
 'Compliant non-essential cookies': 0,
 'Compliant session cookies': 30,
 'Compliant persistent cookies': 35,
 'Total Cookies': 750,
 'Session Cookies': 379,
 'Persistent Cookies': 371,
 'Essential Cookies': 304,
 'Non-Essential Cookies': 446}

In [146]:
# Testing the cookie policy compliant rule is working:
import random
num = random.randint(0,750)

print(df['Purpose'][num])
print(df['Origin'][num])
result = df['Policy compliant'][num]
print(f'compliant: {result}')
print(num)
print(df['Data Collected '][num])
print(df['Cookie Policy'][num])

Content Customization
Third-party
compliant: False
13
Login Times; Local Time; Time Spent on Page; Operating System; Phone Number
Cookie Policy

Introduction
We use cookies to enhance your experience on our website. By using our site, you consent to our use of cookies.

Types of CookiesHere are some examples of the types of cookies we use:
Essential Cookies: Necessary for website functionality.
Performance Cookies: Help us understand how you use our site.
Functional Cookies: Remember your preferences.
Advertising Cookies: Deliver relevant ads.

Consent
We obtain your consent for non-essential cookies. You can manage your preferences through our cookie settings.

Managing Cookies
You can control cookies through your browser settings. Blocking cookies may affect website functionality.

What Are Your Rights
To obtain information on your rights concerning the data we collect through cookies and how you can exercise them, please read our Privacy Policy.

Contact Us
For questions, contact us

In [143]:
'''

from nltk import ngrams
def get_data_collected(data):
    n=1
    words_list = []
    for i in range(len(data)):
        unigrams = ngrams(data['Data Collected '][i].split(';'), n)

        for grams in unigrams:
            words_list.append(grams)

    return words_list
'''

"\n\nfrom nltk import ngrams\ndef get_data_collected(data):\n    n=1\n    words_list = []\n    for i in range(len(data)):\n        unigrams = ngrams(data['Data Collected '][i].split(';'), n)\n\n        for grams in unigrams:\n            words_list.append(grams)\n\n    return words_list\n"

In [144]:
'''def collect_data(n):
    data = df[n:n+1]
    data.reset_index(inplace=True)
    collected_tuple_list = get_data_collected(data)
    flattened = [item.strip() for sublist in collected_tuple_list for item in sublist]
    collected_data_series = pd.Series(flattened).unique()

    return collected_data_series'''

'def collect_data(n):\n    data = df[n:n+1]\n    data.reset_index(inplace=True)\n    collected_tuple_list = get_data_collected(data)\n    flattened = [item.strip() for sublist in collected_tuple_list for item in sublist]\n    collected_data_series = pd.Series(flattened).unique()\n\n    return collected_data_series'

In [145]:
'''# Key phrases to check for policy compliance
policy_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin'}
policy_third_party = {'Third-Party Processing'}
policy_non_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin', 'Third-Party Processing', 'Consent','Managing Cookies', 'What Are Your Rights'}
user_right_policy = {'What Are Your Rights'}
personal_data = {'IP Address', 'Session Data', 'Email Address', 'Phone Number', 'Payment Information',
                 'Geolocation', 'Purchase History', 'Download History'}
# Check: Cookie policy compliance
def is_policy_compliant(df): 
    def evaluate_policy(origin, collected_data, x, policy_set):
        for item in collected_data:
            if any(personal_item in personal_data for personal_item in collected_data):
                if user_right_policy.issubset(x):
                    return True
                return False
        return True

    # Iterate through the DataFrame rows
    alist=[]
    for i in range(len(df['Cookie Policy'])):
        x = set(df['Cookie Policy'][i].split('\n'))  # Convert to a set for subset checking
        y = {df['Purpose'][i]}  # Purpose as a set
        collected_data = collect_data(i)

        # Check for essential cookies
        if y.issubset(essential_purposes):
            if (df['Origin'][i] == 'First-party') and policy_essential.issubset(x):
                alist.append(evaluate_policy(df['Origin'][i], collected_data, x, policy_essential))
            elif (df['Origin'][i] == 'Third-party') and (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_essential | policy_third_party).issubset(x):
                alist.append(evaluate_policy(df['Origin'][i], collected_data, x, policy_essential | policy_third_party))
            else:
                alist.append(False)

        # Check for non-essential cookies
        elif y.issubset(non_essential_purposes):
            if (df['Origin'][i] == 'First-party') and policy_non_essential.issubset(x):
                alist.append(evaluate_policy(df['Origin'][i], collected_data, x, policy_non_essential))
            elif (df['Origin'][i] == 'Third-party') and (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_non_essential | policy_third_party).issubset(x):
                alist.append(evaluate_policy(df['Origin'][i], collected_data, x, policy_non_essential | policy_third_party))
            else:
                alist.append(False)

    df['Policy compliant'] = alist
    return df

is_policy_compliant(df)
df.head(2)'''

"# Key phrases to check for policy compliance\npolicy_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin'}\npolicy_third_party = {'Third-Party Processing'}\npolicy_non_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin', 'Third-Party Processing', 'Consent','Managing Cookies', 'What Are Your Rights'}\nuser_right_policy = {'What Are Your Rights'}\npersonal_data = {'IP Address', 'Session Data', 'Email Address', 'Phone Number', 'Payment Information',\n                 'Geolocation', 'Purchase History', 'Download History'}\n# Check: Cookie policy compliance\ndef is_policy_compliant(df): \n    def evaluate_policy(origin, collected_data, x, policy_set):\n        for item in collected_data:\n            if any(personal_item in personal_data for personal_item in collected_data):\n                if user_right_policy.issubset(x):\n                    return True\n                return False\n  