In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

raw_data = pd.read_excel('cookie_compliance_data.xlsx')

In [19]:
# Making a copy of the raw data to work with
df = raw_data.copy()
df.head(3)

Unnamed: 0,Cookie ID,Cookie Name,Domain,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Secure,HttpOnly,...,SameSite,Path,Consent Required,Creation Date,Last Accessed,Priority,Partitioned,Size (KB),Duration,Host Only
0,1,Chocolate Crinkle_2,spxflow.com,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,False,True,...,Strict,/,True,03/03/2024,08/10/2024,Low,True,7.6,Persistent,False
1,2,Nankhatai_1,hitachi.com,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,False,False,...,Strict,/products,True,01/14/2024,08/02/2024,Low,False,6.3,Persistent,True
2,3,Chocolate Wafer_6,electrolux.com,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,False,True,...,Lax,/user,True,09/07/2024,10/12/2024,Low,False,2.4,Session,False


In [20]:
# checking for duplicates
df.duplicated().sum()

0

In [21]:
# checking for nulls
df.isnull().sum()

Cookie ID                                                         0
Cookie Name                                                       0
Domain                                                            0
Expires / Max-Age (in seconds)                                    0
Origin                                                            0
SameParty (if cookie keeps data locally or sends it outside)      0
Purpose                                                           0
Data Collected                                                    0
Secure                                                            0
HttpOnly                                                          0
Cookie Policy                                                     0
Cookie Banner                                                     0
Cookie Options                                                   66
SameSite                                                        186
Path                                            

In [22]:
# Function to perform cleaning and transformation
def clean_and_transform(df):
    # Dropping irrelevant fields
    df.drop(['Cookie Name','Domain','Secure', 'HttpOnly', 'SameSite', 'Path', 'Priority', 'Partitioned','Creation Date', 'Last Accessed', 'Size (KB)', 'Host Only'], axis = 1, inplace = True)
    
    # cookies expiration period converted from seconds to years
    df['expiration (in years)'] = (((df['Expires / Max-Age (in seconds)']/60)/60)/24)/365

    return df                           # returns updated dataframe

clean_and_transform(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years)
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0


In [23]:
# Check: Cookie retention compliance
def is_retention_compliant(df):
    
    # if cookie is session type and expiration < 1 year, or if cookie is persistent type and expiration < 2 years then the cookie is retention compliant 
    # new boolean field is created in the dataframe that specify whether cookie is retention compliant or not based on the above criteria:
    df['Retention compliant'] = ((df['Duration'] == 'Session')&(df['expiration (in years)'] <1))|((df['Duration'] == 'Persistent')&(df['expiration (in years)'] < 2))
    
    return df                                                       # function returns the updated dataframe

is_retention_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True


In [24]:
# cookie purpose that will be considered essential:
essential_purposes = {'Functional', 'Service Improvement', 'Customer Support', 'Operational Efficiency', 'Legal Obligations', 'Compliance', 'Fraud Prevention', 'Security'}

# cookie purpose that will be considered non-essential:
non_essential_purposes = {'Analytics', 'Content Customization', 'Advertising',
                          'Tracking', 'Social Media','Personalization', 'E-commerce', 'Compliance',
                          'Performance Monitoring', 'Market Research', 'User Experience','Customer Feedback'}

def split_essential_and_non_essential(df):             # Function to split the cookies into essential and non-essential based on purpose as labeled above
    alist = []
    for i in range(len(df)):       
        if df['Purpose'][i] in essential_purposes:     # if purpose is any of the keywords in 'essential_purposes' set then it'll be labeled as 'Essential'
            alist.append('Essential')
            
        else:                                          # else it'll be labeled as 'Non-Essential'
            alist.append('Non-Essential')
    df['Essential/Non-essential'] = alist              # New boolean field created that specify if cookie is 'Essential' or 'Non-Essential'
    return df                                          # function returns the updated dataframe

split_essential_and_non_essential(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential


In [25]:
# Key options required in the banner
required_options = {'Decline All', 'Accept All', 'Customize'}

# Check: Cookie banner compliance for non-essential cookies, this is not required for essential cookies
def is_banner_compliant(df):                                            
    
   alist=[]
   for i in range(len(df)):      
      if (df['Essential/Non-essential'][i]== 'Non-Essential'):           # if cookie is non-essential then execute next line to check if cookie banner exists
         if (df['Cookie Banner'][i] == True):                            # if cookie banner exists, then execute next line to check if it contains key phrases within 'required_options' set
            x= df['Cookie Options'][i]
            if required_options.issubset(x):                             # if the banner contains key phrases within 'required_options' set then returns 'True'
               alist.append(True)
            else:                                                        # else returns 'False'
               alist.append(False)
         else:
            alist.append(False)
      else:                                                              # if cookie isn't non-essential, meaning it's essential then returns 'True'
         alist.append(True)                                              # since banner check isn't required for essential cookies

   df['Banner_compliant'] = alist                                        # New boolean field created that specify if cookie is banner compliant or not
   return df                                                             # function returns the updated dataframe

is_banner_compliant(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True


In [26]:
df['Data Collected '][739]

'Search Queries; Referring URL; App Usage; Email Address; JavaScript Enabled'

In [27]:
df['Data Collected '][739].split(';')

['Search Queries',
 ' Referring URL',
 ' App Usage',
 ' Email Address',
 ' JavaScript Enabled']

In [28]:
def collect_data(data, i):                                              # Function to get data collected by cookies  
    for i in range(len(data)):
        words_list = (df['Data Collected '][i].split(';'))              # splitting the content of 'Data collected ' with the delimiter as ';'
    return words_list                                                   # Returns a list of all data collected by a cookie

In [29]:
collect_data(df, 2)

['Device Information',
 ' Local Time',
 ' JavaScript Enabled',
 ' Session Data',
 ' Browser']

In [30]:
# Key phrases to check for policy compliance in 'Cookie Policy' if cookie is essential
policy_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin'}

# Key phrases to check for policy compliance in 'Cookie Policy' if cookie origin is third-party
policy_third_party = {'Third-Party Processing'}

# Key phrases to check for policy compliance in 'Cookie Policy' if cookie is non-essential
policy_non_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin', 'Consent','Managing Cookies'}

# Key phrases to check for policy compliance in 'Cookie Policy' if data collected by cookie contains personal data
user_right_policy = {'What Are Your Rights'}

# These are data considered as personal data
personal_data = {'IP Address', 'Session Data', 'Email Address', 'Phone Number', 'Payment Information',
                 'Geolocation', 'Purchase History', 'Download History'}

In [31]:
def is_personal_data(dataframe, collected_data, cookie_policy_in_set):  # Function to check cookie policy compliance if personal data is in collected data
    for item in collected_data:                                                 
        if any(item in personal_data for item in collected_data):       # If personal data is in collected data, 
            if user_right_policy.issubset(cookie_policy_in_set):        # then checking user right is informed in cookie policy
                return True                                             # If user right is informed then returns 'True'
            return False                                                # else, returns 'False'
    return True                                                         # If there is no personal data in collected data then returns 'True'

In [32]:
# Check: Cookie policy compliance
def is_policy_compliant(df): 

    alist=[]
    for i in range(len(df['Cookie Policy'])):
        cookie_policy_in_set = set(df['Cookie Policy'][i].split('\n'))  # Converting cookie policy content into a set of data
        cookie_purpose = {df['Purpose'][i]}                             # Converting cookie purpose into a set for checking in other sets of data later
        collected_data = collect_data(df, i)                            # calling 'collect_data' function to get list of data collected by each cookie

        # Check for essential cookies
        if cookie_purpose.issubset(essential_purposes):

            # If cookie is essential and stored by third-party then checks cookie policy contains the key phrases within 'policy_essential' and 'policy_third_party'sets:
            if (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == False) and (policy_essential & policy_third_party).issubset(cookie_policy_in_set):
                # calling 'is_personal_data' function to check it's compliance
                alist.append(is_personal_data(df['Origin'][i], collected_data, cookie_policy_in_set))
            
            # If cookie is essential and stored by same party then checks the key phrase contains the key phrases within 'policy_essential' set:
            elif (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_essential).issubset(cookie_policy_in_set):
                # calling 'is_personal_data' function to check it's compliance
                alist.append(is_personal_data(df['Origin'][i], collected_data, cookie_policy_in_set))

            else:
                alist.append(False)

        # Check for non-essential cookies
        elif cookie_purpose.issubset(non_essential_purposes):
            
            # If cookie is essential and stored by third-party then checks cookie policy contains the key phrases within 'policy_non_essential' and 'policy_third_party'sets:
            if (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == False) and (policy_non_essential & policy_third_party).issubset(cookie_policy_in_set):
                # calling 'is_personal_data' function to check it's compliance
                alist.append(is_personal_data(df['Origin'][i], collected_data, cookie_policy_in_set))
            
            # If cookie is essential and stored by same party then checks the key phrase contains the key phrases within 'policy_non_essential' set:
            elif (df['SameParty (if cookie keeps data locally or sends it outside)'][i] == True) and (policy_non_essential).issubset(cookie_policy_in_set):
                # calling 'is_personal_data' function to check it's compliance
                alist.append(is_personal_data(df['Origin'][i], collected_data, cookie_policy_in_set))

            else:
                alist.append(False)

    df['Policy compliant'] = alist                                # New boolean field created that specify if cookie is policy compliant or not
    return df                                                     # function returns the updated dataframe

is_policy_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True


In [33]:
# Final Compliance Check

def total_compliance_check(df):
    alist=[]
    for i in range(len(df)):
        if df['Retention compliant'][i]==True:                  # if cookie is retention compliant then execute the next line to check banner is compliant
            if df['Banner_compliant'][i]==True:                 # if cookie is banner compliant then execute next line to check cookie policy is compliant
                if df['Policy compliant'][i]==True:             # if cookie is policy compliant,
                    alist.append(True)                          # then returns 'True'
                else:                                           # else returns 'False'
                    alist.append(False)
            else:
                alist.append(False)
        else:
            alist.append(False)
    df['Is compliant'] = alist                                  # New boolean field added to dataframe that specify whether cookie is finally compliant or not
    return df                                                   # function returns the updated dataframe

total_compliance_check(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant,Is compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True,True,True


In [None]:
# Breakdown of analysis in a summary:
compliance_summary = {
    'Overall compliance Rate (%)': (((df['Is compliant']==True).sum())/(df['Is compliant'].count())) * 100,
    'Retention compliance rate (%)': ((df['Retention compliant']==True).sum()/(df['Is compliant'].count()))*100,
    'Consent compliance rate (%)': ((df['Banner_compliant']==True).sum()/(df['Is compliant'].count()))*100,
    'Transparency compliance rate (%)': ((df['Policy compliant']==True).sum()/(df['Is compliant'].count()))*100,

    'Total compliant cookies': (df['Is compliant']==True).sum(),
    'Non-compliant cookies': (df['Is compliant']==False).sum(),

    'Compliant essential cookies': df[(df['Essential/Non-essential']=='Essential') & (df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant non-essential cookies': df[(df['Essential/Non-essential']=='Non-Essential') & (df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant session cookies': df[(df['Duration']=='Session')&(df['Is compliant']==True)]['Is compliant'].count(),
    'Compliant persistent cookies': df[(df['Duration']=='Persistent')&(df['Is compliant']==True)]['Is compliant'].count(),
    
    'Total Cookies': len(df),
    'Session Cookies': (df['Duration']=='Session').sum(),
    'Persistent Cookies': (df['Duration']=='Persistent').sum(),
    'Essential Cookies': (df['Essential/Non-essential']=='Essential').sum(),
    'Non-Essential Cookies': (df['Essential/Non-essential']=='Non-Essential').sum()    
}

compliance_summary

{'Overall compliance Rate (%)': 12.266666666666666,
 'Retention compliance rate (%)': 78.4,
 'Banner compliance rate (%)': 40.53333333333333,
 'Policy compliance rate (%)': 29.86666666666667,
 'Total compliant cookies': 92,
 'Non-compliant cookies': 658,
 'Compliant essential cookies': 92,
 'Compliant non-essential cookies': 0,
 'Compliant session cookies': 42,
 'Compliant persistent cookies': 50,
 'Total Cookies': 750,
 'Session Cookies': 379,
 'Persistent Cookies': 371,
 'Essential Cookies': 304,
 'Non-Essential Cookies': 446}

In [37]:
# Testing the cookie policy compliant rule is working:
import random
num = random.randint(0,750)                       # generating a random number between 0-750 to analyze a random sample

print(df['Purpose'][num])
print(df['Origin'][num])
result = df['Policy compliant'][num]
print(f'compliant: {result}')
print(num)
print(df['Data Collected '][num])
print(df['Cookie Policy'][num])

Analytics
Third-party
compliant: False
598
Device Information; Screen Resolution; Session Data; ISP
Cookie Policy

Introduction
We use cookies to enhance your experience on our website. By using our site, you consent to our use of cookies.

Types of CookiesHere are some examples of the types of cookies we use:
Essential Cookies: Necessary for website functionality.
Performance Cookies: Help us understand how you use our site.
Functional Cookies: Remember your preferences.
Advertising Cookies: Deliver relevant ads.

Cookie Origin
First-Party Cookies: Set by our website.
Third-Party Cookies: Set by external services.

What Are Your Rights
To obtain information on your rights concerning the data we collect through cookies and how you can exercise them, please read our Privacy Policy.

Contact Us
For questions, contact us at below email.
