In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

raw_data = pd.read_excel('epitech_cookie_compliance.xlsx')




In [97]:
# Making a copy of the raw data to work with
df = raw_data.copy()
df.head(3)

Unnamed: 0,Cookie ID,Cookie Name,Domain,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Secure,HttpOnly,...,SameSite,Path,Consent Required,Creation Date,Last Accessed,Priority,Partitioned,Size (KB),Duration,Host Only
0,1,Chocolate Crinkle_2,spxflow.com,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,False,True,...,Strict,/,True,03/03/2024,08/10/2024,Low,True,7.6,Persistent,False
1,2,Nankhatai_1,hitachi.com,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,False,False,...,Strict,/products,True,01/14/2024,08/02/2024,Low,False,6.3,Persistent,True
2,3,Chocolate Wafer_6,electrolux.com,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,False,True,...,Lax,/user,True,09/07/2024,10/12/2024,Low,False,2.4,Session,False


In [98]:
# checking for duplicates
df.duplicated().sum()

0

In [99]:
# checking for nulls
df.isnull().sum()

Cookie ID                                                         0
Cookie Name                                                       0
Domain                                                            0
Expires / Max-Age (in seconds)                                    0
Origin                                                            0
SameParty (if cookie keeps data locally or sends it outside)      0
Purpose                                                           0
Data Collected                                                    0
Secure                                                            0
HttpOnly                                                          0
Cookie Policy                                                     0
Cookie Banner                                                     0
Cookie Options                                                   66
SameSite                                                        186
Path                                            

In [100]:
# Clean and transform
def clean_and_transform(df):
    # Dropping irrelevant fields
    df.drop(['Cookie Name','Domain','Secure', 'HttpOnly', 'SameSite', 'Path', 'Priority', 'Partitioned','Creation Date', 'Last Accessed', 'Size (KB)', 'Host Only'], axis = 1, inplace = True)
    
    # cookies expiration period converted from seconds to years
    df['expiration (in years)'] = (((df['Expires / Max-Age (in seconds)']/60)/60)/24)/365

    return df

clean_and_transform(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years)
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0


In [101]:
# Check: Cookie retention compliance
def is_retention_compliant(df):
    df['Retention compliant'] = ((df['Duration'] == 'Session')&(df['expiration (in years)'] <1))|((df['Duration'] == 'Persistent')&(df['expiration (in years)'] < 2))
    return df

is_retention_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True


In [102]:
# Split into essential and non-essential based on purpose
essential_purposes = {'Functional', 'Operational Efficiency', 'Legal Obligations', 'Compliance', 'Fraud Prevention', 'Security'}
non_essential_purposes = {'Analytics', 'Service Improvement',
       'Content Customization', 'Advertising', 'Customer Support',
       'Tracking', 'Social Media','Personalization', 'E-commerce', 'Compliance',
       'Performance Monitoring', 'Market Research', 'User Experience',
       'Customer Feedback'}

def split_essential_and_non_essential(df):
    alist = []
    for i in range(len(df)):       
        if df['Purpose'][i] in essential_purposes:
            alist.append('Essential')
            
        else:
            alist.append('Non-Essential')
    df['Essential/Non-essential'] = alist
    return df

split_essential_and_non_essential(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential


In [103]:
# Key options required in the banner
required_options = {'Decline All', 'Accept All', 'Customize'}

# Check: Cookie banner compliance for non-essential cookies
def is_banner_compliant(df):
    
   alist=[]
   for i in range(len(df)):      
      if (df['Essential/Non-essential'][i]== 'Non-Essential')&(df['Cookie Banner'][i] == True):
         x= df['Cookie Options'][i]
         if required_options.issubset(x):
            alist.append(True)
         else:
            alist.append(False)
      else:
         alist.append(True)

   df['Banner_compliant'] = alist
   return df

is_banner_compliant(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True


In [None]:
# Key phrases to check for policy compliance
policy_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin'}
policy_third_party = {'Third-Party Processing'}
policy_non_essential = {'Types of CookiesHere are some examples of the types of cookies we use:', 'Cookie Origin', 'Third-Party Processing', 'Consent','Managing Cookies', 'What Are Your Rights'}

# Check: Cookie policy compliance
def is_policy_compliant(df): 
    alist = []
    for i in range(len(df['Cookie Policy'])):
        x = {df['Cookie Policy'][i]}
        x= df['Cookie Policy'].str.split('\n')[i] 
        
        if df['Essential/Non-essential'][i]=='Essential':
            if (df['Origin'][i] == 'First-party')&(policy_essential.issubset(x)):
                alist.append(True)
            elif (df['Origin'][i] == 'Third-party')&((policy_essential & policy_third_party).issubset(x)):
                alist.append(True)
            else:
                alist.append(False)

        elif df['Essential/Non-essential'][i]=='Non-Essential':
            if (df['Origin'][i] == 'First-party')&(policy_non_essential.issubset(x)):
                alist.append(True)
            elif (df['Origin'][i] == 'Third-party')&((policy_non_essential & policy_third_party).issubset(x)):
                alist.append(True)
            else:
                alist.append(False)

    df['Policy compliant'] = alist  
    return df

is_policy_compliant(df)
df.head(2)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True


In [109]:
# Final Compliance Check

def total_compliance_check(df):
    alist=[]
    for i in range(len(df)):
        if df['Retention compliant'][i]==True:
            if df['Banner_compliant'][i]==True:
                if df['Policy compliant'][i]==True:
                    alist.append(True)
                else:
                    alist.append(False)
            else:
                alist.append(False)
        else:
            alist.append(False)
    df['Is compliant'] = alist

    return df

total_compliance_check(df)
df.head(3)

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant,Is compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754,True,Essential,True,False,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.0,True,Non-Essential,False,True,False
2,3,-1,First-party,True,Functional,Flash Version; Referring URL; Operating System...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,-3.170979e-08,True,Essential,True,True,True


In [141]:
# Summary of Compliance
compliance_summary = {
    'Total Cookies': len(df),
    'Session Cookies': (df['Duration']=='Session').sum(),
    'Persistent Cookies': (df['Duration']=='Persistent').sum(),
    'Essential Cookies': (df['Essential/Non-essential']=='Essential').sum(),
    'Non-Essential Cookies': (df['Essential/Non-essential']=='Non-Essential').sum(),
    'Compliant Cookies': (df['Is compliant']==True).sum(),
    'Non-Compliant Cookies': (df['Is compliant']==False).sum(),
    'Compliance Rate (%)': (((df['Is compliant']==True).sum())/(df['Is compliant'].count())) * 100
}

compliance_summary

{'Total Cookies': 750,
 'Session Cookies': 379,
 'Persistent Cookies': 371,
 'Essential Cookies': 224,
 'Non-Essential Cookies': 526,
 'Compliant Cookies': 73,
 'Non-Compliant Cookies': 677,
 'Compliance Rate (%)': 9.733333333333333}

In [145]:
# Detailed breakdown of non-compliance reasons
non_compliant_cookies = df[df['Is compliant'] == False]

In [None]:
# Save the analyzed data to a CSV file
#cookies.to_csv(output_filename, index=False)
df.to_excel

Unnamed: 0,Cookie ID,Expires / Max-Age (in seconds),Origin,SameParty (if cookie keeps data locally or sends it outside),Purpose,Data Collected,Cookie Policy,Cookie Banner,Cookie Options,Consent Required,Duration,expiration (in years),Retention compliant,Essential/Non-essential,Banner_compliant,Policy compliant,Is compliant
0,1,60478299,First-party,True,Functional,Network Type; IP Address; Referring URL; Adver...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,1.917754e+00,True,Essential,True,False,False
1,2,0,First-party,True,Analytics,Advertising Preferences; Device Information; V...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Accept All, Customize cookies",True,Persistent,0.000000e+00,True,Non-Essential,False,True,False
3,4,0,First-party,True,Service Improvement,Browser; Language Preferences; Download Histor...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Session,0.000000e+00,True,Non-Essential,False,False,False
4,5,0,First-party,True,Content Customization,Network Type; Visited Pages; Error Reports; Fl...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,Accept All,True,Session,0.000000e+00,True,Non-Essential,False,False,False
5,6,195683602,First-party,True,Advertising,App Usage; Subscription Status; Payment Inform...,No policy informed,True,"Accept All, Customize cookies",True,Session,6.205086e+00,False,Non-Essential,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,746,145495670,First-party,True,Personalization,IP Address; Error Reports; App Usage; Time Spe...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,Accept All,True,Persistent,4.613637e+00,False,Non-Essential,False,False,False
746,747,90423355,First-party,True,Tracking,Clickstream Data; JavaScript Enabled; Session ...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,2.867306e+00,False,Non-Essential,False,False,False
747,748,60060267,First-party,True,Performance Monitoring,Subscription Status; Language Preferences; Bat...,Cookie Policy\n\nIntroduction\nWe use cookies ...,False,,True,Persistent,1.904499e+00,True,Non-Essential,True,False,False
748,749,-1,Third-party,True,Customer Feedback,Search Queries; Clickstream Data; App Usage; F...,Cookie Policy\n\nIntroduction\nWe use cookies ...,True,"Decline All, Accept All, Customize cookies",True,Persistent,-3.170979e-08,True,Non-Essential,False,False,False
