# Section 2, R2

The user should have the ability to delete their account. Imagine you are a
security analyst tasked with assessing whether an Android app complies with this
privacy requirement derived from regulations such as GDPR. 

In [1]:
import os
import sys
import xml.etree.ElementTree as ET
import re
import json
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime

# Configuration
# SMALI_DIR: root folder that is scanned for apps; each sub-directory is
# treated as one app and is walked recursively for .smali files.
SMALI_DIR = "./SMALI"
# LOG_FILE: plain-text log that log_finding() appends every match to.
# The run cell truncates it before the analysis starts.
LOG_FILE = "./findings/account_deletion_findings.txt"

We define the keywords and API patterns that we are looking for:

In [2]:
# Patterns to search for
# Plain phrases that are matched as substrings. Both analysers lower-case the
# text they scan (string-resource names/values and smali file contents) before
# testing `pattern in text`, so every entry here must stay lower-case.
# Most phrases appear with and without the space — presumably so that both
# user-facing text ("delete account") and identifiers ("deleteaccount") match.
DELETION_PATTERNS = [
    "delete account",
    "deleteaccount",
    "remove account",
    "removeaccount",
    "account deletion",
    "accountdeletion",
    "delete my account",
    "deletemyaccount",
    "close account",
    "closeaccount",
    "deactivate account",
    "deactivateaccount",
    "delete profile",
    "deleteprofile",
    "gdpr delete",
    "permanent delete",
    "terminate account",
    "terminateaccount",
    "account removal",
    "accountremoval",
]


# Regular expressions that process_smali_file() runs over each smali file with
# re.IGNORECASE. The first four look like URL path fragments of delete
# endpoints, the rest like method-style identifiers.
# The file content is lower-cased before matching, so the reported matches
# come back in lower case regardless of the casing used here.
API_PATTERNS = [
    r"\/accounts?\/delete",
    r"\/users?\/delete",
    r"\/profile\/delete",
    r"\/auth\/delete",
    r"deleteUserAccount",
    r"deleteAccountPermanently",
    r"removeUserAccount",
    r"accountDeletionRequest",
]

Helper functions, e.g. to log the findings:

In [49]:
def log_finding(app_name, finding_type, details):
    """Append a single timestamped entry to the file named by ``LOG_FILE``.

    An entry is a blank line, a header ``[timestamp] app_name - finding_type``,
    and then ``details`` framed by two 80-character dashed rules.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    rule = "-" * 80 + "\n"
    # Assemble the whole entry first so it goes out in one write call.
    entry = (
        f"\n[{stamp}] {app_name} - {finding_type}\n"
        + rule
        + f"{details}\n"
        + rule
    )
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(entry)

def get_app_directories(base_dir):
    """Return the names of the directories directly inside ``base_dir``.

    Plain files are skipped. The names are returned sorted so that the order
    in which apps are processed (and therefore the log and the summary
    report) is the same on every run; the operating system returns directory
    entries in arbitrary order.

    Raises whatever ``os.scandir`` raises when ``base_dir`` cannot be listed
    (e.g. ``FileNotFoundError``).
    """
    with os.scandir(base_dir) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())

def generate_summary_report(all_findings, high_threshold=10):
    """Build the plain-text summary of account deletion evidence per app.

    Args:
        all_findings: iterable of per-app dicts with the keys ``"app_name"``,
            ``"strings_analysis"`` and ``"smali_analysis"``. Only the lengths
            of the two analysis lists are used.
        high_threshold: minimum combined number of entries (string findings
            plus smali findings) for an app to be rated ``"High"``; anything
            below that, but above zero, is rated ``"Low"``. Defaults to 10.

    Returns:
        The report as one string: a title block followed by one section per
        app, in input order.
    """
    parts = [
        "Account Deletion Functionality Analysis Summary\n",
        "==========================================\n\n",
    ]

    for app_findings in all_findings:
        app_name = app_findings["app_name"]
        parts.append(f"\nApp: {app_name}\n")
        # Underline as wide as the "App: <name>" heading.
        parts.append("-" * (len(app_name) + 5) + "\n")

        string_hits = len(app_findings["strings_analysis"])
        code_hits = len(app_findings["smali_analysis"])
        total_hits = string_hits + code_hits

        if total_hits > 0:
            parts.append("Evidence of account deletion functionality:\n")
            if string_hits:
                parts.append(f"- Found {string_hits} relevant string resources\n")
            if code_hits:
                parts.append(f"- Found {code_hits} relevant code patterns\n")

            likelihood = "High" if total_hits >= high_threshold else "Low"
            parts.append(f"\nLikelihood of account deletion feature: {likelihood}\n")
        else:
            parts.append("No clear evidence of account deletion functionality found.\n")

        parts.append("\n" + "-" * 50 + "\n")

    return "".join(parts)

Function to analyse the `strings.xml` files: 

In [50]:
def analyze_strings_xml(app_dir, app_name):
    """Collect string resources that mention account deletion.

    Looks for ``strings.xml`` in the default and the English locale resource
    directories below ``app_dir``. Every ``<string>`` element whose text or
    ``name`` attribute contains one of ``DELETION_PATTERNS`` (compared in
    lower case) yields one finding per matching pattern, which is also
    written to the log via ``log_finding``.

    Args:
        app_dir: root directory of the app.
        app_name: app identifier used in log entries and error messages.

    Returns:
        List of dicts with the keys ``"matched_pattern"``, ``"name"`` and
        ``"text"`` (``text`` is ``None`` for an element without text).

    Errors are handled per file: a ``strings.xml`` that cannot be read or
    parsed is reported on stderr and skipped, and the remaining resource
    directories are still analysed.
    """
    values_dirs = (
        "res/values",
        "res/values-en-rAU",
        "res/values-en-rCA",
        "res/values-en-rGB",
        "res/values-en-rIE",
        "res/values-en-rIN",
        "res/values-en-rNZ",
        "res/values-en-rUS",
        "res/values-en-rXC",
        "res/values-en-rZA",
    )
    deletion_strings = []

    for values_dir in values_dirs:
        strings_path = os.path.join(app_dir, values_dir, "strings.xml")
        if not os.path.exists(strings_path):
            continue

        # The try block sits inside the loop so that one malformed file does
        # not discard the remaining locales.
        try:
            root = ET.parse(strings_path).getroot()
            for string in root.findall(".//string"):
                resource_name = string.get("name", "")
                text = (string.text or "").lower()
                name = (resource_name or "").lower()

                for pattern in DELETION_PATTERNS:
                    if pattern in text or pattern in name:
                        finding = {
                            "matched_pattern": pattern,
                            "name": resource_name,
                            "text": string.text,
                        }
                        deletion_strings.append(finding)
                        log_finding(
                            app_name,
                            "String Resource",
                            f"Found in strings.xml:\nID: {finding['name']}\nText: {finding['text']}",
                        )
        except Exception as e:
            print(f"Error analyzing strings.xml for {app_name}: {str(e)}", file=sys.stderr)

    return deletion_strings

Function to analyse the smali files: 

In [51]:
def _context_window(lines, line_no, radius=5):
    """Return the lines within ``radius`` lines of ``line_no``, newline-joined."""
    start = max(0, line_no - radius)
    end = min(len(lines), line_no + radius + 1)
    return "\n".join(lines[start:end])


def process_smali_file(args):
    """Scan one smali file for account deletion keywords and API patterns.

    Args:
        args: ``(file_path, app_name)`` tuple. A single argument is used so
            the function can be passed directly to ``executor.map``.

    Returns:
        ``(file_path, findings)`` when at least one keyword or API pattern
        matched, otherwise ``None``. ``findings`` has the keys
        ``"keyword_matches"`` (matched phrases from ``DELETION_PATTERNS``),
        ``"api_matches"`` (matched text for ``API_PATTERNS``) and
        ``"context"`` (one ``{"match", "context"}`` dict per hit, holding the
        five lines before and after the hit).
        ``None`` is also returned when the file cannot be processed; the
        error is printed to stderr.

    The file content is lower-cased before matching, so matches and context
    are reported in lower case. Every hit is also written to the log via
    ``log_finding``.
    """
    file_path, app_name = args
    findings = {"keyword_matches": [], "api_matches": [], "context": []}

    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read().lower()

        # Split once; every keyword and API hit below reuses the same lines.
        lines = content.split("\n")

        # Check for keyword matches using complete phrases
        for pattern in DELETION_PATTERNS:
            if pattern not in content:
                continue

            findings["keyword_matches"].append(pattern)

            for i, line in enumerate(lines):
                if pattern not in line:
                    continue

                context = _context_window(lines, i)
                findings["context"].append({"match": pattern, "context": context})

                log_finding(
                    app_name,
                    "Code Pattern",
                    f"File: {file_path}\nPattern: {pattern}\nContext:\n{context}",
                )

        # Check for API patterns
        for pattern in API_PATTERNS:
            for match in re.finditer(pattern, content, re.IGNORECASE):
                matched = match.group()
                findings["api_matches"].append(matched)

                # Zero-based line index of the match = newlines before it.
                line_no = content.count("\n", 0, match.start())
                context = _context_window(lines, line_no)
                findings["context"].append({"match": matched, "context": context})

                log_finding(
                    app_name,
                    "API Pattern",
                    f"File: {file_path}\nAPI: {matched}\nContext:\n{context}",
                )

        if findings["keyword_matches"] or findings["api_matches"]:
            return file_path, findings

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}", file=sys.stderr)
    return None

Function to analyse one decompiled APK:

In [52]:
def analyze_app(app_dir, app_name):
    """Analyze a single app directory and persist the findings as JSON.

    Runs the string-resource analysis, then scans every ``.smali`` file below
    ``app_dir`` in parallel worker processes.

    Args:
        app_dir: root directory of the app.
        app_name: app identifier; used for log output and as the stem of the
            JSON file name.

    Returns:
        Dict with the keys ``"app_name"``, ``"strings_analysis"`` (list of
        string findings), ``"smali_analysis"`` (one ``{"file", "findings"}``
        entry per smali file with matches) and ``"total_files_processed"``.

    Side effects:
        Writes the returned dict to ``findings/<app_name>.json`` (relative to
        the working directory), creating the ``findings`` directory if
        needed. The file is written for every app, including apps without
        smali files, so later steps can rely on it being there.
    """
    findings = {
        "app_name": app_name,
        "smali_analysis": [],
        "strings_analysis": [],
        "total_files_processed": 0,
    }

    # Analyze string files
    print(f"\nAnalyzing string files for {app_name}...")
    findings["strings_analysis"] = analyze_strings_xml(app_dir, app_name)

    # Get all smali files, paired with the app name for process_smali_file
    smali_files = [
        (os.path.join(root, file), app_name)
        for root, _, files in os.walk(app_dir)
        for file in files
        if file.endswith(".smali")
    ]
    total_files = len(smali_files)

    if total_files == 0:
        print(f"No .smali files found for {app_name}")
    else:
        # Use the SLURM allocation when present, otherwise the machine's CPU
        # count; never more than 8 and never fewer than 1 worker.
        num_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", os.cpu_count() or 1))
        num_workers = max(1, min(num_cpus, 8))

        print(
            f"Processing {total_files} smali files for {app_name} using {num_workers} parallel processes"
        )

        with tqdm(total=total_files, desc=f"Processing {app_name}", unit="file") as pbar:
            try:
                # About four chunks per worker.
                chunk_size = max(1, total_files // (num_workers * 4))

                with ProcessPoolExecutor(max_workers=num_workers) as executor:
                    for result in executor.map(
                        process_smali_file, smali_files, chunksize=chunk_size
                    ):
                        if result:
                            file_path, file_findings = result
                            findings["smali_analysis"].append(
                                {"file": file_path, "findings": file_findings}
                            )
                        pbar.update(1)
                        findings["total_files_processed"] += 1

            except Exception as e:
                # Best effort: keep whatever was collected before the failure.
                print(
                    f"Error in parallel processing for {app_name}: {str(e)}",
                    file=sys.stderr,
                )

    os.makedirs("findings", exist_ok=True)
    with open(f"findings/{app_name}.json", "w", encoding="utf-8") as f:
        json.dump(findings, f)

    return findings

## Run the analysis

Now we can run the analysis on all our APK files: 

In [53]:
# Make sure the directory for the log file exists; on a fresh checkout it
# does not, and opening LOG_FILE would fail.
log_dir = os.path.dirname(LOG_FILE)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# Clear or create the log file
with open(LOG_FILE, "w", encoding="utf-8") as f:
    f.write("Account Deletion Analysis Log\n")
    f.write("=" * 30 + "\n")

# Get all app directories
app_dirs = get_app_directories(SMALI_DIR)
all_findings = []

# Process each app; results are appended one by one so that an interrupted
# run still leaves the finished apps in all_findings.
for app_name in app_dirs:
    app_dir = os.path.join(SMALI_DIR, app_name)
    findings = analyze_app(app_dir, app_name)
    all_findings.append(findings)

# Generate and save summary report
summary_report = generate_summary_report(all_findings)
with open("account_deletion_summary.txt", "w", encoding="utf-8") as f:
    f.write(summary_report)


Analyzing string files for com.tado...
Processing 6254 smali files for com.tado using 8 parallel processes


Processing com.tado:   0%|          | 0/6254 [00:00<?, ?file/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea


Analyzing string files for mynt.app...


KeyboardInterrupt: 

## Use an LLM 

For apps where our analysis yields a low likelihood, we double-check with an LLM
whether the code and strings we found actually implement account deletion
functionality.

In [3]:
# Apps that are re-checked with the LLM below. The list is written out by
# hand here rather than computed from the analysis results.
low_likelihood_apps = [
    "mynt.app",
    "com.wooxhome.smart",
    "nz.co.stuff.android.news",
    "linko.home",
]

First, we import our `AccountDeletionAnalyser`, which uses a publicly available
model from Hugging Face, and give it the data we found in our earlier analysis.

In [4]:
# AccountDeletionAnalyser comes from the `llm_analyser` module, which is not
# part of this notebook (presumably a project-local file — confirm).
from llm_analyser import AccountDeletionAnalyser

# One shared instance; its analyse_code_snippet() method is called once per app below.
# NOTE(review): constructing it appears to load the model checkpoint (see the
# "Loading checkpoint shards" cell output) — confirm against llm_analyser.
analyser = AccountDeletionAnalyser()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Now let's check all our "low" confidence apps.

In [8]:
for app in low_likelihood_apps:
    # The findings were written as UTF-8 JSON, so read them back the same way.
    with open(f"findings/{app}.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    # Send only the first context snippet of the first matching smali file.
    # If there is none, fall back to the (possibly empty) smali findings list.
    smali_analysis = data.get("smali_analysis") or []
    first_contexts = None
    if smali_analysis:
        first_contexts = (smali_analysis[0].get("findings") or {}).get("context")

    if first_contexts:
        smali_code = first_contexts[0]
    else:
        smali_code = smali_analysis

    strings_xml = data.get("strings_analysis", [])

    result = analyser.analyse_code_snippet(smali_code, strings_xml)

    # Label the output so the analyses of different apps can be told apart.
    print(f"\n=== {app} ===")
    print("Full Analysis:", result["full_analysis"])

    # Compare against None instead of testing truthiness: a likelihood of 0 is
    # a valid answer and must not be reported as "Could not determine".
    # NOTE(review): assumes the analyser returns None when it cannot extract a
    # percentage — confirm against llm_analyser.
    likelihood = result["likelihood_percentage"]
    print(
        "Likelihood of Account Deletion:",
        f"{likelihood}%" if likelihood is not None else "Could not determine",
    )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Full Analysis: Analyze the following code snippets for account deletion functionality:

Smali Code:
{'match': '/user/delete', 'context':'       "lrideatom/app/data/user/sumsub;",\n        "i",\n        "(ljava/lang/string;lnp/continuation;)ljava/lang/object;",\n        "lrideatom/app/data/user/user;",\n        "n",\n        "lrideatom/app/data/user/deletecardresponse;",\n        "k",\n        "email",\n        "name",\n        "",\n        "agreemarketing",'}

Strings XML:
[]

Task: Determine the likelihood (0-100%) that this code relates to user account
deletion. 
Respond in the following format where you replace the <likelihood> with your
determined likelihood and <reason> with a detailed explanation of your reasoning: 

"Likelihood of Account Deletion Functionality: <insert likelihood here>.
The reason for this is: <reason>"

Solution:

Likelihood of Account Deletion Functionality: 100%.
The reason for this is that the code snippet includes a match statement for the "/user/delete" p

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Full Analysis: Analyze the following code snippets for account deletion functionality:

Smali Code:
{'match': 'terminateaccount', 'context': '.class public final lcom/tuya/smart/personal/account/security/plug/cell/terminateaccountcell$a;\n.super ljava/lang/object;\n.source "terminateaccountcell.kt"\n\n\n# annotations'}

Strings XML:
[{'matched_pattern': 'deactivate account', 'name': 'terminate_account', 'text': 'Deactivate Account'}, {'matched_pattern': 'delete account', 'name': 'ty_logoff_query_sure', 'text': 'Are you sure to delete account?'}, {'matched_pattern': 'delete account', 'name': 'ty_logoff_title', 'text': 'Delete Account'}]

Task: Determine the likelihood (0-100%) that this code relates to user account
deletion. 
Respond in the following format where you replace the <likelihood> with your
determined likelihood and <reason> with a detailed explanation of your reasoning: 

"Likelihood of Account Deletion Functionality: <insert likelihood here>.
The reason for this is: <reason

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Full Analysis: Analyze the following code snippets for account deletion functionality:

Smali Code:
{'match':'removeaccount', 'context': '\n    move-result-object v1\n\n    iget-object v2, v0, lcom/gigya/socialize/android/gswebbridge;->accountslistener:lcom/gigya/socialize/android/event/gsaccountseventlistener;\n\n    invoke-virtual {v1, v2}, lcom/gigya/socialize/android/gsapi;->removeaccountslistener(lcom/gigya/socialize/android/event/gsaccountseventlistener;)v\n\n   .line 132\n    invoke-static {}, lcom/gigya/socialize/android/gsapi;->getinstance()lcom/gigya/socialize/android/gsapi;\n\n    move-result-object v1'}

Strings XML:
[]

Task: Determine the likelihood (0-100%) that this code relates to user account
deletion. 
Respond in the following format where you replace the <likelihood> with your
determined likelihood and <reason> with a detailed explanation of your reasoning: 

"Likelihood of Account Deletion Functionality: <insert likelihood here>.
The reason for this is: <reason>"

