In [17]:
import os
import yaml
import pandas as pd
import json
from collections import Counter
from itertools import chain

# === CONFIG ===
# Substrings that mark a workflow step as emulator-related. They are compared
# against lower-cased text, and list order is the order in which matches are
# reported. Note that "emulator" is itself a substring of several longer
# entries, so those entries always co-occur with it.
EMULATOR_KEYWORDS = [
    "emulator",
    "avdmanager",
    "adb",
    "instrumentation",
    "start emulator",
    "create avd",
    "android-emulator",
    "run emulator",
    "wait-for-device",
]

# Root folder that is walked for YAML files, and the folder receiving all exports.
PROJECTS_DIR = r"C:\Users\Admin\OneDrive\Education\Master of Info - Thesis\Config Files"
OUTPUT_DIR = r"C:\GitHub\Android-Mobile-Apps"

# Create the export folder up front; an already existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === DATA STRUCTURES ===
# Module-level accumulators filled while scanning:
#   records       - one dict per matched step (exported as CSV)
#   file_groups   - one {'file', 'group'} dict per classified file
#   jsonl_matches - the matched-step dicts again (exported as JSON Lines)
records, file_groups, jsonl_matches = [], [], []

# === MAIN PARSER ===
def _find_keywords(text):
    """Return the EMULATOR_KEYWORDS contained in *text* (case-insensitive), in list order."""
    lowered = text.lower()
    return [keyword for keyword in EMULATOR_KEYWORDS if keyword in lowered]


def _collect_step_matches(file_path, job_name, steps, detection_method):
    """Record every mapping in *steps* that mentions an emulator keyword.

    Each hit is appended (as the same dict object) to ``records`` and
    ``jsonl_matches``. Non-dict entries are skipped.

    Returns True when at least one step matched.
    """
    found = False
    for step in steps:
        if not isinstance(step, dict):
            continue
        # The JSON text is only used for keyword search and for export, so
        # values PyYAML parsed into non-JSON types (dates, timestamps, ...)
        # are deliberately stringified instead of aborting the whole file.
        step_json = json.dumps(step, indent=2, default=str)
        keywords = _find_keywords(step_json)
        if not keywords:
            continue
        found = True
        match = {
            'file': file_path,
            'job': job_name,
            'step_name': step.get('name', ''),
            'matched_keywords': keywords,
            'detection_method': detection_method,
            'full_step_json': step_json
        }
        records.append(match)
        jsonl_matches.append(match)
    return found


def extract_yaml_steps(file_path):
    """Scan one YAML file for emulator-related steps and classify the file.

    Detection is attempted in this order:

    1. ``standard``: steps listed under each entry of a top-level ``jobs`` mapping.
    2. ``top_level_fallback``: a top-level ``steps`` list (only when ``jobs``
       is not a mapping).
    3. ``full_file_scan``: keyword search over the re-serialised document,
       used when the structured passes found nothing (including documents
       whose root is not a mapping).

    Matches are appended to the module-level ``records`` and ``jsonl_matches``
    lists. Exactly one entry is appended to ``file_groups`` per call, with one
    of these group labels:

    * ``'empty or invalid YAML'``      - the document loaded as a falsy value
    * ``'invalid job structure'``      - a job entry is not a mapping
    * ``'invalid steps structure'``    - a job's ``steps`` is not a list
    * ``'parsed successfully'``        - at least one keyword match was recorded
    * ``'no emulator keywords found'`` - the file parsed but nothing matched
    * ``'not parsed or empty'``        - reading or parsing raised an exception

    Args:
        file_path: Path of the YAML file to inspect.

    Returns:
        None. Results are communicated through the module-level lists.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = yaml.safe_load(f)

        if not content:
            file_groups.append({'file': file_path, 'group': 'empty or invalid YAML'})
            return

        parsed_ok = False
        # A YAML root may also be a list or a scalar; only mappings have keys.
        is_mapping = isinstance(content, dict)
        jobs = content.get('jobs', None) if is_mapping else None

        # === STANDARD FORMAT ===
        if isinstance(jobs, dict):
            for job_name, job_data in jobs.items():
                if not isinstance(job_data, dict):
                    file_groups.append({'file': file_path, 'group': 'invalid job structure'})
                    return

                steps = job_data.get('steps', [])
                if not isinstance(steps, list):
                    file_groups.append({'file': file_path, 'group': 'invalid steps structure'})
                    return

                if _collect_step_matches(file_path, job_name, steps, 'standard'):
                    parsed_ok = True

        # === FALLBACK FORMAT: TOP-LEVEL STEPS ===
        elif is_mapping:
            steps = content.get('steps', [])
            if isinstance(steps, list):
                if _collect_step_matches(file_path, 'top_level', steps, 'top_level_fallback'):
                    parsed_ok = True

        # === DEEP FALLBACK: FULL FILE SCAN ===
        if not parsed_ok:
            # Disable PyYAML's default line folding: a wrapped line could
            # otherwise split a multi-word keyword such as "start emulator"
            # across a line break and hide it from the substring search.
            flat_content = yaml.dump(content, width=float('inf'))
            keywords = _find_keywords(flat_content)
            if keywords:
                parsed_ok = True
                match = {
                    'file': file_path,
                    'job': 'full_file_scan',
                    'step_name': '',
                    'matched_keywords': keywords,
                    'detection_method': 'full_file_scan',
                    'full_step_json': 'Matched by full file scan'
                }
                records.append(match)
                jsonl_matches.append(match)

        # === CLASSIFY FILE ===
        # Files that parsed cleanly but contain no keyword get their own label
        # so they are not mistaken for parse failures downstream.
        group_label = (
            'parsed successfully' if parsed_ok else
            'no emulator keywords found'
        )
        file_groups.append({'file': file_path, 'group': group_label})

    except Exception:
        # Deliberate best-effort: an unreadable or malformed file (I/O error,
        # bad encoding, yaml.YAMLError, ...) must not stop the directory scan.
        file_groups.append({'file': file_path, 'group': 'not parsed or empty'})

# === DIRECTORY SCAN ===
def scan_directory(directory):
    """Recursively process every YAML file below *directory*.

    Each file whose name ends in ``.yml`` or ``.yaml`` (compared
    case-insensitively, so ``.YML`` is included) is passed to
    ``extract_yaml_steps``. Directories and files are visited in sorted
    order so repeated runs produce their output rows in the same order.

    Args:
        directory: Root folder to walk.
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers os.walk's descent into a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(('.yml', '.yaml')):
                extract_yaml_steps(os.path.join(root, file))

# === RUN ===
scan_directory(PROJECTS_DIR)

# === EXPORT MAIN CSV FILES ===
# Explicit column lists keep the CSV headers, and the column lookups further
# down, valid even when the scan produced no rows at all.
record_columns = [
    'file', 'job', 'step_name', 'matched_keywords', 'detection_method', 'full_step_json'
]
df_records = pd.DataFrame(records, columns=record_columns)
df_records.to_csv(os.path.join(OUTPUT_DIR, "emulator_steps_summary.csv"), index=False)
pd.DataFrame(file_groups, columns=['file', 'group']).to_csv(
    os.path.join(OUTPUT_DIR, "file_grouping_summary.csv"), index=False
)

# === EXPORT JSONL FILE ===
# One JSON object per line; non-ASCII characters are written as-is (UTF-8).
jsonl_path = os.path.join(OUTPUT_DIR, "matched_steps.jsonl")
with open(jsonl_path, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in jsonl_matches)

# === Save files with YAML errors ===
# Selects the files whose group label is exactly 'not parsed or empty'.
error_files = [f['file'] for f in file_groups if f['group'] == 'not parsed or empty']
pd.DataFrame({'error_file': error_files}).to_csv(os.path.join(OUTPUT_DIR, "yaml_parse_errors.csv"), index=False)

# === KEYWORD FREQUENCY BREAKDOWN BY DETECTION METHOD ===
# For every detection method, count how often each keyword appears across the
# 'matched_keywords' lists of its rows.
keyword_stats = []
for method, rows in df_records.groupby('detection_method', sort=False):
    counter = Counter(chain.from_iterable(rows['matched_keywords']))
    for keyword, count in counter.items():
        keyword_stats.append({
            'detection_method': method,
            'keyword': keyword,
            'count': count
        })

# The explicit columns let sort_values succeed on an empty result as well.
df_keyword_stats = pd.DataFrame(
    keyword_stats, columns=['detection_method', 'keyword', 'count']
).sort_values(by=['detection_method', 'count'], ascending=[True, False])
df_keyword_stats.to_csv(os.path.join(OUTPUT_DIR, "keyword_frequency_by_detection_method.csv"), index=False)

# === LOG SUMMARY ===
print(f"\n✅ Extracted {len(records)} emulator-related steps.")
print("📄 Steps saved to: emulator_steps_summary.csv")
print("📂 File grouping saved to: file_grouping_summary.csv")
print("🧾 JSON matches saved to: matched_steps.jsonl")
print("📉 Keyword stats saved to: keyword_frequency_by_detection_method.csv")
print("⚠️ YAML parse errors saved to: yaml_parse_errors.csv")



✅ Extracted 471 emulator-related steps.
📄 Steps saved to: emulator_steps_summary.csv
📂 File grouping saved to: file_grouping_summary.csv
🧾 JSON matches saved to: matched_steps.jsonl
📉 Keyword stats saved to: keyword_frequency_by_detection_method.csv
⚠️ YAML parse errors saved to: yaml_parse_errors.csv
