# Cucumber Steps Analysis Notebook

## Regex + Fuzzy Matcher of SDK Implemented Steps Against Actual Feature Steps

### Input - use the `make display-...` commands in each SDK
### Final outputs look like this [Google spreadsheet](https://docs.google.com/spreadsheets/d/1Szfvw6_OV0cTz-IEN9bTop5ZW2i472D-pmIHyPVJing/edit#gid=737750452)

In [None]:
# also need to install the various Jupyter dependencies and pandas

%pip install gherkin-official
%pip install fuzzywuzzy
%pip install pandas

# needed for faster fuzzywuzzy:
%pip install python-Levenshtein

In [None]:
from dataclasses import dataclass
from functools import reduce
from fuzzywuzzy import fuzz, process as fzp
import gherkin
import gherkin.parser
import pandas as pd
from pathlib import Path

## Step 1: parse all Gherkin features

In [None]:
def get_features_df(features_root=None):
    """Build a DataFrame listing every unit and integration ``.feature`` file.

    Args:
        features_root: Directory holding the ``unit`` and ``integration``
            sub-directories. Defaults to ``<parent of cwd>/features``, the
            location this notebook reads from when called without arguments.

    Returns:
        A DataFrame indexed by ``feature`` (the file name) with the columns
        ``test_type`` (``"unit"`` or ``"integration"``) and ``feature_path``
        (a ``Path``). When no feature files are found the frame is empty but
        still carries both columns.
    """
    if features_root is None:
        root = Path.cwd().parent / "features"
    else:
        root = Path(features_root)

    rows = [
        {"feature": path.name, "test_type": test_type, "feature_path": path}
        for test_type in ("unit", "integration")
        # Both directories use the same "*.feature" pattern; "*feature" would
        # also pick up names that merely end in "feature" without the dot.
        # Sorting keeps the row order independent of filesystem listing order.
        for path in sorted((root / test_type).glob("*.feature"))
    ]
    # Explicit columns keep the frame well-formed even when `rows` is empty.
    feat_df = pd.DataFrame(rows, columns=["feature", "test_type", "feature_path"])
    return feat_df.set_index("feature")

In [None]:
feat_df = get_features_df()
feat_df

In [None]:
@dataclass(frozen=True, order=True)
class Step:
    """One Gherkin step, identified by its keyword and its text.

    Instances are immutable, hashable and orderable (by ``keyword`` first,
    then ``text``), which lets them be de-duplicated with ``set`` and sorted.
    """

    # The step keyword as provided by the parser.
    keyword: str
    # The step sentence that follows the keyword.
    text: str

def parse_features(feature_path):
    """Parse one Gherkin ``.feature`` file.

    Args:
        feature_path: Location of the feature file (anything ``open`` accepts).

    Returns:
        Whatever ``gherkin.parser.Parser.parse`` returns for the file's text;
        the rest of this notebook walks it as nested dicts and lists.
    """
    # Decode explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale and can mangle non-ASCII step text.
    with open(feature_path, encoding="utf-8") as f:
        feature_info = gherkin.token_scanner.TokenScanner(f.read())
    return gherkin.parser.Parser().parse(feature_info)

def extract_all(key: str, d: dict, append_method="extend") -> list:
    """Collect every value stored under ``key`` anywhere inside ``d``.

    The structure is walked depth-first in insertion order. Dict values are
    descended into unless they sit under ``key`` itself; list values are
    always scanned and each dict element is descended into.

    Args:
        key: Dictionary key to look for at every nesting level.
        d: Root dictionary to walk.
        append_method: ``"extend"`` splices each found value into the result
            (suited to list values); any other value appends it as one item.

    Returns:
        The collected values, in the order they were encountered.
    """
    found = []
    collect = found.extend if append_method == "extend" else found.append

    def walk(node):
        if key in node:
            collect(node[key])
        for name, child in node.items():
            if isinstance(child, dict):
                # A dict stored directly under `key` is collected above but
                # not searched any further.
                if name != key:
                    walk(child)
            elif isinstance(child, list):
                for item in child:
                    if isinstance(item, dict):
                        walk(item)

    walk(d)
    return found

def uniq(s: list):
    """Return the distinct elements of ``s`` as a sorted list."""
    distinct = set(s)
    return sorted(distinct)

def extract_tags(feature_info: dict) -> list:
    """Return the sorted, de-duplicated tag names found in a parsed feature."""
    tag_names = [entry["name"] for entry in extract_all("tags", feature_info)]
    return uniq(tag_names)

def extract_steps(feature_info: dict) -> list:
    """Return the sorted, de-duplicated ``Step`` objects of a parsed feature."""
    found = [
        Step(entry["keyword"], entry["text"])
        for entry in extract_all("steps", feature_info)
    ]
    return uniq(found)

#### Example: extract cucumber info for `c2c.feature`

In [None]:
feature="c2c.feature"
feat_df = get_features_df()
eg_feature = feat_df[feat_df.index == feature].feature_path.iloc[0]

feature_info = parse_features(eg_feature)
tags = extract_tags(feature_info)
steps = extract_steps(feature_info)

tags, steps, feature_info

### All the Tags

In [None]:
def append_tags(feat_df):
    """Add a ``tags`` column to ``feat_df`` in place.

    Each row's ``feature_path`` is parsed and the tag names found in that
    feature are stored as a list. Nothing is returned; the caller's frame is
    modified.
    """
    feat_df["tags"] = feat_df.feature_path.apply(
        lambda path: extract_tags(parse_features(path))
    )

tags_df = get_features_df()
append_tags(tags_df)

tags_df[['test_type', 'tags']]

In [None]:
# Flatten the per-feature tag lists into one list, then de-duplicate and sort.
all_tags = uniq([tag for feature_tags in tags_df.tags for tag in feature_tags])
print(all_tags)

### All the Steps

In [None]:
def get_step2feat(feat_df):
    """Expand a feature listing into one row per (feature, distinct step).

    Args:
        feat_df: Frame indexed by ``feature`` with a ``feature_path`` column,
            as produced by ``get_features_df``.

    Returns:
        ``feat_df`` left-joined on ``feature`` with two extra columns:
        ``step`` (the step text) and ``gwt`` (the step keyword). A feature
        without steps keeps a single row with missing ``step``/``gwt``. An
        empty ``feat_df`` yields an empty frame that still has both columns.
    """
    flat = feat_df.reset_index()

    # Build the records in a single pass instead of repeatedly concatenating
    # lists, which copies the accumulated list on every feature.
    records = [
        {"feature": feature, "step": step.text, "gwt": step.keyword}
        for feature, path in zip(flat["feature"], flat["feature_path"])
        for step in extract_steps(parse_features(path))
    ]
    # Explicit columns keep the frame well-formed when there are no steps.
    steps = pd.DataFrame(records, columns=["feature", "step", "gwt"])

    return flat.set_index("feature").join(steps.set_index("feature"))

def fill_step_templates(steps_df):
    """Add a ``filled_step`` column with ``<placeholder>`` slots filled in.

    A quoted placeholder (``"<name>"``) becomes ``"hello"`` and any remaining
    placeholder (``<name>``) becomes ``42``, so that a scenario-outline step
    looks like a concrete step.

    Args:
        steps_df: Frame with a ``step`` column; it is modified in place.

    Returns:
        The same ``steps_df`` object, now carrying ``filled_step``.
    """
    placeholder_fills = [
        (r'"<[^>]*>"', '"hello"'),
        (r'<[^(][^>]*>', '42'),
    ]
    patterns = [pattern for pattern, _ in placeholder_fills]
    fills = [fill for _, fill in placeholder_fills]
    steps_df["filled_step"] = steps_df.step.replace(patterns, fills, regex=True)
    return steps_df

In [None]:
all_steps = get_step2feat(get_features_df())
all_steps = fill_step_templates(all_steps)
all_steps

In [None]:
# Count how many rows share each filled step, most frequent first.
filled_steps = (
    all_steps.groupby(by="filled_step")
    .count()[["step"]]
    .rename(columns={"step": "count"})
    .sort_values(by="count", ascending=False)
    .reset_index()
)
filled_steps

In [None]:
# filled_steps.to_clipboard()

## Step 2: parse the SDK-steps summaries into Pandas

## 2A) Java

### e.g., created in the Java SDK via 
```sh
make display-all-java-steps
```

### i.e.:
```sh
find . 2>/dev/null | xargs grep "io.cucumber.java.en" 2>/dev/null | grep -v Binary | cut -d: -f1 | sort | uniq | xargs grep -E "@(Given|Then|When)"
```

#### `java_source2step`

In [None]:
# keeping a small sample for illustrative purposes:

java_source2step = """./src/test/java/com/algorand/algosdk/cucumber/shared/TransactionSteps.java:    @When("I build an application transaction with operation {string}, application-id {long}, sender {string}, approval-program {string}, clear-program {string}, global-bytes {long}, global-ints {long}, local-bytes {long}, local-ints {long}, app-args {string}, foreign-apps {string}, foreign-assets {string}, app-accounts {string}, fee {long}, first-valid {long}, last-valid {long}, genesis-hash {string}, extra-pages {long}")
./src/test/java/com/algorand/algosdk/integration/Stepdefs.java:    @Then("I do my part")"""

### `javasdk_df`

In [None]:
# Each grep output line is "<source path>:<indented annotation>". Split on the
# first colon only and strip the annotation, so the amount of indentation does
# not matter; blank lines (e.g. a trailing newline in the paste) are skipped.
java_source_and_step = [
    [source, raw_step.strip()]
    for source, raw_step in (
        s2s.split(":", 1) for s2s in java_source2step.split("\n") if s2s.strip()
    )
]
javasdk_df = pd.DataFrame(
    data=[{"source": source, "raw_step": raw_step} for source, raw_step in java_source_and_step]
)
# gwt = annotation name (Given/When/Then); step = text of the first quoted string.
javasdk_df[["gwt","step"]] = javasdk_df.raw_step.str.extract(r'@([^(]*)."([^"]*)"')
javasdk_df

### Rexify - `javasdk_rex_df`

In [None]:
# Turn each step expression into a regex. Order matters (dicts keep insertion
# order): literal parentheses are escaped first, so the capture groups
# inserted afterwards are left intact. Escaping only ")" would leave a literal
# "(" to open a group that is never closed, which is an invalid regex.
# Raw strings avoid the invalid "\)" / "\d" escape sequences.
subs = {
    '(': r'\(',
    ')': r'\)',
    '{string}': '"([^"]*)"',
    '{int}': r'(\d+)',
    '{long}': r'(\d+)',
    '{biginteger}': r'(\d+)',
}
javasdk_rex_df = javasdk_df.copy()
for k, v in subs.items():
    # regex=False: keys are literal text and values are inserted verbatim.
    javasdk_rex_df["step"] = javasdk_rex_df.step.str.replace(k, v, regex=False)
javasdk_rex_df

In [None]:
# javasdk_rex_df.to_clipboard()

## 2B) Python

### e.g., created in the Python SDK via
```sh
make display-all-python-steps
```

### i.e.:
```sh
find tests/steps -name "*.py" | xargs grep "behave" 2>/dev/null | cut -d: -f1 | sort | uniq | xargs awk "/@(given|step|then|when)/,/[)]/" | grep -E "(\".+\"|\'.+\')"
```

In [None]:
# keeping a small sample for illustrative purposes:

py_steps = """    'we make an Account Information call against account "{account}" with exclude "{exclude:MaybeString}"'
@when('we make an Account Information call against account "{account}"')
    'we make a Lookup Account by ID call against account "{account}" with round {block}'
    'we make a Lookup Account by ID call against account "{account}" with exclude "{exclude:MaybeString}"'
@when("we make any LookupAccountByID call")
@then('the parsed LookupAccountByID response should have address "{address}"')
@when("we make any Account Information call")
    'the parsed Account Information response should have address "{address}"'
@then("I do my part")"""


### `pysdk_df` - Unprocessed Steps

In [None]:
# One row per line of the pasted output.
pysdk_df = pd.DataFrame(data=[{"raw_step": line} for line in py_steps.split("\n")])
# Decorator lines: "1" = decorator name, "2" = the quoted step string. The "."
# on either side of the second group consumes one character each (the "(" and
# ")" in the sample above). Lines that are not decorators get NaN in both columns.
pysdk_df[["1", "2"]] = pysdk_df.raw_step.str.extract(r'@(given|step|then|when).(.*).')
# Continuation lines (no decorator match) drop their first 4 characters (the
# indentation in the sample above); then [1:-1] removes the surrounding quotes
# in either case.
pysdk_df["step"] = pysdk_df.apply(lambda row: (row["raw_step"][4:] if pd.isnull(row["2"]) else row["2"])[1:-1], axis=1)
# The helper columns are no longer needed.
pysdk_df = pysdk_df.drop(["1", "2"], axis=1)
pysdk_df

### Rexify - `pysdk_rex_df`

In [None]:
# Regex pattern -> replacement: a quoted "{param}" becomes a quoted-string
# capture group, any remaining {param} becomes a digits capture group.
subs = {
    r'"\{[^}]*\}"': '"([^"]*)"',
    r'\{[^(][^}]*\}': '([0-9]+)',
}
pysdk_rex_df = pysdk_df.assign(
    step=pysdk_df.step.replace(list(subs.keys()), list(subs.values()), regex=True)
)
pysdk_rex_df

In [None]:
# pysdk_rex_df.to_clipboard()

## 2C) Go

### e.g., created in the Go SDK via
```sh
make display-all-go-steps
```

### i.e.:
```sh
find test -name "*.go" | xargs grep "github.com/cucumber/godog" 2>/dev/null | cut -d: -f1 | sort | uniq | xargs grep -Eo "Step[(].[^\`]+" | awk '{sub(/:Step\(./,":")} 1' | sed -E 's/", [a-zA-Z0-9]+\)//g'
```

In [None]:
# keeping a small sample for illustrative purposes:

go_source2step ="""test/algodclientv2_test.go:^mock http responses in "([^"]*)" loaded from "([^"]*)"$
test/algodclientv2_test.go:^expect error string to contain "([^"]*)"$
test/algodclientv2_test.go:^we make any Pending Transaction Information call$
test/algodclientv2_test.go:^the parsed Pending Transaction Information response should have sender "([^"]*)"$
test/steps_test.go:I do my part"""

### `gosdk_df` - already Rexify'ed

In [None]:
# Each line is "<source path>:<step regex>". Split on the FIRST colon only, so
# a colon inside the regex itself (e.g. a "(?:...)" group) does not truncate
# the step; blank lines (e.g. a trailing newline in the paste) are skipped.
go_source_and_step = [
    s2s.split(":", 1) for s2s in go_source2step.split("\n") if s2s.strip()
]
gosdk_df = pd.DataFrame(
    data=[{"source": source, "step": step} for source, step in go_source_and_step]
)
gosdk_df

In [None]:
# gosdk_df.to_clipboard()

## 2D) Javascript

### e.g., created in the Javascript SDK via
```sh
make display-all-js-steps
```

### i.e.:
```sh
tail -n +135 tests/cucumber/steps/steps.js | grep -v '^ *//' | awk "/(Given|Then|When)/,/',/" | grep -E "\'.+\'"  | sed "s/^[^']*'\([^']*\)'.*/\1/g"
```

#### `js_steps`

In [None]:
# keeping a small sample for illustrative purposes:

js_steps = """an algod client
a kmd client
an algod v2 client
wallet information
I get versions with algod
v1 should be in the versions
I get versions with kmd
I get the status
I get status after this block
I can get the block info
payment transaction parameters {int} {int} {int} {string} {string} {string} {int} {string} {string}
mnemonic for private key {string}
we expect the path used to be {string}"""

### `jssdk_df` - Unprocessed

In [None]:
# One row per pasted line, in a single "step" column.
jssdk_df = pd.DataFrame({"step": js_steps.split("\n")})
jssdk_df

In [None]:
# jssdk_df.to_clipboard()

### Rexify - `jssdk_rex_df`

In [None]:
# Turn each step expression into a regex. Order matters (dicts keep insertion
# order): literal parentheses are escaped first, so the capture groups
# inserted afterwards are left intact. Escaping only ")" would leave a literal
# "(" to open a group that is never closed, which is an invalid regex.
# Raw strings avoid the invalid "\)" / "\d" escape sequences.
subs = {
    '(': r'\(',
    ')': r'\)',
    '{string}': '"([^"]*)"',
    '{int}': r'(\d+)',
}
jssdk_rex_df = jssdk_df.copy()
for k, v in subs.items():
    # regex=False: keys are literal text and values are inserted verbatim.
    jssdk_rex_df["step"] = jssdk_rex_df.step.str.replace(k, v, regex=False)
jssdk_rex_df

In [None]:
# jssdk_rex_df.to_clipboard()

## Step 3) simulate Cucumber regex matcher

In [None]:
def match_stepre2cucumber(step_re, cuke_df):
    """Return the first feature step that ``step_re`` matches, else ``None``.

    Matching uses ``Series.str.match``, so the pattern is anchored at the
    start of the step only; a pattern without a trailing ``$`` also accepts
    longer steps that merely begin with it.

    Args:
        step_re: Regular expression for one SDK step definition.
        cuke_df: Frame with a ``filled_step`` column of concrete step texts.

    Returns:
        The ``filled_step`` value of the first matching row, or ``None``.
    """
    is_hit = cuke_df.filled_step.str.match(step_re)
    hits = cuke_df.loc[is_hit, "filled_step"]
    if hits.empty:
        return None
    return hits.iloc[0]

# Smoke test: take one rexified JS step (row 2) and look it up among the
# distinct filled feature steps.
jss = jssdk_rex_df.iloc[2]
jsm = match_stepre2cucumber(jss.step, filled_steps)

# `{name=}` prints both the expression and its value.
print(f"""{jss=}
{jsm=}""")

In [None]:
def match_sdk2cucumber(sdk_rex_df, cuke_df, discard_matched=True):
    """Annotate each SDK step regex with the first feature step it matches.

    Args:
        sdk_rex_df: Frame whose ``step`` column holds one regex per SDK step.
        cuke_df: Frame with a ``filled_step`` column to match against.
        discard_matched: When true (the default) only the rows that matched
            nothing are returned; otherwise all rows are returned.

    Returns:
        A copy of ``sdk_rex_df`` with an added ``cuke`` column holding the
        matched feature step, or a missing value when nothing matched.
    """
    def first_hit(step_re):
        return match_stepre2cucumber(step_re, cuke_df)

    annotated = sdk_rex_df.copy()
    annotated["cuke"] = annotated.step.apply(first_hit)
    if not discard_matched:
        return annotated
    return annotated[annotated.cuke.isna()]

### 3A) `javasdk_rex_unmatched`

In [None]:
javasdk_rex_unmatched = match_sdk2cucumber(javasdk_rex_df, filled_steps)
javasdk_rex_unmatched

### 3B) `pysdk_rex_unmatched`

In [None]:
pysdk_rex_unmatched = match_sdk2cucumber(pysdk_rex_df, filled_steps)
pysdk_rex_unmatched

### 3C) `gosdk_rex_unmatched`

In [None]:
gosdk_rex_unmatched = match_sdk2cucumber(gosdk_df, filled_steps)
gosdk_rex_unmatched

### 3D) `jssdk_rex_unmatched`

In [None]:
jssdk_rex_unmatched = match_sdk2cucumber(jssdk_rex_df, filled_steps)
jssdk_rex_unmatched

In [None]:
print(f"""
--- AFTER APPLYING REGEX MATCHING, WE HAVE THE FOLLOWING COUNTS OF UNMATCHED STEPS. APPLY FUZZY MATCHER IF ANY REMAIN:
* {len(javasdk_rex_unmatched)=}
* {len(pysdk_rex_unmatched)=}
* {len(gosdk_rex_unmatched)=}
* {len(jssdk_rex_unmatched)=}
""")

## Step 4) fuzzy match remaining SDK steps against features

### NOTE: if an SDK's `*_rex_unmatched` is empty, the fuzzy matcher has nothing to match and may fail. In that case, there are probably _NO UNUSED STEPS_

In [None]:
def fuzz_step_v_df(step, df, total_scorer=True):
    """Find the entry of ``df.step`` that is most similar to ``step``.

    Args:
        step: The string to look up.
        df: Frame whose ``step`` column holds the candidate strings.
        total_scorer: Use ``fuzz.ratio`` when true, ``fuzz.partial_ratio``
            otherwise.

    Returns:
        The result of ``process.extractOne`` -- presumably a
        ``(match, score, key)`` triple, since the candidates are passed as a
        Series; confirm against the fuzzywuzzy documentation.
    """
    if total_scorer:
        scorer = fuzz.ratio
    else:
        scorer = fuzz.partial_ratio
    return fzp.extractOne(step, df.step, scorer=scorer)

def fuzzing_algo(samples_df, scoring_df):
    """Fuzzy-match every sample step against the known steps, in two passes.

    Pass 1 scores with the total scorer and stores the best candidate in
    ``total_match`` / ``total_score`` / ``total_idx``; rows scoring 100 are
    dropped. Pass 2 scores the remaining rows with the partial scorer and
    stores the result in ``match`` / ``score`` / ``idx``.

    Args:
        samples_df: "unknown" strings to score (uses the ``step`` column).
        scoring_df: "universe" of known strings to search for the best match.

    Returns:
        The rows of ``samples_df`` whose total score is below 100, extended
        with the six result columns. When there is nothing to score, the
        result is empty but still carries all six columns, so callers can
        sort or filter on them without special-casing.
    """
    def matcher(left, total_scorer: bool):
        cols = ["match", "score", "idx"]
        if total_scorer:
            cols = [f"total_{col}" for col in cols]

        # With no rows, apply() yields no result columns at all; add them
        # explicitly so the frame keeps the same shape as a non-empty result.
        if left.empty:
            return left.reindex(columns=[*left.columns, *cols])

        def fuzzer(row):
            return fuzz_step_v_df(row.step, scoring_df, total_scorer=total_scorer)

        # result_type='expand' spreads the returned tuple over columns 0, 1, 2,
        # which are then renamed to `cols`.
        msi = left.apply(fuzzer, axis=1, result_type='expand').rename(columns=dict(enumerate(cols)))
        return pd.concat([left, msi], axis=1)

    res = matcher(samples_df, total_scorer=True)
    # A total score of 100 needs no second look.
    res = res[res.total_score < 100]

    res = matcher(res, total_scorer=False)
    return res

### 4A) fuzzy logic against javasdk

In [None]:
java_remainder = fuzzing_algo(javasdk_rex_unmatched, all_steps)
java_remainder = java_remainder.sort_values(by='score')
java_remainder

In [None]:
# java_remainder.to_clipboard()

### 4B) fuzzy logic against pysdk

In [None]:
py_remainder = fuzzing_algo(pysdk_rex_unmatched, all_steps)
py_remainder = py_remainder.sort_values(by='score')
py_remainder

In [None]:
# py_remainder.to_clipboard()

### 4C) fuzzy logic against gosdk

In [None]:
go_remainder = fuzzing_algo(gosdk_rex_unmatched, all_steps)
go_remainder = go_remainder.sort_values(by='score')
go_remainder

In [None]:
# go_remainder.to_clipboard()

### 4D) fuzzy logic against jssdk

In [None]:
js_remainder = fuzzing_algo(jssdk_rex_unmatched, all_steps)
js_remainder = js_remainder.sort_values(by='score')
js_remainder

In [None]:
# js_remainder.to_clipboard()