In [None]:
import pandas as pd
from collections import Counter

# Load the coded documents. The "issue" and "solution" columns each hold a
# comma-separated list of codes per row.
file_path = './target/mapped.csv'
df = pd.read_csv(file_path)


def _explode_codes(column):
    """Flatten a column of comma-separated codes into one normalized code per element.

    Rows with a missing value are dropped. Each code is lower-cased and stripped
    of surrounding whitespace. Blank tokens (produced by input such as "a,,b" or
    a trailing comma) are discarded so that they are not tallied as an
    empty-string code.

    Parameters:
        column: pandas Series of comma-separated code strings.

    Returns:
        pandas Series with one non-empty, normalized code per element.
    """
    codes = column.dropna().str.lower().str.split(',').explode().str.strip()
    return codes[codes != '']


# Each column is processed independently (the two columns are not merged).
issues = _explode_codes(df['issue'])
solutions = _explode_codes(df['solution'])

# Number of occurrences of every distinct code across all rows.
issue_counts = Counter(issues)
solution_counts = Counter(solutions)

# Frequency

In [None]:
# Issue taxonomy table; the columns read below are "Code", "Sub" and "Theme".
issues_tax = pd.read_csv('./issue_theme_summary.csv')
issue_sub_counts, issue_theme_counts = Counter(), Counter()

for issue, count in issue_counts.items():
    # Whitespace- and case-insensitive comparison of taxonomy codes with this code.
    code_matches = issues_tax["Code"].str.strip().str.lower() == issue.lower()
    issue_mapping = issues_tax[code_matches]
    sub_theme = issue_mapping["Sub"].values.tolist()
    if sub_theme:
        # The first matching taxonomy row receives every occurrence of the code.
        first_theme = issue_mapping["Theme"].str.strip().values.tolist()[0]
        issue_sub_counts[sub_theme[0]] += count
        issue_theme_counts[first_theme] += count
    else:
        print(f">> Unknown issue: {issue}")

# Report both tallies, most frequent category first.
for heading, tally in (
    ("\nIssue Sub-Theme Frequencies:", issue_sub_counts),
    ("\nIssue Theme Frequencies:", issue_theme_counts),
):
    print(heading)
    for category, count in tally.most_common():
        print(f"{category}: {count}")

In [None]:
# Solution taxonomy table; the columns read below are "Code", "Sub" and "Theme".
solution_tax = pd.read_csv('./solution_theme_summary.csv')
solution_sub_counts, solution_theme_counts = Counter(), Counter()

for solution, count in solution_counts.items():
    # Whitespace- and case-insensitive comparison of taxonomy codes with this code.
    code_matches = solution_tax["Code"].str.strip().str.lower() == solution.lower()
    solution_mapping = solution_tax[code_matches]
    sub_theme = solution_mapping["Sub"].values.tolist()
    if sub_theme:
        # The first matching taxonomy row receives every occurrence of the code.
        first_theme = solution_mapping["Theme"].str.strip().values.tolist()[0]
        solution_sub_counts[sub_theme[0]] += count
        solution_theme_counts[first_theme] += count
    else:
        print(f">> Unknown solution: {solution}")

# Report both tallies, most frequent category first.
for heading, tally in (
    ("\nsolution Sub-Theme Frequencies:", solution_sub_counts),
    ("\nsolution Theme Frequencies:", solution_theme_counts),
):
    print(heading)
    for category, count in tally.most_common():
        print(f"{category}: {count}")


# Coverage

In [None]:
# Issue taxonomy table; the columns read below are "Code", "Sub" and "Theme".
issues_tax = pd.read_csv('./issue_theme_summary.csv')

# Dictionaries keyed by the stripped, lower-cased code for per-document lookups.
normalized_issue_codes = issues_tax.Code.str.strip().str.lower()
issue_to_sub = pd.Series(issues_tax.Sub.values, index=normalized_issue_codes).to_dict()
issue_to_theme = pd.Series(
    issues_tax.Theme.str.strip().values, index=normalized_issue_codes
).to_dict()

# Tallies of how many documents mention each category at least once.
sub_theme_coverage = Counter()
theme_coverage = Counter()

# Only rows that have an "issue" value take part in the tally.
docs_with_issues = df.dropna(subset=['issue'])
# NOTE(review): the denominator is a hard-coded 753 rather than a value derived
# from the data (e.g. len(docs_with_issues) or len(df)) — confirm which document
# total is intended.
total_docs = 753

for issue_cell in docs_with_issues['issue']:
    issues_in_doc = [token.strip().lower() for token in issue_cell.split(',') if token.strip()]

    # Codes that are absent from the lookup (or map to an empty value) are skipped.
    mapped_subs = (issue_to_sub.get(code) for code in issues_in_doc)
    mapped_themes = (issue_to_theme.get(code) for code in issues_in_doc)
    doc_sub_themes = {sub for sub in mapped_subs if sub}
    doc_themes = {theme for theme in mapped_themes if theme}

    # Sets ensure a category is counted once per document, however many codes hit it.
    sub_theme_coverage.update(doc_sub_themes)
    theme_coverage.update(doc_themes)

print(f"\nTotal documents with issues: {total_docs}")

# Report both tallies with their share of total_docs, largest first.
for heading, coverage in (
    ("\nIssue Sub-Theme Coverage:", sub_theme_coverage),
    ("\nIssue Theme Coverage:", theme_coverage),
):
    print(heading)
    for category, count in coverage.most_common():
        percentage = (count / total_docs) * 100
        print(f"{category}: {count} ({percentage:.2f}%)")

In [None]:
# Solution taxonomy table; the columns read below are "Code", "Sub" and "Theme".
solution_tax = pd.read_csv('./solution_theme_summary.csv')

# Dictionaries keyed by the stripped, lower-cased code for per-document lookups.
normalized_solution_codes = solution_tax.Code.str.strip().str.lower()
solution_to_sub = pd.Series(solution_tax.Sub.values, index=normalized_solution_codes).to_dict()
solution_to_theme = pd.Series(
    solution_tax.Theme.str.strip().values, index=normalized_solution_codes
).to_dict()

# Tallies of how many documents mention each category at least once.
solution_sub_theme_coverage = Counter()
solution_theme_coverage = Counter()

# Only rows that have a "solution" value take part in the tally.
docs_with_solutions = df.dropna(subset=['solution'])
# NOTE(review): the denominator is a hard-coded 753 rather than a value derived
# from the data (e.g. len(docs_with_solutions) or len(df)) — confirm which
# document total is intended.
total_docs = 753

for solution_cell in docs_with_solutions['solution']:
    solutions_in_doc = [
        token.strip().lower() for token in solution_cell.split(',') if token.strip()
    ]

    # Codes that are absent from the lookup (or map to an empty value) are skipped.
    mapped_subs = (solution_to_sub.get(code) for code in solutions_in_doc)
    mapped_themes = (solution_to_theme.get(code) for code in solutions_in_doc)
    doc_sub_themes = {sub for sub in mapped_subs if sub}
    doc_themes = {theme for theme in mapped_themes if theme}

    # Sets ensure a category is counted once per document, however many codes hit it.
    solution_sub_theme_coverage.update(doc_sub_themes)
    solution_theme_coverage.update(doc_themes)

print(f"\nTotal documents with solutions: {total_docs}")

# Report both tallies with their share of total_docs, largest first.
for heading, coverage in (
    ("\nSolution Sub-Theme Coverage:", solution_sub_theme_coverage),
    ("\nSolution Theme Coverage:", solution_theme_coverage),
):
    print(heading)
    for category, count in coverage.most_common():
        percentage = (count / total_docs) * 100
        print(f"{category}: {count} ({percentage:.2f}%)")

In [None]:
# Plotting libraries used by the heatmap at the end of this cell.
import matplotlib.pyplot as plt
import seaborn as sns

# Re-read the coded documents. The "issue" and "solution" columns each hold a
# comma-separated list of codes per row.
file_path = './target/mapped.csv'
df = pd.read_csv(file_path)


def _split_codes(cell):
    """Split a comma-separated cell into stripped, lower-cased, non-empty codes."""
    return [code.strip().lower() for code in cell.split(',') if code.strip()]


def _first_sub_by_code(taxonomy):
    """Map each normalized taxonomy code to the "Sub" value of its first row.

    Codes are stripped and lower-cased. When a code appears on several rows the
    first row wins, matching a "filter the table, take element 0" lookup.
    """
    normalized_codes = taxonomy["Code"].str.strip().str.lower()
    sub_by_code = {}
    for code, sub in zip(normalized_codes, taxonomy["Sub"].values.tolist()):
        sub_by_code.setdefault(code, sub)
    return sub_by_code


# Every (issue, solution) combination that co-occurs within a single row.
# Rows missing either column are ignored.
issue_solution_pairs = []
for _, row in df.dropna(subset=['issue', 'solution']).iterrows():
    row_issues = _split_codes(row['issue'])
    row_solutions = _split_codes(row['solution'])
    for issue in row_issues:
        for solution in row_solutions:
            issue_solution_pairs.append((issue, solution))

# Number of rows-worth of co-occurrences for each distinct pair.
pair_counts = Counter(issue_solution_pairs)

# Build the code -> sub-theme lookups once instead of re-filtering the taxonomy
# tables for every pair. issues_tax / solution_tax are the taxonomy tables
# loaded by earlier cells.
issue_sub_by_code = _first_sub_by_code(issues_tax)
solution_sub_by_code = _first_sub_by_code(solution_tax)

data_rows = []

print("from_code,to_code,issue_sub_theme,solution_sub_theme,count")
for (issue, solution), count in pair_counts.items():
    # Codes that are missing from the taxonomy are grouped under "Unknown".
    issue_sub = issue_sub_by_code.get(issue, "Unknown")
    solution_sub = solution_sub_by_code.get(solution, "Unknown")

    print(f"{issue},{solution},{issue_sub},{solution_sub},{count}")
    data_rows.append({
        "issue": issue,
        "solution": solution,
        "issue_sub_theme": issue_sub,
        "solution_sub_theme": solution_sub,
        "count": count
    })

# One row per distinct (issue code, solution code) pair.
pairs_df = pd.DataFrame(data_rows)

print(pairs_df.head())
print(f"Total rows: {len(pairs_df)}")

# Keep only the sub-theme columns, then total the counts of all code pairs that
# fall into the same (issue sub-theme, solution sub-theme) combination.
pairs_df = pairs_df.drop(columns=['issue', 'solution'])
sub_theme_pair_counts = (
    pairs_df
    .groupby(['issue_sub_theme', 'solution_sub_theme'], as_index=False)['count']
    .sum()
)

# Shorten the axis labels by removing the word "concerns" (case-insensitive).
issue_subs = (
    sub_theme_pair_counts['issue_sub_theme'].str.replace('concerns', '', case=False).str.strip()
)
solution_subs = (
    sub_theme_pair_counts['solution_sub_theme'].str.replace('concerns', '', case=False).str.strip()
)

# Rows are solution sub-themes, columns are issue sub-themes. aggfunc='sum' is
# required: the pivot_table default ('mean') would average the per-code-pair
# counts instead of totalling the co-occurrences.
pivot_table = sub_theme_pair_counts.pivot_table(
    index=solution_subs,
    columns=issue_subs,
    values='count',
    aggfunc='sum',
    fill_value=0
)

# Express each row (solution sub-theme) as percentages that sum to 100.
pivot_table_percent = pivot_table.div(pivot_table.sum(axis=1), axis=0) * 100

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pivot_table_percent, annot=True, fmt=".1f", cmap="crest",
    linewidth=.5,
    square=True,
    ax=ax,
)
ax.set_xlabel('Issue Sub-Themes')
ax.set_ylabel('Solution Sub-Themes')
ax.set_title('Heatmap of Issue vs Solution Sub-Themes')

# Leave room for the rotated column labels.
fig.subplots_adjust(bottom=0.4)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.savefig('./target/issue_solution_heatmap.pdf', bbox_inches='tight', pad_inches=0)
plt.show()