In [None]:
from __future__ import annotations
from typing import List
import csv

class Group:
    """Timing record for one group of survey questions.

    Attributes:
        title: Name of the group, or ``None`` when it has none.
        total_time: Time value recorded for the whole group, stored as given.
        members: Values that belong to the group, kept as the strings they
            were appended as (presumably per-question timings — verify
            against the caller).
    """

    def __init__(self, title: str = None, total_time: float = None, members: List[str] = None):
        self.total_time = total_time
        self.title = title
        # Build a fresh list per instance instead of sharing a mutable default.
        self.members = members if members is not None else []

    @staticmethod
    def _as_number(raw) -> float:
        """Return ``raw`` as a float; blank or non-numeric values count as 0."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    def __str__(self):
        header = "Group time: "
        if self.title:
            header += f"{self.title}: "
        lines = [f"{header}{self.total_time}\n"]
        # Only list the members when they add up to more than zero. The values
        # are parsed as floats because timings may be fractional ("3.2");
        # int() would raise on those.
        if sum(self._as_number(member) for member in self.members) > 0:
            lines.append(f"Members: {', '.join(self.members)}\n")
        return "".join(lines)

class Time:
    """Total time of a reply together with its per-group timing records.

    Attributes:
        total_time: Overall time value, stored as given.
        groups: Group records in the order they were added.
    """

    def __init__(self, total_time: float = None, groups: List[Group] = None):
        self.total_time = total_time
        # Each instance gets its own list unless one is handed in.
        if groups is None:
            groups = []
        self.groups = groups

    def __str__(self):
        # Every group renders its own trailing newline, so plain concatenation
        # is enough.
        rendered_groups = "".join(map(str, self.groups))
        return f"Total time: {self.total_time}\nGroups:\n{rendered_groups}"

class Reply:
    """A single survey response parsed from one CSV row.

    The row is consumed from left to right: the first three sections of
    ``survey.index_dict`` (metadata, requirement scores, demographics) are
    read in that order, followed by one total-time column and then the
    per-group timing columns, which are recognised through ``survey.labels``.

    Attributes (each is only set when the survey declares its column):
        response_id, date_submitted, last_page, start_language, seed,
        date_started, date_last_action, referrer_url, tool, age, education,
        survey_count, experience, comments: column values, translated
            through the survey's answer codes where a code matches.
        scores: Raw values of the requirement columns, in column order.
        time: ``Time`` object with the total time and one ``Group`` per
            "Group time:" column.
    """

    # Metadata column name -> attribute that receives its value.
    _METADATA_ATTRIBUTES = {
        "Response ID": "response_id",
        "Date submitted": "date_submitted",
        "Last page": "last_page",
        "Start language": "start_language",
        "Seed": "seed",
        "Date started": "date_started",
        "Date last action": "date_last_action",
        "Referrer URL": "referrer_url",
    }

    # Demographics column name -> attribute that receives its value.
    _DEMOGRAPHIC_ATTRIBUTES = {
        "age": "age",
        "survey_count": "survey_count",
        "experience": "experience",
        "comments": "comments",
    }

    # Placeholder found in a choice column whose actual answer is stored in
    # the matching "<name>_other" column.
    _OTHER_CODE = "-oth-"

    def __init__(self, row, survey: Survey = None):
        """Parse ``row`` using the column layout described by ``survey``.

        Args:
            row: Sequence of cell values for one response.
            survey: Object providing ``index_dict`` (ordered sections),
                ``labels`` (one label per column of ``row``) and, optionally,
                ``answer_code_dict`` (column name -> {code: text}).

        Raises:
            IndexError: If ``row`` is shorter than the declared sections, or
                if a timing column appears before the first "Group time:"
                column.
        """
        # A survey without answer codes simply leaves all values untranslated.
        answer_codes = getattr(survey, "answer_code_dict", None) or {}
        sections = iter(survey.index_dict)
        index = 0

        # Section 1: metadata, optionally including "tool" / "tool_other".
        metadata_keys = survey.index_dict[next(sections)]
        index = self._read_section(
            row, index, metadata_keys, answer_codes,
            self._METADATA_ATTRIBUTES, "tool",
        )

        # Section 2: requirement scores, stored untranslated.
        score_columns = survey.index_dict[next(sections)]
        end = index + len(score_columns)
        self.scores = [row[column] for column in range(index, end)]
        index = end

        # Section 3: demographics, including "education" / "education_other".
        demographic_keys = [str(key) for key in survey.index_dict[next(sections)]]
        index = self._read_section(
            row, index, demographic_keys, answer_codes,
            self._DEMOGRAPHIC_ATTRIBUTES, "education",
        )

        # The column right after the demographics holds the total time.
        self.time = Time(total_time=row[index])
        index += 1

        # Remaining columns: a "Group time: <title>" column opens a new group,
        # every other column is attached to the most recently opened group.
        # NOTE(review): the last column of the row is never read — confirm
        # that skipping it is intended.
        for column in range(index, len(row) - 1):
            label = survey.labels[column]
            if label.startswith("Group time:"):
                title = label.replace("Group time: ", "")
                self.time.groups.append(Group(title=title, total_time=row[column]))
            else:
                self.time.groups[-1].members.append(row[column])

    def _read_section(self, row, start, keys, answer_codes, attribute_names, choice_name):
        """Store one section of ``row`` on the instance; return the next index.

        ``keys`` name the consecutive columns beginning at ``start``. A value
        is replaced by its text when ``answer_codes`` has an entry for the
        column and the code. Columns listed in ``attribute_names`` are stored
        under the mapped attribute. ``choice_name`` and
        ``choice_name + "_other"`` share one attribute: the first value is
        kept unless it is the "other" placeholder, in which case the next one
        overrides it. Any other column is skipped.
        """
        other_name = f"{choice_name}_other"
        for offset, key in enumerate(keys):
            value = row[start + offset]
            codes = answer_codes.get(key)
            if codes is not None and value in codes:
                value = codes[value]
            if key in attribute_names:
                setattr(self, attribute_names[key], value)
            elif key in (choice_name, other_name):
                current = getattr(self, choice_name, self._OTHER_CODE)
                if current == self._OTHER_CODE:
                    setattr(self, choice_name, value)
        return start + len(keys)

    def __str__(self):
        parts = [
            f"Reply: {self.response_id}\n",
            f"Date submitted: {self.date_submitted}\n",
            f"Last page: {self.last_page}\n",
            f"Start language: {self.start_language}\n",
            f"Seed: {self.seed}\n",
            f"Date started: {self.date_started}\n",
            f"Date last action: {self.date_last_action}\n",
            f"Referrer URL: {self.referrer_url}\n",
        ]
        # Only surveys that declare a "tool" column set this attribute.
        if hasattr(self, "tool"):
            parts.append(f"Tool: {self.tool}\n")
        parts += [
            f"Scores: {self.scores}\n",
            f"Age: {self.age}\n",
            f"Education: {self.education}\n",
            f"Survey count: {self.survey_count}\n",
            f"Experience: {self.experience}\n",
            f"Comments: {self.comments}\n",
            f"{self.time}\n",
        ]
        return "".join(parts)
# Metadata columns shared by every survey type, in CSV order.
_SURVEY_METADATA_COLUMNS = (
    "Response ID",
    "Date submitted",
    "Last page",
    "Start language",
    "Seed",
    "Date started",
    "Date last action",
    "Referrer URL",
)

# Demographics columns shared by every survey type, in CSV order.
_SURVEY_DEMOGRAPHIC_COLUMNS = (
    "age",
    "education",
    "education_other",
    "survey_count",
    "experience",
    "comments",
)

# Number of requirement score columns in every survey type.
_SURVEY_REQUIREMENT_COUNT = 65

# Column name -> {answer code: readable text}.
_SURVEY_ANSWER_CODES = {
    "tool": {
        "A1": "CADIMA",
        "A2": "Large Language Models (LLM) like ChatGPT, Bing",
        "A3": "Open Research Knowledge Graph (ORKG)",
        "A4": "Google Scholar",
        "A5": "Semantic Scholar",
        "A6": "Open Knowledge Maps (OKMaps)",
        "A7": "ConnectedPapers",
        "A8": "Python",
        "A9": "Natural Language Processing (NLP) libraries like NLTK, spaCy, FLAIR",
        "A10": "PDF miners like pdfminer, pypdf2",
        "A11": "Zotero",
        "A12": "LaTeX",
        "A13": "SciKGTeX",
        "A14": "Colandr",
        "A15": "Cochrane RevMan",
        "A16": "covidence",
        "A17": "rayyan",
        "A18": "Health Assessment Workspace Collaborative (HAWC) Project",
        "A19": "metagear",
        "A20": "Parsifal",
        "A21": "Systematic Review Data Repository Plus (SRDR+)",
        "A22": "Sciome",
        "A23": "Systematic Review Facility (SyRF)",
        "A24": "interactive Summary of Findings (iSoF)",
        "A25": "ReLiS",
        "A26": "SESRA",
        "A27": "Right Review",
        "A28": "JBI SUMARI",
        "A29": "Research Rabbit",
    },
    "education": {
        "A1": "not yet graduated high school",
        "A2": "High school",
        "A3": "Trade school",
        "A4": "Bachelor’s degree",
        "A5": "Master’s degree",
        "A6": "Ph.D. or higher",
        "A7": "Prefer not to say",
    },
    "survey_count": {
        "A1": "None",
        "A2": "1",
        "A3": "2-3",
        "A4": "4-5",
        "A5": "More than 5",
    },
    "experience": {
        "A1": "Poor",
        "A2": "Fair",
        "A3": "Good",
        "A4": "Very good",
        "A5": "Excellent",
    },
}


def _survey_index_dict(extra_metadata=()):
    """Build a fresh section layout; ``extra_metadata`` follows the shared metadata columns."""
    return {
        "metadata": [*_SURVEY_METADATA_COLUMNS, *extra_metadata],
        "requirements": list(range(_SURVEY_REQUIREMENT_COUNT)),
        "demographics": list(_SURVEY_DEMOGRAPHIC_COLUMNS),
        "time": -1,
    }


def _survey_answer_codes():
    """Return an independent copy of the answer-code tables."""
    return {column: dict(codes) for column, codes in _SURVEY_ANSWER_CODES.items()}


class Survey:
    """A survey export: its column labels plus the completed replies.

    Attributes:
        labels: Header row of the CSV after the configured deletions.
        replies: ``Reply`` objects of all rows that reached the expected
            last page.
        type: Key into ``types`` naming the survey configuration.
        index_dict: Ordered sections describing the column layout.
        deletions: Column indexes removed from the header and every row,
            sorted in descending order once a CSV has been loaded.
        answer_code_dict: Column name -> {answer code: readable text}; empty
            when the survey type defines no answer codes.
    """

    # Per-type configuration:
    #   index_dict         ordered column layout consumed by Reply
    #   answer_code_dict   optional answer-code translations
    #   deletions          column indexes dropped before parsing
    #   expected_last_page "Last page" value of a completed reply
    types = {
        "default": {
            "index_dict": _survey_index_dict(),
            "deletions": [],
            "expected_last_page": "10",
        },
        "555283": {
            "index_dict": _survey_index_dict(),
            "answer_code_dict": _survey_answer_codes(),
            "deletions": [51, 56],
            "expected_last_page": "10",
        },
        "628237": {
            "index_dict": _survey_index_dict(("tool", "tool_other")),
            "answer_code_dict": _survey_answer_codes(),
            "deletions": [53, 58],
            "expected_last_page": "11",
        },
    }

    def __init__(self, labels: List[str] = None, replies: List[Reply] = None, type: str = "default", csv_file: str = None):
        """Create a survey, optionally loading it from ``csv_file`` right away.

        Args:
            labels: Column labels; defaults to a new empty list.
            replies: Already parsed replies; defaults to a new empty list.
            type: Key into ``Survey.types``.
            csv_file: Path of a CSV export to load, or ``None`` to skip loading.
        """
        self.labels = labels if labels is not None else []
        self.replies = replies if replies is not None else []
        # Set up front so that __str__ also works on a survey built without a CSV.
        self.type = type
        self.index_dict = {}
        self.deletions = []
        self.answer_code_dict = {}

        if csv_file is not None:
            self.init_from_csv(csv_file, type)

    def __str__(self):
        rendered_replies = "".join(str(reply) for reply in self.replies)
        return f"Type: {self.type}\nLabels: {self.labels}\nReplies: {rendered_replies}"

    def init_from_csv(self, csv_file: str, type: str = "default"):
        """Load labels and completed replies from ``csv_file``.

        The first row is taken as the header. The columns listed in the
        type's ``deletions`` are removed from the header and from every
        non-empty row. Rows whose "Last page" value differs from the type's
        ``expected_last_page`` are skipped; every other row becomes a
        ``Reply`` appended to ``self.replies``. A file without any rows
        leaves ``labels`` empty and adds no replies.

        Args:
            csv_file: Path of the CSV export.
            type: Key into ``Survey.types``.

        Raises:
            KeyError: If ``type`` is not a key of ``Survey.types``.
            ValueError: If the header has no "Last page" column.
        """
        self.type = type
        config = self.types[type]
        self.index_dict = config["index_dict"]
        # Delete from the highest index down so earlier deletions do not
        # shift the positions of later ones.
        self.deletions = sorted(config["deletions"], reverse=True)
        expected_last_page = config["expected_last_page"]
        # Not every type defines answer codes (e.g. "default").
        self.answer_code_dict = config.get("answer_code_dict", {})

        # newline="" lets the csv module handle line endings inside quoted
        # fields itself.
        # NOTE(review): the file is read with the platform default encoding.
        with open(csv_file, "r", newline="") as file:
            csv_reader = csv.reader(file)

            header = next(csv_reader, None)
            if header is None:
                self.labels = []
                return
            self.labels = header
            for i in self.deletions:
                del self.labels[i]

            rows = []
            for row in csv_reader:
                # Blank lines come back as empty lists; ignore them.
                if not row:
                    continue
                for i in self.deletions:
                    del row[i]
                rows.append(row)

        last_page_index = self.labels.index("Last page")

        for row in rows:
            # Skip rows that did not complete the survey.
            if not row or row[last_page_index] != expected_last_page:
                continue
            self.replies.append(Reply(row, self))

In [None]:
import glob
import os
from typing import List

# Specify the folders: bnw_data_path receives generated output, folder_path
# holds the CSV exports to read.
bnw_data_path = os.path.join("c:\\", "workspace", "borgnetzwerk", "tools", "scripts", "SWARM-SLR", "data")
folder_path = os.path.join("c:\\", "workspace", "surveys")

# Create a pattern to match files starting with "results-survey" and ending with ".csv"
file_pattern = os.path.join(folder_path, 'results-survey*.csv')

# Get a list of file paths matching the pattern
csv_files = glob.glob(file_pattern)

surveys:List[Survey] = []

# Iterate through each CSV file
for csv_file in csv_files:
    # print(f"Reading data from file: {csv_file}")
    filename = os.path.basename(csv_file)
    # The survey type is whatever sits between "results-survey" and ".csv".
    # It is kept in `survey_type` rather than `type` so the builtin type()
    # stays usable in the rest of the notebook.
    survey_type = filename.split("results-survey")[1].split(".csv")[0]
    surveys.append(Survey(csv_file=csv_file, type=survey_type))

In [None]:
# Print every loaded survey (its type, labels and all parsed replies) for a
# quick visual check of the import.
for survey in surveys:
    print(survey)

In [None]:
import matplotlib.pyplot as plt
from  matplotlib.colors import LinearSegmentedColormap

def create_box_plot(data, labels=None, vert=True):
    """Draw one box per entry of ``data`` and save the figure as SVG.

    Args:
        data: Sequence with one collection of numeric scores per box; handed
            to ``ax.boxplot`` unchanged.
        labels: Tick labels for the boxes, handed to ``ax.boxplot``.
        vert: ``True`` draws vertical boxes on a wide figure with the
            agreement scale on the y axis; ``False`` draws horizontal boxes
            on a tall figure.

    Side effects:
        Sets the global matplotlib font size to 4, writes
        'Requirements Review.svg' into ``bnw_data_path`` and shows the figure.
    """
    plt.rc('font', size=4)
    # plt.minorticks_on()

    # Background colormap: orange at 0, white at 0.5, blue at 1.
    colors = ["#FF7F0E","#FFBB78","#FFBB78","white","#AEC7E8","#AEC7E8","#1F77B4"]
    stops = [0,.175,.4,.5,0.6,.825,1.]
    cmap = LinearSegmentedColormap.from_list('rg', list(zip(stops, colors)), N=256)

    # agreement = ["strongly agree", "", "agree", "", "neither agree\nnor disagree", "", "disagree", "", "strongly disagree"]
    agreement = ["strongly agree", "agree", "neither agree\nnor disagree", "disagree", "strongly disagree"]

    if vert:
        fig, ax = plt.subplots(dpi=300, figsize=(30, 1))
        # xlim/ylim return (low, high) tuples; joined they form the imshow
        # extent (left, right, bottom, top). The y axis runs from 10 down to 0.
        plotlim = plt.xlim(0,len(data)+1) + plt.ylim(10,0)

        # Smooth background gradient spanning the whole plot area.
        ax.imshow([[0.5,0.5],[0,0]], cmap=cmap, interpolation='bicubic', extent=plotlim)

        ax.set_ylabel('agreement with requirement')
        # Label every second value of the 1..9 scale.
        # ax.set_yticks(range(1,10))
        ax.set_yticks([1, 3, 5, 7, 9])
        ax.set_yticklabels(agreement)

        ax.set_xlabel('SLR-tool requirement')
        plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='right')

    else:
        fig, ax = plt.subplots(dpi=300, figsize=(1, 30))
        plotlim = plt.xlim(1,9) + plt.ylim(len(data),0)

        ax.imshow([[1,0],[1,0]], cmap=cmap, interpolation='bicubic', extent=plotlim)

        ax.set_xlabel('Values')

        ax.set_ylabel('Requirement')

    ax.set_title('Tool Assisted Literature Surveys - A Requirements Review')
    ax.boxplot(data, vert=vert, labels=labels)

    # ax.legend(labels, title='Categories', loc='upper left', bbox_to_anchor=(1, 1))

    # Save before showing: with a blocking backend show() only returns after
    # the window is closed, and saving first is the documented order.
    # Only the SVG is written; the former PNG path was overwritten before use.
    path = os.path.join(bnw_data_path, 'Requirements Review.svg')
    fig.savefig(path, bbox_inches='tight')
    plt.show()

# Example usage:
# NOTE(review): csv_file_path is a placeholder that nothing in the visible
# code reads — confirm before removing it.
csv_file_path = 'your_data.csv'

# Module-level defaults; the survey loop further down rebinds both before
# plotting the requirements survey.
data = []
labels = []

def data_for_box_plot(data, reply: Reply = None):
    """Add the scores of ``reply`` to the per-requirement lists in ``data``.

    Args:
        data: List of lists, one per requirement; extended in place with an
            empty list for every requirement it does not cover yet.
        reply: Object whose ``scores`` holds one answer string per
            requirement. The first character is dropped and the rest is
            parsed as an integer (e.g. "A3" -> 3).

    Returns:
        The same ``data`` list that was passed in.
    """
    for position, raw_score in enumerate(reply.scores):
        # Unanswered requirements are empty strings and carry no number.
        parsed = int(raw_score[1:]) if raw_score else None
        if position >= len(data):
            data.append([])
        # Truthiness check: missing answers and a parsed 0 are both left out.
        if parsed:
            data[position].append(parsed)
    return data

def labels_for_box_plot(data:List = None, labels:List[str] = None, reply:Reply = None):
    """Return tick labels for the box plot, generating "R1", "R2", ... if none are given.

    Args:
        data: Per-box score lists; only its length is used. ``None`` is
            treated as no boxes.
        labels: Existing labels. A non-empty list is returned unchanged; an
            empty list is filled in place and returned.
        reply: Unused; kept so existing calls keep working.

    Returns:
        The list of labels, one per entry of ``data`` when generated here.
    """
    if labels is None:
        labels = []
    if not labels:
        # Without data there is nothing to label (previously len(None) raised).
        box_count = len(data) if data is not None else 0
        labels.extend(f"R{number}" for number in range(1, box_count + 1))
    return labels

def is_invalid(reply: Reply):
    """Return True when ``reply`` should be left out of the evaluation.

    The only check is the age: a reply with a non-empty age below 18 is
    reported on stdout and rejected. It is unrealistic to expect a minor to
    have experience on SLR.
    """
    age = reply.age
    # A missing age cannot be judged, and adults are fine.
    if not age or int(age) >= 18:
        return False
    print(f"Reply {reply.response_id} is invalid because age is {reply.age}")
    return True


# Only the requirements survey ("555283") is plotted; every other survey
# type is skipped for now.
for survey in surveys:
    # labels = survey.labels
    if survey.type != "555283":
        continue

    # Requirements survey. all we need is one big graphic.
    data = []
    for reply in survey.replies:
        if not is_invalid(reply):
            data_for_box_plot(data, reply)
    labels = labels_for_box_plot(data=data)
    create_box_plot(data, labels)