In [62]:
import os
import re
import warnings
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from textblob import TextBlob


In [None]:
# Notebook-wide settings: silence library warnings and let pandas render every
# row and column instead of truncating the output.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Forward slashes are accepted on Windows as well as macOS/Linux, whereas the
# previous backslash-separated path only resolved on Windows.
df_ = pd.read_excel('FullSpreadsheets/CAF v0.1_June 24, 2024_16.35 filtered.xlsx', index_col=0)
df_.head()

In [64]:
# Work on a copy so the raw export held in df_ is never modified and this cell
# can be re-run safely.
df = df_.copy()

# --- Column names used throughout the notebook ---
colId = 'Student ID'
colNameG = 'Student Given Name'
colNameF = 'Student Family Name'
colDate = 'Recorded Date'
# Keep only the calendar date; the time of day is discarded.
df[colDate] = pd.to_datetime(df[colDate]).dt.date
colCohort = 'Cohort' #BOH2 and BOH3
colSubject = 'Subject'
colClinicChoice = 'Simulation or Clinic - Selected Choice'
colRole = 'Role - Selected Choice'
colRKC = 'RKC_MC1_supervisor'
colRKCStudent = 'RKC_MC1_student'
colPatient = 'Patient for session'
rubricQues = ['PS', 'CS', 'TS', 'ES']
colComments = 'Any further comments?'

# --- Single supervisor-name column ---
colSupervisorChoice = 'Supervisor Name - Selected Choice'
colSupervisorOther = 'Supervisor Name - Other - Text'
colSupervisor = 'Supervisor Name'

# Where the selected choice is 'Other', take the free-text answer instead.
pickedOther = df[colSupervisorChoice] == 'Other'
df[colSupervisor] = df[colSupervisorChoice].mask(pickedOther, df[colSupervisorOther])

# Title-case the MERGED column. Previously this step re-read the raw choice
# column, which threw away the free-text names and left those rows as "Other".
df[colSupervisor] = df[colSupervisor].str.title()

# The two source columns are now redundant.
df = df.drop(columns=[colSupervisorChoice, colSupervisorOther])

In [None]:
# Narrow the frame to the columns the analysis reads, followed by the rubric scores.
keepCols = [
    colId, colNameG, colNameF, colDate, colCohort, colSubject, colClinicChoice,
    colRKCStudent, colPatient, colRKC, colRole, colComments, colSupervisor,
] + rubricQues
df = df[keepCols]

# Pull the digits out of "Lvl <n>" in each rubric answer; anything that does
# not match the pattern ends up as 0.
for col in rubricQues:
    level = df[col].str.extract(r'Lvl (\d+)', expand=False)
    df[col] = pd.to_numeric(level, errors='coerce').fillna(0).astype(int)

# df = df[(df[colClinicChoice] == 'Clinic')| (df[colClinicChoice] == '3')| (df[colClinicChoice] == '6')| (df[colClinicChoice] == '7')]
# Drop simulation sessions, then keep only the BOH2 and BOH3 cohorts.
notSimulation = df[colClinicChoice] != 'Simulation'
wantedCohort = df[colCohort].isin(['BOH2', 'BOH3'])
df = df[notSimulation & wantedCohort]
df.head()

In [None]:
# Distinct supervisor names present in the filtered data.
supervisors = pd.unique(df[colSupervisor])
supervisors

In [None]:
# How often each supervisor RKC answer occurs (missing answers are not counted).
rkcCount = df.loc[:, colRKC].value_counts(dropna=True)
rkcCount

## See unmatched student vs. supervisor

In [None]:
# Find operator sessions where the student's RKC answer differs from the
# supervisor's.

# Relabel the student's 'Not Assessed' as 'Not Reviewed' so it can be compared
# with the supervisor column.
df[colRKCStudent] = df[colRKCStudent].replace('Not Assessed', 'Not Reviewed')

hasStudentAnswer = df[colRKCStudent].notna()        # skip rows with no student answer
answersDiffer = df[colRKCStudent] != df[colRKC]     # student and supervisor disagree
isOperator = df[colRole] == 'Operator'              # only sessions as operator
unmatched = df[hasStudentAnswer & answersDiffer & isOperator]
print(len(unmatched))

# Create the output folder up front so the export also works on a fresh
# checkout, where FullSpreadsheets/RKC does not exist yet.
os.makedirs('FullSpreadsheets/RKC', exist_ok=True)
unmatched.to_csv('FullSpreadsheets/RKC/unmatched.csv')

# Last expression of the cell, so the preview is actually rendered.
unmatched.head()


### See how many times a student has been an operator

In [77]:

# Rows where the student's role was 'Operator', overall and split by cohort.
operator = df.loc[df[colRole] == 'Operator']
# operator = operator[operator[colPatient] == 'I saw a patient']
operatorboh2 = operator.loc[operator[colCohort] == 'BOH2']
operatorboh3 = operator.loc[operator[colCohort] == 'BOH3']
def getOperatorCount(df):
    """Return a one-column DataFrame with the number of rows per student ID in ``df``.

    The result is indexed by the values of the ``colId`` column, ordered from
    most to fewest rows (the default ordering of ``value_counts``).
    """
    return df[colId].value_counts().to_frame()

# Operator sessions per student, one workbook per cohort.
operatorCountboh2 = getOperatorCount(operatorboh2)
operatorCountboh2.to_excel('FullSpreadsheets/RKC/operator count boh2.xlsx')
operatorCountboh3 = getOperatorCount(operatorboh3)
operatorCountboh3.to_excel('FullSpreadsheets/RKC/operator count boh3.xlsx')

# Same count, restricted to operator sessions answered 'I saw a patient'.
operator = operator.loc[operator[colPatient] == 'I saw a patient']
operatorboh2 = operator.loc[operator[colCohort] == 'BOH2']
operatorboh3 = operator.loc[operator[colCohort] == 'BOH3']
operatorCountboh2 = getOperatorCount(operatorboh2)
operatorCountboh2.to_excel('FullSpreadsheets/RKC/saw a patient boh2.xlsx')
operatorCountboh3 = getOperatorCount(operatorboh3)
operatorCountboh3.to_excel('FullSpreadsheets/RKC/saw a patient boh3.xlsx')

## Get Not Reviewed

In [None]:
# Sessions the supervisor marked as 'Not Reviewed'.
notReviewed = df.loc[df[colRKC] == 'Not Reviewed']
# Of those, the rows where the student was the operator (rows, not a tally).
roleCount = notReviewed.loc[notReviewed[colRole] == 'Operator']
roleCount

## Time series of RKC for each student

In [None]:
import os

# Root folder for the RKC outputs. Built with os.path.join so the separator
# matches the platform; a hard-coded backslash only works on Windows and would
# not point at the same folder as the 'FullSpreadsheets/RKC/...' exports elsewhere.
savepath = os.path.join('FullSpreadsheets', 'RKC')
def createTimeSeries(df, cohort):
    """Plot each student's supervisor RKC answers over time and tally them.

    For every student ID in ``df`` the answer counts are printed, and a scatter
    plot of the Yes/No answers (1 = Yes, 0 = No) against the recorded date is
    drawn on its own figure, saved as ``<savepath>/<cohort>/<student id>.png``
    and shown. 'Not Reviewed' answers are counted but not plotted, and students
    without any Yes/No answer get no plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to analyse. Must contain the columns named by ``colId``,
        ``colNameG``, ``colNameF``, ``colDate`` and ``colRKC``.
    cohort : str
        Name of the sub-folder under ``savepath`` that receives the images.

    Returns
    -------
    pandas.DataFrame
        One row per student with the columns 'Student ID', 'Student Name',
        'Yes', 'No' and 'Not Reviewed'.
    """
    colRKCNumber = 'RKC Numeric'
    answerColors = {'Yes': 'blue', 'No': 'red'}

    # sort_values returns a new frame, so the helper column added below never
    # reaches the caller's DataFrame.
    df = df.sort_values(by=[colId, colDate])
    df[colRKCNumber] = df[colRKC].map({'Yes': 1, 'No': 0, 'Not Reviewed': np.nan})

    newpath = os.path.join(savepath, cohort)
    os.makedirs(newpath, exist_ok=True)

    results = []
    # Rows without a student ID cannot be attributed to anyone; skipping them
    # avoids an IndexError on the empty selection a NaN ID would produce.
    for student in df[colId].dropna().unique():
        student_data = df[df[colId] == student]

        # Tolerate a missing given or family name instead of failing on NaN + str.
        nameParts = (student_data[colNameG].iloc[0], student_data[colNameF].iloc[0])
        name = ' '.join(str(part) for part in nameParts if pd.notna(part))

        rkcCount = (
            student_data[colRKC]
            .value_counts()
            .reindex(['Yes', 'No', 'Not Reviewed'])
            .fillna(0)
            .astype(int)
        )
        results.append({
            'Student ID': student,
            'Student Name': name,
            'Yes': rkcCount['Yes'],
            'No': rkcCount['No'],
            'Not Reviewed': rkcCount['Not Reviewed'],
        })
        print(f'\n{student}')
        pprint(rkcCount)

        # Only Yes/No answers have a numeric value; rows are already in date
        # order from the sort above.
        plotted = student_data.dropna(subset=[colRKCNumber])
        if plotted.empty:
            continue

        # A fresh figure per student: sharing one figure let earlier students'
        # points, date categories and legend entries leak into later images.
        fig, ax = plt.subplots(figsize=(14, 8))
        dates = plotted[colDate].astype(str)
        ax.scatter(dates, plotted[colRKCNumber], marker='o',
                   color=plotted[colRKC].map(answerColors).tolist())
        # Empty scatters exist only to give the legend one entry per answer.
        ax.scatter([], [], color='blue', label='Yes')
        ax.scatter([], [], color='red', label='No')
        ax.set_title(f'Record Keeping Information for - {student} ')
        ax.set_xlabel('Recorded Date')
        ax.set_ylabel('RKC (1 = Yes, 0 = No)')
        ax.legend(title='Legend')
        ax.set_ylim(-0.5, 2)
        ax.set_yticks([0, 1])
        ax.set_xticks(dates)
        ax.tick_params(axis='x', labelrotation=90)
        ax.grid(False)
        fig.tight_layout()
        fig.savefig(os.path.join(newpath, f'{student}.png'))
        plt.show()
        # Release the figure so a long student list does not accumulate open figures.
        plt.close(fig)

    return pd.DataFrame(results)

# Build the plots and per-student tallies for both cohorts. countsboh2 must be
# produced here because the export cell that follows reads it together with
# countsboh3; with the BOH2 lines commented out a fresh run hit a NameError.
boh2 = df[df[colCohort] == 'BOH2']
boh3 = df[df[colCohort] == 'BOH3']
countsboh2 = createTimeSeries(boh2, 'BOH2')
countsboh3 = createTimeSeries(boh3, 'BOH3')


In [72]:
# Students with the most 'No' answers first. Re-binding instead of sorting
# in place keeps the cell free of hidden mutation.
countsboh2 = countsboh2.sort_values(by='No', ascending=False)
countsboh3 = countsboh3.sort_values(by='No', ascending=False)

# Forward slashes work on every platform and match the other exports into
# FullSpreadsheets/RKC; the backslash form only resolved on Windows.
countsboh2.to_excel('FullSpreadsheets/RKC/BOH2.xlsx', index=False)
countsboh3.to_excel('FullSpreadsheets/RKC/BOH3.xlsx', index=False)

In [None]:

# Every distinct student ID in the filtered data.
studentId = pd.unique(df[colId])

# Colour assigned to each RKC answer category.
colors = dict([
    ('Yes', 'green'),
    ('No', 'red'),
    ('Not Reviewed', 'gray'),
])



def tests(filtered_df, title='', supervisor=None):
    """Compare rubric scores and comment sentiment between RKC 'Yes' and 'No' rows.

    Prints, in order: the count of each supervisor RKC answer, the mean rubric
    score per answer, an independent two-sample t-test (Yes vs. No) for every
    rubric question, and the mean TextBlob polarity of the comments per answer
    together with its Yes-vs-No t-test.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Rows to analyse; must contain ``colRKC``, ``colComments`` and every
        column listed in ``rubricQues``.
    title : str, optional
        Accepted for backward compatibility; not used by the current
        implementation.
    supervisor : optional
        Accepted for backward compatibility; not used by the current
        implementation.

    Returns
    -------
    pandas.Series
        Integer counts indexed by 'Yes', 'No' and 'Not Reviewed' (0 where an
        answer does not occur).
    """
    rkcCount = (
        filtered_df[colRKC]
        .value_counts()
        .reindex(['Yes', 'No', 'Not Reviewed'])
        .fillna(0)
        .astype(int)
    )
    pprint(rkcCount)

    # --- Rubric scores: Yes vs. No ---
    isYes = filtered_df[colRKC] == 'Yes'
    isNo = filtered_df[colRKC] == 'No'
    results = {}
    for score in rubricQues:
        t_stat, p_val = ttest_ind(filtered_df.loc[isYes, score], filtered_df.loc[isNo, score])
        results[score] = {'t_stat': t_stat, 'p_val': p_val}
    results = pd.DataFrame(results).T

    mean_scores = filtered_df.groupby(colRKC)[rubricQues].mean()
    pprint(mean_scores.T)
    pprint(results)

    # --- Comment sentiment: Yes vs. No ---
    def get_sentiment(comment):
        # str() guards against non-text cells (e.g. a purely numeric comment),
        # which TextBlob rejects.
        return TextBlob(str(comment)).sentiment.polarity

    # Explicit copy so the new column is added to an independent frame rather
    # than to a slice of the caller's data.
    sentimentDf = filtered_df[filtered_df[colComments].notna()].copy()
    # astype(float) keeps the column numeric even when there are no comments.
    sentimentDf['Sentiment'] = sentimentDf[colComments].apply(get_sentiment).astype(float)

    meanScore = sentimentDf.groupby(colRKC)['Sentiment'].mean()
    yes_sentiment = sentimentDf.loc[sentimentDf[colRKC] == 'Yes', 'Sentiment']
    no_sentiment = sentimentDf.loc[sentimentDf[colRKC] == 'No', 'Sentiment']
    t_stat, p_val = ttest_ind(yes_sentiment, no_sentiment)

    sentiment_results = {'Sentiment': {'t_stat': t_stat, 'p_val': p_val}}
    print('\nSentiment:')
    pprint(meanScore)
    pprint(sentiment_results)

    return rkcCount

# for id in studentId:
#     filtered_df = df[df[colId] == id]
#     print(len(filtered_df))
#     filtered_df = filtered_df[filtered_df['RKC_MC1_supervisor'].isin(['Yes', 'No', 'Not Reviewed'])]
#     if len(filtered_df) == 0:
#         continue
#     print(len(filtered_df))
    # tests(filtered_df, f'_{id}')
# Run the comparison once over every row that has a recognised RKC answer.
recognisedAnswers = ['Yes', 'No', 'Not Reviewed']
filtered_df = df.loc[df[colRKC].isin(recognisedAnswers)]
tests(filtered_df)

In [None]:
# Split the rows that have a recognised RKC answer by supervisor name.
filtered_df = df[df[colRKC].isin(['Yes', 'No', 'Not Reviewed'])]
print(filtered_df[colSupervisor].unique())
supervisorDf = {name: rows for name, rows in filtered_df.groupby(colSupervisor)}

# Run the Yes-vs-No comparison per supervisor and keep the answer counts.
supervisorResults = {}
for supervisor, data in supervisorDf.items():
    print(f'\n\nSupervisor: {supervisor}')
    results = tests(data, f' - {supervisor}', supervisor)
    supervisorResults[supervisor] = results

# One row per supervisor, one column per RKC answer.
supervisorResults = pd.DataFrame(supervisorResults).T
# Forward slashes work on every platform and match the other exports into
# FullSpreadsheets/RKC; the backslash form only resolved on Windows.
supervisorResults.to_excel('FullSpreadsheets/RKC/Supervisor Results.xlsx')
