<a href="https://colab.research.google.com/github/ahmadia89/KYVA_Spring2025/blob/main/notebooks/KYVA_LaTeX_Working_Paper_Spring2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

# LaTeX source of the KYVA Spring 2025 working paper (IEEEtran conference
# layout), kept as a raw string so the backslashes in the LaTeX commands are
# not interpreted as Python escape sequences.
#
# The literal is closed by the triple quote directly after \end{document}.
# Without that closing delimiter the cell fails to parse with
# "SyntaxError: incomplete input".
latex_code = r"""
\documentclass[conference]{IEEEtran}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{color}
\usepackage{hyperref}
\usepackage{caption}
\usepackage{float}

\definecolor{codegray}{gray}{0.95}
\lstset{
  backgroundcolor=\color{codegray},
  basicstyle=\ttfamily\small,
  breaklines=true,
  postbreak=\mbox{\textcolor{red}{$\hookrightarrow$}\space},
  frame=single
}

\title{Analyzing Synchronous and Asynchronous Attendance Patterns: A KYVA Data Science Project for Azure Transition}

\author{
    \IEEEauthorblockN{Amir Y. Ahmadi}
    \IEEEauthorblockA{
        Kentucky Virtual Academy\\
        a.ahmadi89@gmail.com\\
        \url{https://www.ayahmadi.com}}
}

\begin{document}
\maketitle

\begin{abstract}
This paper presents a detailed data science analysis of student attendance trends at the Kentucky Virtual Academy (KYVA), with emphasis on synchronous versus asynchronous utilization. Following the Microsoft DP-900 certification, this project was built to prepare for advanced data modeling with Microsoft Azure tools such as Synapse Analytics and Azure Data Factory. Key visualizations, Python code, and panel data preparation workflows are included to support data science applications and educational research.
\end{abstract}

\section{Background / Context}
KYVA offers virtual education to high school students across Kentucky. Monitoring synchronous versus asynchronous usage, particularly among students with IEPs, offers insight into equity and engagement in a fully online learning environment. Patterns in lesson completion and minutes logged are core to this analysis.

\section{Data \& Methods}
Data was originally exported from PowerBI dashboards hosted at \texttt{app.powerbi.com} and cleaned using Excel. The final dataset (see Appendix) tracks synchronous/asynchronous minutes, IEP status, and multiple attempts per day and per class period. Data was then analyzed using Google Colab notebooks in Python with \texttt{pandas}, \texttt{matplotlib}, and \texttt{seaborn}. Outputs were saved as PNGs and Parquet/JSON for Azure integration.

\section{Descriptive Visualizations}
Figures display the distribution of synchronous vs asynchronous engagement (by student group and class period), correlations with productivity, and patterns across IEP students.

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image01_KYVA_Attendance_Distribution_Synchronous_Utilization_Ahmadi_Spring2025.png}
  \caption{Synchronous Attendance Distribution}
\end{figure}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image02_KYVA_Attendance_Distribution_Asynchronous_Utilization_Ahmadi_Spring2025.png}
  \caption{Asynchronous Attendance Distribution}
\end{figure}

\section{Findings}
Clear disparities emerged in synchronous usage among students with IEPs. Resource teacher presence correlated with higher engagement and completion. Students attempting multiple lessons per day showed distinct utilization curves, suggesting opportunity for real-time intervention triggers.

\section{Limitations}
Zero-minute entries skewed raw distributions. Filtering strategies were used to mitigate impact. Data was limited to one school term and was not linked to assessment outcomes. Correlation, not causation, is assumed.

\section{Next Steps / Azure Integration}
Data exports will now be:
\begin{itemize}
  \item Uploaded to Azure Blob Storage
  \item Parsed via Azure Data Factory
  \item Queried in Azure Synapse Analytics for panel regression
  \item Shared with Tableau Public dashboards
\end{itemize}

\section{References / Tools Used}
\begin{itemize}
  \item Python, Google Colab
  \item Microsoft Power BI
  \item Excel, Parquet, JSON
  \item Azure Data Factory / Synapse Analytics
  \item Matplotlib, Seaborn, Pandas
\end{itemize}

\appendices
\section{Spring 2025 KYVA Figures}
\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image03_KYVA_Attendance_Sync_Utilization_Utilization_by_IEP_Status_Ahmadi_Spring2025.png}
  \caption{Synchronous by IEP Status}
\end{figure}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image04_KYVA_Attendance_Correlation_Matrix_Productivity_Engagement_Ahmadi_Spring2025.png}
  \caption{Productivity Correlation Matrix}
\end{figure}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image05_KYVA_Attendance_Heatmap_All_Students_Lesson_Completion_Ahmadi_Spring2025.png}
  \caption{Lesson Completion Heatmap}
\end{figure}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image06_KYVA_Attendance_Productivity_Resource_Teacher_Presence_Ahmadi_Spring2025.png}
  \caption{Productivity \& RT Presence}
\end{figure}

\begin{figure}[H]
  \centering
  \includegraphics[width=\linewidth]{image07_KYVA_Attendance_Multiple_Attempts_IEP_Status_Ahmadi_Spring2025.png}
  \caption{Multiple Attempts vs. IEP}
\end{figure}

\section{Python Code Snippet (Clean + Export)}
\begin{lstlisting}[language=Python]
df = pd.read_excel("Updated_Dataset_with_MULTIPLE_ATTEMPTS_FLAG_8-5-25.xlsx")
df_filtered = df[(df['Sync_Minutes'] > 0) | (df['Async_Minutes'] > 0)]
df_filtered.to_parquet("KYVA_CLEANED.parquet")
df_filtered.to_json("KYVA_CLEANED.json")
df_filtered.to_csv("KYVA_CLEANED.csv")
\end{lstlisting}

\section{Panel Regression Prep Code}
\begin{lstlisting}[language=Python]
panel_df = df_filtered.copy()
panel_df['date'] = pd.to_datetime(panel_df['Date'])
panel_df.set_index(['Student_ID', 'date'], inplace=True)
panel_df['Log_Sync'] = np.log1p(panel_df['Sync_Minutes'])
panel_df['IEP_Flag'] = panel_df['IEP_Status'].astype(int)
\end{lstlisting}

\end{document}
"""

SyntaxError: incomplete input (ipython-input-1057068267.py, line 4)

In [None]:
# Load the cleaned KYVA attendance workbook from Google Drive.
# The file is an Excel workbook (.xlsx), so it has to be parsed with
# pd.read_excel; pd.read_csv tries to decode the binary workbook as text.
# NOTE(review): the /content/drive path only exists after drive.mount() has
# been run in this session — run the mount cell first.
file_path = '/content/drive/My Drive/Updated_Dataset_with_MULTIPLE_ATTEMPTS_FLAG_8-5-25.xlsx'
df = pd.read_excel(file_path)
# Last expression of the cell: displays the column names for inspection.
df.columns.tolist()


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Updated_Dataset_with_MULTIPLE_ATTEMPTS_FLAG_8-5-25.xlsx'

In [None]:
# Attach the user's Google Drive to the Colab runtime so that files under
# '/content/drive' become readable by later cells.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Save files to local Colab runtime
# NOTE(review): neither `fig` nor `df` is created in this cell; both must
# already exist from previously executed cells. As recorded below, running
# this cell without a prior `fig` raises NameError.
# Writes the figure to a PNG using a relative file name.
fig.savefig("sync_utilization_no_zeros.png")

# Save CSV
# index=False leaves the DataFrame index out of the written file.
df.to_csv("kyva_cleaned.csv", index=False)

NameError: name 'fig' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of synchronous utilization, ignoring rows where it is zero.
# Keep the handle returned by plt.figure() so the same figure can be saved
# below without looking it up again.
fig = plt.figure(figsize=(10, 6))

# Select only the strictly positive utilization percentages.
nonzero_sync = df.loc[df['SYNC_UTILIZATION_PCT'] > 0, 'SYNC_UTILIZATION_PCT']
sns.histplot(nonzero_sync, kde=True, bins=30)
plt.title("Sync Utilization (Excluding Zeros)")

# Write the PNG before plt.show() is called.
fig.savefig("sync_utilization_no_zeros.png")
plt.show()


NameError: name 'df' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
# Load the cleaned KYVA attendance workbook from the mounted Google Drive.
# The file is an Excel workbook (.xlsx); reading it with pd.read_csv fails
# with UnicodeDecodeError because the binary workbook is not UTF-8 text,
# so pd.read_excel is used instead.
file_path = '/content/drive/My Drive/Updated_Dataset_with_MULTIPLE_ATTEMPTS_FLAG_8-5-25.xlsx'
df = pd.read_excel(file_path)
# Last expression of the cell: displays the column names for inspection.
df.columns.tolist()


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte