# Overview 

#### ------------------------------------------------------------------------------------------------------------------------
## Workflow in Backend
### Step 1: Load Initial Datasets
### Step 2: Preprocess
### Step 3: Load the Processed Dataset
### Step 4: Analyse and Visualise

#### ------------------------------------------------------------------------------------------------------------------------
## Note
1) For the non-functional requirements, the prototype requires: 
- Error-handling code must be implemented in each function.
- Code that gives the user feedback on their actions must be provided.

2) For the technical requirements, the developer uses:
- Core Python from version 3.8.2 within Anaconda using the Jupyter notebook.
- The design where each function is executed step-by-step rather than concurrently.

#### ------------------------------------------------------------------------------------------------------------------------

In [1]:
# Check version
import sys
import platform
# Load CSVs
import pandas as pd 
# Manipulate and Store Data
import sqlite3
# Store Data
import json
# Calculate Correlation Coefficient
import numpy as np
# Display Graphs
import matplotlib.pyplot as plt

In [2]:
# Check the version
# print(sys.version)

3.8.2 (default, Mar 25 2020, 11:22:43) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


# Backend

## Step 1: Load Initial Datasets

In [3]:
# Step 1-1: Loads a CSV file
# Purpose:
# This function loads a CSV file into a Pandas DataFrame with error handling.
# It attempts to read the file, reports success or failure, and returns None on failure.
def load_csv(file_path):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Args:
        file_path: Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the read fails for any reason
        (missing file, parse failure, or any other exception).
    """
    loaded = None
    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at path {file_path}")
    except pd.errors.ParserError:
        print(f"Error: Failed to parse the file at {file_path}")
    except Exception as e:
        # Anything else (e.g. an empty file) is reported but not re-raised.
        print(f"Unexpected error: {str(e)}")
    else:
        print(f"Step 1-1: File loaded successfully: {file_path}")
    return loaded


# Step 1-2: Validate columns in the dataset
# Purpose:
# This function validates if a given DataFrame contains all the required columns.
# If required columns are missing, it raises an error with details about the missing columns.
def validate_columns(df, expected_columns, file_name):
    """Ensure ``df`` contains every column named in ``expected_columns``.

    Args:
        df: DataFrame to check. ``None`` (the value ``load_csv`` returns when a
            file could not be read) is rejected with a clear message.
        expected_columns: Iterable of required column names; any iterable
            (set, list, tuple, ...) is accepted.
        file_name: Label of the dataset, used only in the error message.

    Raises:
        ValueError: If ``df`` is ``None`` or a required column is absent.
    """
    if df is None:
        # Without this guard the failure surfaces as an opaque AttributeError.
        raise ValueError(f"No data loaded for {file_name}")

    # Normalise to a set so callers are not forced to pass one.
    missing_columns = set(expected_columns) - set(df.columns)
    if missing_columns:
        raise ValueError(f"Missing columns {missing_columns} in {file_name}")


# Step 1-3: Load and validate all datasets
# Purpose:
# This function loads and validates three datasets (USER_LOG, ACTIVITY_LOG, COMPONENT_CODES).
# It ensures the columns match the expected schema for each dataset.
def load_and_validate_datasets(user_path, activity_path, component_path):
    """Load the three source CSVs, check their schemas and harmonise the user column.

    Args:
        user_path: Path to the USER_LOG CSV.
        activity_path: Path to the ACTIVITY_LOG CSV.
        component_path: Path to the COMPONENT_CODES CSV.

    Returns:
        ``(df_user, df_activity, df_component)`` on success. On any failure the
        problem is printed and ``(None, None, None)`` is returned.
    """
    try:
        # Define the expected columns for each dataset
        expected_columns_user_log = {"Date", "Time", "User Full Name *Anonymized"}
        expected_columns_activity_log = {"User Full Name *Anonymized", "Component", "Action", "Target"}
        expected_columns_component_codes = {"Component", "Code"}

        # Load datasets
        df_user = load_csv(user_path)
        df_activity = load_csv(activity_path)
        df_component = load_csv(component_path)

        # load_csv signals failure by returning None. Stop here with a message that
        # names the dataset instead of failing later on a missing attribute.
        for label, frame, path in (
            ("USER_LOG", df_user, user_path),
            ("ACTIVITY_LOG", df_activity, activity_path),
            ("COMPONENT_CODES", df_component, component_path),
        ):
            if frame is None:
                raise ValueError(f"{label} could not be loaded from {path}")

        # Validate columns
        validate_columns(df_user, expected_columns_user_log, "USER_LOG")
        validate_columns(df_activity, expected_columns_activity_log, "ACTIVITY_LOG")
        validate_columns(df_component, expected_columns_component_codes, "COMPONENT_CODES")

        # Rename "User Full Name" to "User_ID" for consistency
        user_id_rename = {"User Full Name *Anonymized": "User_ID"}
        df_user = df_user.rename(columns=user_id_rename)
        df_activity = df_activity.rename(columns=user_id_rename)

        print("Step 1: All datasets loaded and validated successfully (RENAME).")
        return df_user, df_activity, df_component

    except Exception as e:
        print(f"Error in Step 1: {str(e)}")
        return None, None, None

In [4]:
# Test: Load the uploaded CSV files
# user_path = "Data/USER_LOG.csv"
# activity_path = "Data/ACTIVITY_LOG.csv"
# component_path = "Data/COMPONENT_CODES.csv"

# Test: Validate the loaded files
# df_user, df_activity, df_component = load_and_validate_datasets(user_path, activity_path, component_path)
# if df_user is not None and df_activity is not None and df_component is not None:
#     print("Datasets are ready for the next step.")
# else:
#     print("Error in loading datasets. Check the logs above.")

Step 1-1: File loaded successfully: Data/USER_LOG.csv
Step 1-1: File loaded successfully: Data/ACTIVITY_LOG.csv
Step 1-1: File loaded successfully: Data/COMPONENT_CODES.csv
Step 1: All datasets loaded and validated successfully (RENAME).
Datasets are ready for the next step.


## Step 2: Preprocess

In [5]:
# Step 2-1: Add the column which is a primary key in Pandas
def add_primary_keys():
    """Attach a 1-based sequential ``PK`` column to both log DataFrames.

    Works on the module-level ``df_user`` and ``df_activity`` and modifies them
    in place; nothing is returned.
    """
    global df_user, df_activity

    def _number_rows(frame):
        # The row position, counted from 1, serves as the key.
        frame['PK'] = range(1, frame.shape[0] + 1)

    _number_rows(df_user)
    _number_rows(df_activity)
    print("Step 2-1: Primary keys added successfully.")

In [6]:
# Test
# add_primary_keys()

In [7]:
# Step 2-2: Extract Month Column in Pandas
def extract_month_column():
    """Derive ``Activity_DateTime`` and ``Month`` columns on the global ``df_user``.

    ``Date`` and ``Time`` are concatenated and parsed day-first. Rows whose
    timestamp cannot be parsed are printed and dropped, which rebinds the global
    ``df_user`` to the filtered frame. ``Month`` holds the full English month name.

    Any exception is caught and reported on stdout; nothing is returned.
    """
    global df_user
    try:
        df_user['Activity_DateTime'] = pd.to_datetime(
            df_user['Date'] + ' ' + df_user['Time'],
            errors='coerce',   # unparseable values become NaT instead of raising
            dayfirst=True,     # dd/mm/yyyy
        )

        invalid_rows = df_user[df_user['Activity_DateTime'].isna()]
        if not invalid_rows.empty:
            print("Dropping invalid rows:", invalid_rows)
            # Take an explicit copy so the column assignment below always targets
            # an independent frame rather than a possible view of the old one.
            df_user = df_user.dropna(subset=['Activity_DateTime']).copy()

        # month_name() returns English names whatever the process locale is,
        # unlike strftime('%B'); the monthly statistics step orders months by
        # their English names.
        df_user['Month'] = df_user['Activity_DateTime'].dt.month_name()
        print("Step 2-2: Month column extracted successfully.")
    except Exception as e:
        print(f"Error in Step 2-2: {str(e)}")

In [8]:
# Test
# extract_month_column()

In [9]:
# Test
# df_user.head(30)

In [10]:
# Test 
# df_activity

In [11]:
# Test
# df_component

In [12]:
# Step 2-3: Clean the datasets in Pandas (REMOVE)
def clean_datasets_pandas():
    """Drop activity rows whose ``Component`` is 'System' or 'Folder'.

    Rebinds the module-level ``df_activity`` to the filtered frame. Any
    exception is caught and printed; nothing is returned.
    """
    global df_activity
    try:
        excluded_components = ['System', 'Folder']
        is_excluded = df_activity['Component'].isin(excluded_components)
        # Keep every row that is NOT one of the excluded components.
        df_activity = df_activity[~is_excluded]
        print("Step 2-3: Data cleaned successfully in Pandas(REMOVE).")
    except Exception as e:
        print(f"Error in Step 2-3: {str(e)}")

In [13]:
# Test
# clean_datasets_pandas()

In [14]:
# Test
# df_activity

In [15]:
# Step 2-4: Merge datasets in Pandas (MERGE)
def merge_datasets_pandas():
    """Build the global ``df_merged`` from the activity, user and component frames.

    The activity log is left-joined to the ``PK``/``Month`` columns of the user
    log on ``PK``, then left-joined to the component codes on ``Component``.
    Any exception is caught and printed; nothing is returned.
    """
    global df_merged
    try:
        # Bring in the month of each activity via the shared row key.
        month_lookup = df_user[['PK', 'Month']]
        df_merged = df_activity.merge(month_lookup, on='PK', how='left')

        # Attach the component codes.
        df_merged = df_merged.merge(df_component, on='Component', how='left')
        print("Step 2-4: Datasets merged successfully in Pandas(MERGE).")
    except Exception as e:
        print(f"Error in Step 2-4: {str(e)}")

In [16]:
# Test
# merge_datasets_pandas()

In [17]:
# Test
# df_merged

In [18]:
# Step 2-5: Save merged data in JSON
def save_merged_data_to_json():
    """Write the global ``df_merged`` to 'merged_data.json' as JSON Lines.

    One JSON object per row is written (``orient="records"``, ``lines=True``)
    to the current working directory. Failures are caught and printed.
    """
    global df_merged
    try:
        df_merged.to_json(path_or_buf="merged_data.json", orient="records", lines=True)
    except Exception as e:
        print(f"Error in Step 2-5: {str(e)}")
    else:
        print("Step 2-5: Merged data saved successfully in JSON format as 'merged_data.json'.")

In [19]:
# Test
# save_merged_data_to_json()

In [20]:
# Step 2-6: Reshape data for statistics and save to JSON
def reshape_data_and_save_to_json():
    """Pivot the global ``df_merged`` into interaction counts and save three JSON files.

    Files written to the current working directory (JSON Lines, one record per row):
      - monthly_statistics.json: one row per (User_ID, Month), one count column
        per Component.
      - semester_statistics.json: the monthly counts summed per User_ID.
      - correlation_data.json: the per-user totals restricted to whichever of
        Assignment, Quiz, Lecture, Book, Project and Course exist.

    Errors are caught and printed; nothing is returned.
    """
    try:
        # Count "Action" entries per user and month, one column per component.
        counts_by_user_month = pd.pivot_table(
            df_merged,
            index=['User_ID', 'Month'],
            columns='Component',
            values='Action',
            aggfunc='count',
            fill_value=0,  # combinations that never occur count as zero
        ).reset_index()
        counts_by_user_month.to_json("monthly_statistics.json", orient="records", lines=True)

        # Collapse the months: total interactions per user over the semester.
        per_user_totals = counts_by_user_month.groupby(['User_ID']).sum(numeric_only=True)
        per_user_totals.reset_index().to_json("semester_statistics.json", orient="records", lines=True)

        # Correlation input: only the components of interest that actually exist.
        wanted = ('Assignment', 'Quiz', 'Lecture', 'Book', 'Project', 'Course')
        present = [name for name in wanted if name in per_user_totals.columns]
        correlation_frame = per_user_totals[present].reset_index()
        correlation_frame.to_json("correlation_data.json", orient="records", lines=True)

        print("Step 2-6: Data reshaped for statistics and correlation, and saved to JSON files(RESHAPE)(COUNT).")
    except KeyError as e:
        print(f"Error in Step 2-6: Missing column during reshaping: {str(e)}")
    except Exception as e:
        print(f"Unexpected error in Step 2-6: {str(e)}")

In [21]:
# Test
# reshape_data_and_save_to_json()

In [22]:
# Step 2 Call the defined functions to execute Step 2.
# Purpose:
# Preprocess datasets, clean, merge, and save to JSON for storage and further analysis.
def preprocess_and_merge():
    """Run every Step 2 stage in order: key, parse dates, clean, merge, save, reshape.

    The stages communicate through module-level DataFrames rather than
    arguments, so the order of the calls below matters.

    NOTE(review): stages 2-2 to 2-6 catch and print their own exceptions, so the
    completion message below can still appear after one of them reported an error.
    """
    try:
        # Steps 2-1 to 2-4: in-memory preprocessing with Pandas.
        add_primary_keys()
        extract_month_column()
        clean_datasets_pandas()
        merge_datasets_pandas()

        # Step 2-5: persist the merged frame.
        save_merged_data_to_json()

        # Step 2-6: derive and persist the statistics tables.
        reshape_data_and_save_to_json()
    except Exception as e:
        print(f"Error in Step 2: {str(e)}")
    else:
        print("Step 2 Complete: Preprocessing, merging, and JSON saving completed successfully.")

In [23]:
# Test
# preprocess_and_merge()

Step 2-1: Primary keys added successfully.
Step 2-2: Month column extracted successfully.
Step 2-3: Data cleaned successfully in Pandas(REMOVE).
Step 2-4: Datasets merged successfully in Pandas(MERGE).
Step 2-5: Merged data saved successfully in JSON format as 'merged_data.json'.
Step 2-6: Data reshaped for statistics and correlation, and saved to JSON files(RESHAPE)(COUNT).
Step 2 Complete: Preprocessing, merging, and JSON saving completed successfully.


## Step 3: Load the Processed Dataset

In [24]:
# Step 3-1: Load Processed Data from JSON
# Purpose:
# Loads data from a specified JSON file into a Pandas DataFrame.
# Facilitates the retrieval of processed data for analysis or visualization.
def load_data_from_json(file_path):
    """
    Read a JSON Lines file (one record per line) into a DataFrame.

    Returns the DataFrame on success. On failure a message is printed and
    None is returned.
    """
    loaded = None
    try:
        loaded = pd.read_json(file_path, orient="records", lines=True)
    except FileNotFoundError:
        print(f"Step 3-1: Error - File '{file_path}' not found.")
    except Exception as e:
        print(f"Step 3-1: Unexpected error while loading data from '{file_path}': {str(e)}")
    else:
        print(f"Step 3-1: Data loaded successfully from '{file_path}'.")
    return loaded

In [25]:
# Test
# Load JSON files
# loaded_df1 = load_data_from_json("monthly_statistics.json")  # Monthly statistics
# loaded_df2 = load_data_from_json("semester_statistics.json")  # Semester statistics
# loaded_interaction_counts_df3 = load_data_from_json("correlation_data.json")  # Interaction counts

Step 3-1: Data loaded successfully from 'monthly_statistics.json'.
Step 3-1: Data loaded successfully from 'semester_statistics.json'.
Step 3-1: Data loaded successfully from 'correlation_data.json'.


In [26]:
# Inspection
# loaded_df1.head()

Unnamed: 0,User_ID,Month,Assignment,Attendence,Book,Course,Feedback,Lecture,Manual,Page,Project,Questionnaire,Quiz,Source,Study_material,Survey,Test,URL
0,1,December,28,0,0,3,0,0,0,1,0,0,0,0,3,0,0,1
1,1,November,56,0,0,30,0,2,4,0,0,0,3,0,7,0,215,0
2,1,October,51,0,1,24,0,38,8,0,0,0,207,0,1,4,0,1
3,1,September,0,1,0,5,0,0,3,0,0,1,0,0,0,0,0,0
4,2,December,22,0,0,3,0,0,0,0,0,0,0,0,1,0,0,1


In [27]:
# Inspection
# loaded_df2.head()

Unnamed: 0,User_ID,Assignment,Attendence,Book,Course,Feedback,Lecture,Manual,Page,Project,Questionnaire,Quiz,Source,Study_material,Survey,Test,URL
0,1,135,1,1,62,0,40,15,1,0,1,210,0,11,4,215,2
1,2,247,0,2,66,0,37,10,0,0,0,148,1,13,4,279,1
2,3,22,0,0,19,0,0,10,0,0,1,125,0,2,4,0,2
3,4,278,5,3,67,0,32,13,0,0,1,129,0,24,9,0,3
4,5,387,1,4,220,0,74,22,2,0,0,200,0,46,9,247,1


In [28]:
# Inspection
# loaded_interaction_counts_df3.head()

Unnamed: 0,User_ID,Assignment,Quiz,Lecture,Book,Project,Course
0,1,135,210,40,1,0,62
1,2,247,148,37,2,0,66
2,3,22,125,0,0,0,19
3,4,278,129,32,3,0,67
4,5,387,200,74,4,0,220


## Step 4: Analyse and Visualise

In [29]:
# Step 4-1 Analyse Monthly Statistics
def calculate_monthly_statistics(df):
    """
    Calculate monthly statistics (Mean, Median, Mode) for each component.

    Parameters:
    - df: DataFrame with a 'Month' column and one count column per component.
      Only 'Quiz', 'Lecture', 'Assignment', 'Attendence' and 'Survey' are used,
      and only those that are present.

    Returns:
    - DataFrame with columns: ['Component', 'Month', 'Mean', 'Median', 'Mode'],
      sorted by month then component, with 'Attendence' relabelled 'Attendance'.
      'Month' is an ordered categorical: September-December first, then any
      other months found in the data (January-August in calendar order, then
      unrecognised labels), so no month is turned into NaN.
    - None if an error occurs (the error is printed).
    """
    def _first_mode(values):
        # Series.mode() returns the modes sorted; take the smallest, or 0 if none.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else 0

    try:
        components = ['Quiz', 'Lecture', 'Assignment', 'Attendence', 'Survey']
        available_components = [col for col in components if col in df.columns]

        if not available_components:
            raise ValueError("No valid components found for statistics.")

        stats_list = []  # One stats table per component

        # Iterate over each component and calculate stats
        for component in available_components:
            stats = df.groupby('Month')[component].agg([
                ('Mean', 'mean'),
                ('Median', 'median'),
                ('Mode', _first_mode),
            ]).reset_index()
            stats['Component'] = component
            stats_list.append(stats)

        # Concatenate results for all components; copy so later assignments
        # act on an independent frame.
        monthly_statistics_df = pd.concat(stats_list, ignore_index=True)
        monthly_statistics_df = monthly_statistics_df[['Component', 'Month', 'Mean', 'Median', 'Mode']].copy()

        # Semester months come first. Any other month present in the data is
        # appended rather than dropped, because a value missing from the
        # categories would become NaN.
        month_order = ['September', 'October', 'November', 'December']
        later_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']
        present = set(monthly_statistics_df['Month'].dropna())
        unrecognised = sorted(
            (m for m in present if m not in month_order and m not in later_months),
            key=str,
        )
        categories = month_order + [m for m in later_months if m in present] + unrecognised

        monthly_statistics_df['Month'] = pd.Categorical(
            monthly_statistics_df['Month'], categories=categories, ordered=True
        )
        monthly_statistics_df.sort_values(by=['Month', 'Component'], inplace=True)
        monthly_statistics_df['Component'] = monthly_statistics_df['Component'].replace("Attendence", "Attendance")

        print("Monthly statistics calculated successfully.")
        return monthly_statistics_df

    except Exception as e:
        print(f"Error in calculating monthly statistics: {str(e)}")
        return None

In [30]:
# Step 4-2 Analyse Semester Statistics
def calculate_semester_statistics(df):
    """
    Summarise each tracked component over the whole semester.

    For every one of 'Quiz', 'Lecture', 'Assignment', 'Attendence' and 'Survey'
    found in ``df``, the mean, median and (smallest) mode are computed and
    rounded to one decimal place.

    Returns:
    - DataFrame with columns: ['Component', 'Mean', 'Median', 'Mode'], with
      'Attendence' relabelled 'Attendance'; None if an error occurs.
    """
    def _summarise(column):
        modes = column.mode()
        return {
            'Mean': round(column.mean(), 1),
            'Median': round(column.median(), 1),
            # 0 stands in when the column has no mode at all (e.g. it is empty).
            'Mode': round(modes.iloc[0], 1) if not modes.empty else 0,
        }

    try:
        tracked = ('Quiz', 'Lecture', 'Assignment', 'Attendence', 'Survey')
        present = [name for name in tracked if name in df.columns]
        if len(present) == 0:
            raise ValueError("No valid components found for statistics.")

        per_component = {name: _summarise(df[name]) for name in present}

        # Components become rows, the three statistics become columns.
        semester_statistics_df = pd.DataFrame(per_component).T.reset_index()
        semester_statistics_df = semester_statistics_df.rename(columns={'index': 'Component'})
        semester_statistics_df['Component'] = semester_statistics_df['Component'].replace("Attendence", "Attendance")
        print("Semester statistics calculated successfully.")
        return semester_statistics_df

    except Exception as e:
        print(f"Error in calculating semester statistics: {str(e)}")
        return None

In [31]:
# Step 4-3:  Display the user interaction for each component
# Purpose:
# Plots a bar chart of the total user interactions for each component.
def plot_user_interaction_histogram(df):
    """Draw a bar chart of total interactions per component.

    Only the columns 'Assignment', 'Quiz', 'Lecture', 'Book', 'Project' and
    'Course' that exist in ``df`` are summed and plotted. Errors, including
    the case where none of those columns exist, are caught and printed.
    """
    try:
        wanted = ['Assignment', 'Quiz', 'Lecture', 'Book', 'Project', 'Course']
        plottable = [name for name in wanted if name in df.columns]

        if not plottable:
            raise ValueError("No components found for histogram plotting.")

        # One bar per component: the column total across all rows.
        totals = df[plottable].sum()
        totals.plot(
            kind='bar',
            figsize=(10, 6),
            title="User Interaction Across Components",
            xlabel="Component",
            ylabel="Number of Interactions",
        )

        plt.xticks(rotation=45, fontsize=12)
        plt.yticks(fontsize=12)
        plt.tight_layout()
        plt.show()
        print("Step 4-3: Histogram of user interactions generated successfully.")
    except Exception as e:
        print(f"Step 4-3: Error during histogram plotting: {str(e)}")

In [32]:
# Step 4-4: Scatter plots with regression line and correlation coefficient
# Purpose: 
# - calculate_correlation: Calculate the Pearson correlation coefficient between two variables.
# - calculate_regression_coefficients: Perform linear regression and return coefficients.
# - identify_outliers: Identify outliers based on the threshold of standard deviations.

def calculate_correlation(x, y):
    """Return ``x.corr(y)``, or ``None`` (after printing the error) if that call fails."""
    try:
        coefficient = x.corr(y)
    except Exception as e:
        print(f"Error calculating correlation: {str(e)}")
        return None
    return coefficient

def calculate_regression_coefficients(x, y):
    """Fit a degree-1 polynomial to (x, y) with ``np.polyfit``.

    Returns the coefficient array (highest power first, i.e. slope then
    intercept), or ``None`` after printing the error if the fit fails.
    """
    try:
        slope_and_intercept = np.polyfit(x, y, 1)
    except Exception as e:
        print(f"Error calculating regression coefficients: {str(e)}")
        return None
    return slope_and_intercept

def identify_outliers(y, threshold=3):
    """Flag values lying more than ``threshold`` standard deviations from the mean.

    Returns an element-wise boolean mask that is True where a value is above
    ``mean + threshold * std`` or below ``mean - threshold * std``. If the
    computation fails, the error is printed and ``None`` is returned.
    """
    try:
        centre = y.mean()
        spread = threshold * y.std()
        upper_bound = centre + spread
        lower_bound = centre - spread
        return (y > upper_bound) | (y < lower_bound)
    except Exception as e:
        print(f"Error identifying outliers: {str(e)}")
        return None

# EOF