In [1]:
import numpy as np
import pandas as pd
import csv, sqlite3
import matplotlib.pyplot as plt


In [2]:
def valid_data(user_log_df, activity_log_df):
    """Check that the columns shared by both logs hold identical data.

    The two frames are compared only on the columns whose names appear in
    both of them. If those shared columns disagree anywhere, the data is
    deemed invalid and processing cannot continue.

    Args:
        user_log_df: DataFrame holding the user log.
        activity_log_df: DataFrame holding the activity log.

    Returns:
        bool: True when every shared column matches row for row, False when
        any value differs or when the two frames do not have the same row
        and column labels. A NaN in a shared column compares unequal to
        itself and therefore also yields False.
    """
    intersection = np.intersect1d(user_log_df.columns, activity_log_df.columns)

    user_int = user_log_df[intersection]
    activity_int = activity_log_df[intersection]

    # An element-wise ``==`` between DataFrames raises ValueError unless both
    # sides are identically labelled, so frames whose rows (or selected
    # columns) do not line up are reported as invalid instead of crashing.
    same_rows = user_int.index.equals(activity_int.index)
    same_cols = user_int.columns.equals(activity_int.columns)
    if not (same_rows and same_cols):
        return False

    return bool((user_int == activity_int).all(axis=1).all())


In [3]:
def prep_df(dataframes):
    """Identify the user log, activity log and component frames and validate them.

    Each frame is recognised by a marker column: "Date" for the user log,
    "Action" for the activity log and "Code" for the component table. The
    first frame containing the marker column wins.

    Args:
        dataframes: Iterable of DataFrames read from the selected files, or
            None when nothing was selected.

    Returns:
        A ``(user_log_df, activity_log_df, component_df)`` tuple on success.
        On any failure the reason is reported through ``gui.set_status`` and
        None is returned, so callers must check the result before unpacking.
    """
    if dataframes is None:
        gui.set_status("No data selected")
        return

    user_log_df = next((f for f in dataframes if "Date" in f.columns), None)
    activity_log_df = next((f for f in dataframes if "Action" in f.columns), None)
    component_df = next((f for f in dataframes if "Code" in f.columns), None)

    if user_log_df is None:
        gui.set_status("User Log Data Not Found")
        return
    if activity_log_df is None:
        gui.set_status("Activity Log Data Not Found")
        return
    if component_df is None:
        gui.set_status("Component Data Not Found")
        return

    # valid_data needs both logs; calling it without arguments raised
    # TypeError on every run.
    if not valid_data(user_log_df, activity_log_df):
        gui.set_status("Data is not valid")
        return
    return user_log_df, activity_log_df, component_df

In [4]:
def transformation_remove(user_log_df, activity_log_df, component_df):
    """Remove every row belonging to the System and Folder components.

    1. REMOVE: No outputs should include any data from Component: System,
    and Folder.

    The three frames are modified IN PLACE, because the function reports
    only a status flag and callers keep using the frames they passed in.
    Rows are matched on the 'Component' column of ``activity_log_df`` and
    ``component_df``. The row labels removed from the activity log are also
    removed from ``user_log_df``, which has no 'Component' column of its own.

    Args:
        user_log_df: User log; rows sharing an index label with a removed
            activity row are dropped.
        activity_log_df: Activity log with a 'Component' column.
        component_df: Component table with a 'Component' column.

    Returns:
        bool: True when the user log and activity log still have the same
        number of rows after cleaning, False otherwise.

    Raises:
        KeyError: if ``activity_log_df`` or ``component_df`` has no
            'Component' column.
    """
    excluded_components = ['System', 'Folder']

    activity_drop = activity_log_df.index[
        activity_log_df['Component'].isin(excluded_components)]
    component_drop = component_df.index[
        component_df['Component'].isin(excluded_components)]

    # Propagate deletion from activity_log_df to user_log_df. Only labels
    # that actually exist in the user log are selected, so a label missing
    # from it cannot raise.
    user_drop = user_log_df.index[user_log_df.index.isin(activity_drop)]

    # Drop in place: rebinding local names would leave the caller's frames
    # untouched, and no temporary marker column is left behind.
    user_log_df.drop(index=user_drop, inplace=True)
    activity_log_df.drop(index=activity_drop, inplace=True)
    component_df.drop(index=component_drop, inplace=True)

    print(len(user_log_df), "<<< user")
    print(len(activity_log_df), "<<< act")
    print(len(component_df), "<<< comp")

    # Callers treat a falsy result as failure, so equal lengths mean success.
    return len(user_log_df) == len(activity_log_df)


In [5]:
def transformation_rename(user_log_df, activity_log_df):
    """Rename the anonymised user-name column to ``User_ID`` in both logs.

    2. RENAME: The column "User Full Name *Anonymized" should be renamed
    as User_ID both in ACTIVITY_LOG and USER_LOG CSVs.

    Both frames are renamed in place; nothing is returned. A frame that
    lacks the column is left unchanged.
    """
    renames = {'User Full Name *Anonymized': 'User_ID'}
    for frame in (activity_log_df, user_log_df):
        frame.rename(columns=renames, inplace=True)

In [6]:
def transformation_merge(user_log_df, activity_log_df):
    """Place the user log and the activity log side by side.

    3. MERGE: Merge the suitable CSVs for analysing user interactions with
    each component.

    The activity log's 'User_ID' column is removed first so the result
    carries only the user log's copy. Rows are aligned on the index labels
    of the two frames.

    Returns:
        A new DataFrame with the user-log columns followed by the remaining
        activity-log columns.
    """
    activity_without_id = activity_log_df.drop(columns='User_ID')
    return pd.concat([user_log_df, activity_without_id], axis='columns')

In [7]:
def transformation_reshape(merge_log_df, col_name, period):
    """Pivot the merged log into per-user, per-period interaction counts.

    4. RESHAPE: Reshape the data using pivot operation.

    Args:
        merge_log_df: Merged log with 'Date', 'Time', 'User_ID', 'Component'
            and 'Action' columns. 'Date' is parsed with the format
            '%d/%m/%Y %H:%M' and 'Time' is added to it as a timedelta.
            The frame is not modified.
        col_name: Name of the period column to create (e.g. "Month").
        period: pandas period frequency passed to ``dt.to_period``
            (e.g. 'M').

    Returns:
        DataFrame with one row per (User_ID, ``col_name``) pair and one
        column per component holding the number of 'Action' entries
        (0 where there are none).
    """
    timestamps = (pd.to_datetime(merge_log_df['Date'], format='%d/%m/%Y %H:%M') +
                  pd.to_timedelta(merge_log_df['Time']))

    # Replace the separate 'Date'/'Time' columns with one combined column,
    # working on a new frame so the caller's data stays as it was.
    reshaped = merge_log_df.drop(columns=['Date', 'Time']).assign(Datetime=timestamps)
    reshaped[col_name] = reshaped['Datetime'].dt.to_period(period)

    pivot_data = reshaped.pivot_table(
        # Group on the requested period column; a hard-coded 'Month' made
        # every other col_name fail with KeyError.
        index=['User_ID', col_name],
        columns='Component',
        values='Action',
        aggfunc='count',
        fill_value=0
    )

    # Flatten the column index into plain labels before restoring the
    # grouping keys as ordinary columns.
    pivot_data.columns = list(pivot_data.columns)

    return pivot_data.reset_index()


In [8]:
def transformation_count(pivot_data, col_name):
    """Add a 'Total Interaction' column summing every component count per row.

    5. COUNT: The interactions for each user with the Component for each
    month.

    Args:
        pivot_data: Pivoted frame with 'User_ID', the period column and one
            count column per component. Modified in place.
        col_name: Name of the period column to leave out of the sum.

    Returns:
        None. The total is written to ``pivot_data['Total Interaction']``.
    """
    total_col = 'Total Interaction'
    # Leave out an existing total as well, so running this twice (e.g. when
    # a notebook cell is re-executed) does not add the old total into the
    # new one.
    non_component = ['User_ID', col_name, total_col]
    component_counts = pivot_data.loc[:, ~pivot_data.columns.isin(non_component)]
    pivot_data[total_col] = component_counts.sum(axis=1)


In [9]:
def transformation_output_statistics(pivot_data, period):
    """Compute mean, median and mode across selected components for each row.

    OUTPUT STATISTICS: Produce the mean, mode and median for the components:
    Quiz, Lecture, Assignment, Attendance, and Survey. The statistics are
    taken across those component columns for every row of ``pivot_data``,
    so their granularity (monthly, whole semester, ...) is whatever the
    pivot passed in was built with.

    Args:
        pivot_data: Pivoted frame containing 'User_ID', the ``period``
            column and component count columns.
        period: Name of the period column in ``pivot_data`` (e.g. "Month").

    Returns:
        DataFrame with 'User_ID', ``period``, 'Mean', 'Median' and 'Mode'.
        When several values tie for the mode, the first one returned by
        ``DataFrame.mode`` is used. If none of the components is present
        the three statistics are NaN.

    Raises:
        KeyError: if 'User_ID' or ``period`` is missing from ``pivot_data``.
    """
    # NOTE(review): the requirement text says "Attendance" while the code
    # used "Attendence"; both spellings are accepted until the spelling in
    # the source data is confirmed.
    candidate_components = ['Quiz', 'Lecture', 'Assignment',
                            'Attendance', 'Attendence', 'Survey']
    # A component with no interactions has no pivot column; skip it rather
    # than fail with KeyError, as the correlation output already does.
    present = [c for c in candidate_components if c in pivot_data.columns]

    monthly_stats = pivot_data[['User_ID', period]].copy()
    if not present:
        for stat in ('Mean', 'Median', 'Mode'):
            monthly_stats[stat] = float('nan')
        return monthly_stats

    data_df = pivot_data[present]
    monthly_stats['Mean'] = data_df.mean(axis=1)
    monthly_stats['Median'] = data_df.median(axis=1)
    monthly_stats['Mode'] = data_df.mode(axis=1).iloc[:, 0]
    return monthly_stats


In [10]:
def transformation_output_correlation(pivot_data):
    """Return the correlation matrix of User_ID and selected components.

    The components considered are Assignment, Quiz, Lecture, Book, Project
    and Course; any that have no column in ``pivot_data`` are skipped.
    'User_ID' is always included in the matrix.

    Returns:
        DataFrame produced by ``DataFrame.corr()`` over the selected columns.
    """
    wanted = ('Assignment', 'Quiz', 'Lecture', 'Book', 'Project', 'Course')
    selected = ['User_ID']
    for name in wanted:
        if name in pivot_data.columns:
            selected.append(name)
    return pivot_data[selected].corr()

In [11]:
# NOTE(review): not referenced anywhere else in the visible notebook.
# Presumably maps a column header to a display dimension used by the GUI —
# confirm the intended meaning (and unit) of the value, or remove if unused.
headerDimensions = {"Datetime": 3}


def extract_data(df: pd.DataFrame):
    """Print the column index of ``df``; returns nothing."""
    print(df.columns)

In [12]:
def import_hook(files):
    """Read the selected CSV files and run the full transformation pipeline.

    Steps: read every file, identify and validate the three frames, remove
    excluded components, rename the user column, merge the logs, then build
    the monthly and yearly pivots, their statistics and the correlation
    matrix. Problems are reported through ``gui.set_status``.

    Args:
        files: Iterable of CSV paths (or file objects) accepted by
            ``pd.read_csv``.

    Returns:
        None. NOTE(review): the computed statistics and correlation matrix
        are not yet handed to the GUI or returned to the caller.
    """
    print("import files>>>>", files)
    try:
        dataframes = [pd.read_csv(file) for file in files]
    except Exception as e:
        gui.set_status("File Read error: " + str(e))
        return

    prepared = prep_df(dataframes)
    if prepared is None:
        # prep_df has already reported the reason through gui.set_status;
        # unpacking its None result would raise TypeError.
        return
    user_log_df, activity_log_df, components_df = prepared

    # A failed clean-up is reported but, as before, does not stop the run.
    if not transformation_remove(user_log_df, activity_log_df, components_df):
        gui.set_status("Cleaning resulted in incompatible data types")

    transformation_rename(user_log_df, activity_log_df)

    merged_df: pd.DataFrame = transformation_merge(user_log_df, activity_log_df)
    extract_data(merged_df)

    # gui.show_merge_data(None, merged_df)

    # monthly statistics
    month_pivot_df = transformation_reshape(merged_df.copy(), "Month", period='M')
    transformation_count(month_pivot_df, "Month")
    monthly_stats = transformation_output_statistics(month_pivot_df, "Month")

    # total statistics
    year_pivot_df = transformation_reshape(merged_df.copy(), "Year", period='Y')
    # The yearly statistics must come from the yearly pivot: the monthly
    # pivot has no "Year" column, so using it raised KeyError.
    year_stats = transformation_output_statistics(year_pivot_df, "Year")

    corr_matrix = transformation_output_correlation(year_pivot_df)


In [None]:

# Project-local GUI module; it is imported in this last cell rather than in
# the import cell at the top of the notebook.
from gui import Gui

# Module-level `gui` object: prep_df and import_hook report their status
# through gui.set_status, so this cell must have run before they are called.
# NOTE(review): the meaning of the `None` argument is not visible here —
# confirm against Gui's constructor.
gui = Gui(None)

# Start the GUI main loop. NOTE(review): presumably this blocks until the
# window is closed — confirm.
gui.mainloop()
