In [3]:
import pandas as pd
from prefixspan import PrefixSpan

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from IPython.display import display
import ipywidgets as widgets
import io


In [4]:
# ======================================
# Load & Preprocess
# ======================================
def load_and_preprocess_data(file_path):
    """Read the interaction log and derive one activity sequence per student.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts. The CSV must contain
        ``student_id``, ``timestamp`` and ``success_label``. The columns
        ``video_watched_percent``, ``notes_taken``, ``quiz_score``,
        ``assignment_score`` and ``time_spent_minutes`` are used when present.

    Returns
    -------
    df : pd.DataFrame
        De-duplicated rows sorted by ``student_id`` then ``timestamp``, with
        an added ``activity`` column such as ``"video_notes_quiz_pass"``.
    sequences : pd.DataFrame
        One row per ``(student_id, success_label)`` pair with the list of
        activities in chronological order (``activity``) and its length
        clipped to the 1.5 * IQR fences (``sequence_length``).
    """
    df = pd.read_csv(file_path)
    df['success_label'] = df['success_label'].astype(int)
    df = df.drop_duplicates()

    # Missing time is imputed with the median; missing scores/notes count as 0.
    df = df.fillna({
        'time_spent_minutes': df['time_spent_minutes'].median() if 'time_spent_minutes' in df.columns else 0,
        'quiz_score': 0,
        'assignment_score': 0,
        'notes_taken': 0
    })

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(by=['student_id', 'timestamp'])

    def label_column(column, hit, miss, reached):
        """Map one column to `hit`/`miss` labels; an absent column is all `miss`."""
        if column not in df.columns:
            return pd.Series(miss, index=df.index, dtype=object)
        # NaN compares False, so unfilled values fall on the `miss` side.
        return reached(df[column]).map({True: hit, False: miss})

    # Vectorised replacement for a row-wise DataFrame.apply: three labels
    # joined into a single token per interaction.
    df['activity'] = (
        label_column('video_watched_percent', "video", "skip_video", lambda col: col >= 70)
        + "_"
        + label_column('notes_taken', "notes", "no_notes", lambda col: col > 0)
        + "_"
        + label_column('quiz_score', "quiz_pass", "quiz_fail", lambda col: col >= 70)
    )

    # Rows are already time-ordered, so each list is a chronological sequence.
    sequences = (
        df.groupby(['student_id', 'success_label'])['activity']
        .apply(list)
        .reset_index()
    )

    sequences['sequence_length'] = sequences['activity'].apply(len)

    # ---- Outliers Handling (IQR) ----
    # Only the numeric length is clipped; the activity lists keep every item.
    Q1 = sequences['sequence_length'].quantile(0.25)
    Q3 = sequences['sequence_length'].quantile(0.75)
    IQR = Q3 - Q1
    sequences['sequence_length'] = sequences['sequence_length'].clip(
        Q1 - 1.5 * IQR,
        Q3 + 1.5 * IQR
    )

    return df, sequences

# ======================================
# PrefixSpan
# ======================================
def run_prefixspan(sequences, min_support=5, top_k=5):
    """Mine the most frequent activity patterns separately per success group.

    Parameters
    ----------
    sequences : pd.DataFrame
        Needs a ``success_label`` column (1 = high, 0 = low) and an
        ``activity`` column holding one list of activity tokens per row.
    min_support : int
        Threshold passed to ``PrefixSpan.frequent``.
    top_k : int
        Number of patterns kept per group after sorting by support.

    Returns
    -------
    (high_patterns, low_patterns)
        Two lists of ``(support, pattern)`` entries, highest support first.
    """
    def top_patterns(label):
        # Sequences belonging to one success group only.
        group = sequences.loc[sequences['success_label'] == label, 'activity'].tolist()

        miner = PrefixSpan(group)
        miner.minlen = 2  # the prefixspan package's minimum pattern length

        ranked = sorted(
            miner.frequent(min_support),
            key=lambda entry: entry[0],
            reverse=True,
        )
        return ranked[:top_k]

    high_patterns = top_patterns(1)
    low_patterns = top_patterns(0)
    return high_patterns, low_patterns

# ======================================
# Simple GSP
# ======================================
def run_gsp(sequences, min_support=5, max_length=1):
    """Level-wise (GSP-style) frequent sequential pattern search.

    Parameters
    ----------
    sequences : pd.DataFrame
        Needs an ``activity`` column holding one list of tokens per row.
    min_support : int
        Minimum number of sequences that must contain a pattern.
    max_length : int
        Longest pattern to search for. The default of 1 reproduces the
        original behaviour (frequent single items only).

    Returns
    -------
    list of (support, pattern)
        ``pattern`` is a list of tokens; entries are sorted in descending
        order (support first, then the pattern list itself).

    Raises
    ------
    ValueError
        If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    def is_subsequence(sub, seq):
        # `item in it` consumes the iterator, so items must appear in order
        # (gaps allowed) for every membership test to succeed.
        it = iter(seq)
        return all(item in it for item in sub)

    all_sequences = sequences['activity'].tolist()
    items = sorted({item for seq in all_sequences for item in seq})

    def support_of(candidate):
        return sum(is_subsequence(candidate, seq) for seq in all_sequences)

    patterns = []

    # Level 1: frequent single items.
    frontier = []
    for item in items:
        support = support_of([item])
        if support >= min_support:
            patterns.append((support, [item]))
            frontier.append([item])
    frequent_items = [pattern[0] for pattern in frontier]

    # Level k: extend each frequent (k-1)-pattern by one frequent item.
    # Every frequent pattern has a frequent prefix and a frequent last item,
    # so this candidate set cannot miss a frequent pattern.
    for _ in range(2, max_length + 1):
        next_frontier = []
        for prefix in frontier:
            for item in frequent_items:
                candidate = prefix + [item]
                support = support_of(candidate)
                if support >= min_support:
                    patterns.append((support, candidate))
                    next_frontier.append(candidate)
        if not next_frontier:
            break
        frontier = next_frontier

    return sorted(patterns, reverse=True)

# ======================================
# Build ML Dataset
# ======================================
def build_ml_dataset(df, sequences):
    """Build the feature matrix and label vector used by the classifiers.

    Parameters
    ----------
    df : pd.DataFrame
        Interaction-level rows with ``student_id`` and, optionally,
        ``time_spent_minutes``.
    sequences : pd.DataFrame
        One row per sequence with ``student_id``, ``sequence_length`` and
        ``success_label``.

    Returns
    -------
    X : pd.DataFrame
        ``sequence_length`` plus ``avg_time_spent`` (when the time column
        exists); missing values are replaced by 0.
    y : pd.Series
        ``success_label``, row-aligned with ``X``. Both carry a fresh 0..n-1
        index so they stay aligned even if ``sequences`` was filtered or
        re-indexed before the call (``merge`` would otherwise renumber ``X``
        only).
    """
    features = sequences[['student_id', 'sequence_length']].reset_index(drop=True)

    if 'time_spent_minutes' in df.columns:
        avg_time = (
            df.groupby('student_id')['time_spent_minutes']
            .mean()
            .reset_index()
            .rename(columns={'time_spent_minutes': 'avg_time_spent'})
        )
        # avg_time has one row per student, so a left merge keeps row count
        # and order of `features`.
        features = features.merge(avg_time, on='student_id', how='left')

    features = features.fillna(0)

    X = features.drop(columns=['student_id'])
    y = sequences['success_label'].reset_index(drop=True)

    return X, y

# ======================================
# Train & Evaluate
# ======================================
def _score_classifier(model, X_test, y_test):
    """Return accuracy, log loss and the classification report for one fitted model."""
    predictions = model.predict(X_test)  # computed once, reused by both metrics
    return {
        'accuracy': accuracy_score(y_test, predictions),
        # Passing the fitted classes keeps log_loss well-defined even when a
        # class is absent from the test split.
        'log_loss': log_loss(y_test, model.predict_proba(X_test), labels=model.classes_),
        'report_dict': classification_report(y_test, predictions, output_dict=True)
    }


def train_and_evaluate_models(X, y):
    """Fit a tuned random forest and a logistic regression; score both on a hold-out split.

    Parameters
    ----------
    X : array-like or pd.DataFrame
        Feature matrix.
    y : array-like or pd.Series
        Class labels, row-aligned with ``X``.

    Returns
    -------
    dict
        ``{'Random Forest': {...}, 'Logistic Regression': {...}}`` where each
        value holds ``accuracy``, ``log_loss`` and ``report_dict``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training split only, so no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    results = {}

    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10]
    }

    grid = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=3,
        scoring='accuracy'
    )
    grid.fit(X_train, y_train)

    results['Random Forest'] = _score_classifier(grid.best_estimator_, X_test, y_test)

    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)

    results['Logistic Regression'] = _score_classifier(lr, X_test, y_test)

    return results


In [5]:
# Single-file CSV picker.
# NOTE(review): a later cell creates another FileUpload under the same name
# and wires the "Load Dataset" button to that one, so a file chosen in this
# widget is not the one the button reads.
upload_widget = widgets.FileUpload(multiple=False, accept=".csv")
display(upload_widget)


FileUpload(value=(), accept='.csv', description='Upload')

In [None]:
import ipywidgets as widgets
from IPython.display import display
import io

upload_widget = widgets.FileUpload(accept=".csv", multiple=False)
process_button = widgets.Button(description="Load Dataset")

output = widgets.Output()


def _first_uploaded_bytes(value):
    """Return the raw bytes of the first uploaded file, or None if there is none.

    ``FileUpload.value`` is a tuple of dict-like entries in ipywidgets 8 and
    a dict keyed by file name in ipywidgets 7; both are handled here. The
    ``content`` field may be a memoryview, so it is converted to ``bytes``.
    """
    if not value:
        return None
    entry = next(iter(value.values())) if isinstance(value, dict) else value[0]
    return bytes(entry['content'])


def on_button_click(b):
    """Button callback: parse the uploaded CSV and publish the results.

    On success the preprocessed frames are stored in the notebook globals
    ``loaded_df`` and ``loaded_sequences``; messages go to ``output``.
    """
    global loaded_df, loaded_sequences
    with output:
        output.clear_output()
        content = _first_uploaded_bytes(upload_widget.value)
        if content is None:
            print("Please upload a CSV file.")
            return

        df, sequences = load_and_preprocess_data(io.BytesIO(content))
        print("Dataset loaded successfully!")
        print("DataFrame shape:", df.shape)
        print("Sequences shape:", sequences.shape)

        # Store the data for use in later cells
        loaded_df, loaded_sequences = df, sequences

process_button.on_click(on_button_click)

display(upload_widget, process_button, output)


In [None]:
# The "Load Dataset" button stores its results as loaded_df / loaded_sequences;
# bind the short names that the analysis cells below use.
if "loaded_df" not in globals() or "loaded_sequences" not in globals():
    raise RuntimeError("Upload a CSV and click 'Load Dataset' before running this cell.")

df, sequences = loaded_df, loaded_sequences
df.head()


In [None]:
# Mine frequent patterns: PrefixSpan per success group, simple GSP over all students.
high_patterns, low_patterns = run_prefixspan(sequences, top_k=5, min_support=5)
gsp_patterns = run_gsp(sequences, min_support=5)

for heading, mined in (
    ("Top patterns for high success students:", high_patterns),
    ("Top patterns for low success students:", low_patterns),
    ("GSP patterns:", gsp_patterns),
):
    print(heading, mined)


In [None]:
# Assemble the feature matrix and the label vector for the classifiers.
X, y = build_ml_dataset(df=df, sequences=sequences)

print("ML dataset prepared!")
for caption, part in (("Features shape:", X), ("Labels shape:", y)):
    print(caption, part.shape)


In [None]:
# Train both classifiers, then print the headline metrics and show the
# per-class report as a table (one row per class / average).
results = train_and_evaluate_models(X, y)

for model_name in results:
    metrics = results[model_name]
    report_table = pd.DataFrame(metrics['report_dict']).T

    print(f"\n===== {model_name} =====")
    print("Accuracy:", metrics['accuracy'])
    print("Log Loss:", metrics['log_loss'])
    display(report_table)


In [None]:
# Distribution of sequence length
# The hue is a named column with an explicit palette dict, so seaborn builds
# the legend itself. Overriding it with plt.legend(labels=[...]) assigns the
# labels by artist draw order, which can swap "Low" and "High".
success_names = {0: 'Low Success', 1: 'High Success'}
length_plot_df = sequences.assign(
    **{'Success Label': sequences['success_label'].map(success_names)}
)

plt.figure(figsize=(10,6))
sns.histplot(
    data=length_plot_df,
    x='sequence_length',
    hue='Success Label',
    hue_order=['Low Success', 'High Success'],
    palette={'Low Success': 'red', 'High Success': 'green'},
    bins=20,
    kde=True,
)
plt.title("Distribution of Sequence Length by Success Label")
plt.xlabel("Sequence Length")
plt.ylabel("Count")
plt.show()


def plot_activity_counts(sequences, label, title, palette):
    """Bar-plot how often each activity token occurs for one success group.

    Parameters
    ----------
    sequences : pd.DataFrame
        Needs ``success_label`` and a list-valued ``activity`` column.
    label : int
        Value of ``success_label`` selecting the group.
    title : str
        Figure title.
    palette : str
        Seaborn palette name for the bars.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences across the group's sequences.
    """
    group = sequences.loc[sequences['success_label'] == label, 'activity']
    counts = Counter(item for seq in group for item in seq)

    plt.figure(figsize=(10,6))
    sns.barplot(x=list(counts.keys()), y=list(counts.values()), palette=palette)
    plt.xticks(rotation=45)
    plt.title(title)
    plt.xlabel("Activity")
    plt.ylabel("Count")
    plt.show()
    return counts


# Activity counts for high success
high_counts = plot_activity_counts(
    sequences, 1, "Activity Counts for High Success Students", "Greens_d"
)

# Activity counts for low success
low_counts = plot_activity_counts(
    sequences, 0, "Activity Counts for Low Success Students", "Reds_d"
)


In [None]:
# High success: one bar per mined pattern, labelled by its items joined with "_"
high_support = [support for support, _items in high_patterns]
high_labels = ["_".join(items) for _support, items in high_patterns]

plt.figure(figsize=(10, 5))
sns.barplot(x=high_labels, y=high_support, palette="Greens_d")
plt.title("Top PrefixSpan Patterns - High Success Students")
plt.ylabel("Support")
plt.xticks(rotation=45)
plt.show()

# Low success: same chart for the low-success group
low_support = [support for support, _items in low_patterns]
low_labels = ["_".join(items) for _support, items in low_patterns]

plt.figure(figsize=(10, 5))
sns.barplot(x=low_labels, y=low_support, palette="Reds_d")
plt.title("Top PrefixSpan Patterns - Low Success Students")
plt.ylabel("Support")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Flatten sequences: one record per (student, step) with the 1-based position
# of each activity inside the student's sequence.
plotly_data = [
    {
        'student_id': student_id,
        'step': step,
        'activity': activity,
        'success_label': success_label,
    }
    for student_id, success_label, activities in zip(
        sequences['student_id'], sequences['success_label'], sequences['activity']
    )
    for step, activity in enumerate(activities, start=1)
]

# Explicit columns keep the frame well-formed (and the mapping below valid)
# even when there are no sequences at all.
plotly_df = pd.DataFrame(
    plotly_data,
    columns=['student_id', 'step', 'activity', 'success_label'],
)
plotly_df['success_color'] = plotly_df['success_label'].map({1: 'High Success', 0: 'Low Success'})


In [None]:
# One marker per (student, step): colour encodes the success group,
# marker symbol encodes the activity token.
fig = px.scatter(
    plotly_df,
    x='step', y='student_id',
    color='success_color',
    color_discrete_map={'High Success':'green', 'Low Success':'red'},
    symbol='activity',
    hover_data=['student_id','activity','success_label'],
    title="Interactive Student Activity Sequences by Success",
    height=600,
)
fig.update_traces(marker={'size': 12}, selector={'mode': 'markers'})
# Reverse the y-axis direction.
fig.update_layout(yaxis={'autorange': "reversed"})
fig.show()


In [None]:
# Mean success_label for every (activity, step) combination.
# NOTE(review): fill_value=0 makes combinations with no observations show up
# as a 0 success rate -- confirm that this is the intended reading.
heatmap_df = plotly_df.pivot_table(
    values='success_label',
    index='activity',
    columns='step',
    aggfunc='mean',
    fill_value=0,
)

fig = px.imshow(
    heatmap_df,
    title="Interactive Heatmap of Activity Success per Sequence Step",
    labels={'x': "Sequence Step", 'y': "Activity", 'color': "Success Rate"},
    color_continuous_scale=['red','green'],
    aspect="auto",
    text_auto=True,
)
fig.show()
