<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/model(1)_claude.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

def _count_daily_events(enrollment_df, log_df, main_events, n_days):
    """Count events per (enrollment, day, event) in one vectorized pass.

    Args:
        enrollment_df: Frame with an ``enrollment_id`` column.
        log_df: Frame with ``enrollment_id``, ``time`` (datetime) and
            ``event`` columns.
        main_events: Event labels to count, in output column order.
        n_days: Number of consecutive 24-hour windows to count per enrollment.

    Returns:
        ``(enrollment_ids, counts)`` where ``enrollment_ids`` lists the
        enrollments that have at least one log row (in ``enrollment_df``
        order) and ``counts`` is an integer array of shape
        ``(len(enrollment_ids), n_days * len(main_events))`` laid out
        day-major, event-minor.
    """
    n_events = len(main_events)
    enrolled_ids = enrollment_df['enrollment_id'].unique()

    # Keep only enrollments that appear in the logs; a missing id can never
    # match a log row, so it is dropped as well.
    has_logs = pd.Index(enrolled_ids).isin(log_df['enrollment_id'])
    kept_ids = enrolled_ids[has_logs & pd.notna(enrolled_ids)]

    counts = np.zeros((len(kept_ids), n_days * n_events), dtype=int)
    if len(kept_ids) == 0:
        return kept_ids, counts

    # Day windows are measured from each enrollment's first log timestamp
    # (over ALL of its events, not only the counted ones), so the zero-based
    # day index is simply the whole number of elapsed 24-hour periods.
    first_seen = log_df.groupby('enrollment_id')['time'].transform('min')
    day_idx = (log_df['time'] - first_seen).dt.days

    event_pos = log_df['event'].map(
        {name: pos for pos, name in enumerate(main_events)}
    )
    row_pos = pd.Index(kept_ids).get_indexer(log_df['enrollment_id'])

    # Discard rows for unknown enrollments, uncounted events, or timestamps
    # that are missing or fall outside the n_days window.
    keep = (
        (row_pos >= 0)
        & event_pos.notna().to_numpy()
        & (day_idx < n_days).to_numpy()
    )
    col_pos = (
        day_idx.to_numpy()[keep].astype(int) * n_events
        + event_pos.to_numpy()[keep].astype(int)
    )

    # np.add.at accumulates correctly when the same cell occurs repeatedly.
    np.add.at(counts, (row_pos[keep], col_pos), 1)
    return kept_ids, counts


def organize_data_for_model1(
    enrollment_path: str = 'enrollment_train.csv',
    log_path: str = 'log_train.csv',
    truth_path: str = 'truth_train.csv',
) -> pd.DataFrame:
    """
    Organize the data files according to Model 1 structure:
    210 features model (LSTM or ANN) with shape (1,210)

    Based on the document, we have:
    - 7 events: access, problem, wiki, discussion, navigate, page_close, video
    - 30 days of data per student
    - 7 events × 30 days = 210 features per student

    "Day d" is the d-th 24-hour window counted from the enrollment's first
    log timestamp. Enrollments without any log row are left out.

    Args:
        enrollment_path: CSV with an ``enrollment_id`` column.
        log_path: CSV with ``enrollment_id``, ``time`` and ``event`` columns.
        truth_path: Header-less CSV read as ``enrollment_id, dropout``.

    Returns:
        DataFrame with the 210 ``day_<d>_<event>`` count columns, followed by
        ``enrollment_id``, the remaining enrollment columns and ``dropout``
        (both attached with left merges on ``enrollment_id``).
    """

    # Load the data files
    enrollment_df = pd.read_csv(enrollment_path)
    log_df = pd.read_csv(log_path)
    truth_df = pd.read_csv(truth_path, header=None, names=['enrollment_id', 'dropout'])

    print("Data loaded successfully:")
    print(f"Enrollment records: {len(enrollment_df)}")
    print(f"Log records: {len(log_df)}")
    print(f"Truth records: {len(truth_df)}")

    # Convert time to datetime
    log_df['time'] = pd.to_datetime(log_df['time'])

    # Define the 7 main events from the document.
    # NOTE(review): if these are the KDD Cup 2015 logs, the raw label may be
    # spelled 'nagivate' - verify against log_train.csv, because a label that
    # never matches leaves every day_*_navigate column at zero.
    main_events = ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']
    n_days = 30

    # Create feature matrix: 210 features (7 events × 30 days)
    enrollment_ids, feature_matrix = _count_daily_events(
        enrollment_df, log_df, main_events, n_days
    )

    # Create column names (day-major, matching the matrix layout)
    column_names = [
        f"day_{day}_{event}"
        for day in range(1, n_days + 1)
        for event in main_events
    ]

    # Create DataFrame
    model1_df = pd.DataFrame(feature_matrix, columns=column_names)
    model1_df['enrollment_id'] = enrollment_ids

    # Merge with enrollment and truth data
    model1_df = model1_df.merge(enrollment_df, on='enrollment_id', how='left')
    model1_df = model1_df.merge(truth_df, on='enrollment_id', how='left')

    print(f"\nModel 1 Data Shape: {model1_df.shape}")
    print(f"Features per student: {len(column_names)}")
    print(f"Total students: {len(model1_df)}")

    return model1_df

def create_model1_summary_table():
    """
    Build and print a reference grid of the Model 1 feature names.

    The grid has one row per day (1 to 30) and one column per event; each
    cell holds the matching feature name ``day_<day>_<event>``. The first
    ten and last five rows are printed, and the full grid is returned as a
    DataFrame whose first column is ``Day``.
    """
    event_names = ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']

    # One row per day: the day label followed by that day's feature names.
    grid_rows = [
        [f"Day {day_no}", *(f"day_{day_no}_{name}" for name in event_names)]
        for day_no in range(1, 31)
    ]
    summary_df = pd.DataFrame(grid_rows, columns=['Day', *event_names])

    print("Model 1 Feature Structure (210 features):")
    print("=" * 80)
    print(summary_df.head(10))
    print("...")
    print(summary_df.tail(5))

    return summary_df

def save_model1_data():
    """
    Build the Model 1 dataset and write it, plus its feature map, to CSV.

    Writes ``model1_210_features.csv`` (the output of
    ``organize_data_for_model1``) and ``model1_feature_structure.csv`` (the
    output of ``create_model1_summary_table``) to the working directory,
    both without the index, and returns the organized dataset.
    """
    features_df = organize_data_for_model1()
    features_df.to_csv('model1_210_features.csv', index=False)

    # The structure table is built after the dataset is saved so the console
    # output keeps its original order.
    structure_df = create_model1_summary_table()
    structure_df.to_csv('model1_feature_structure.csv', index=False)

    for message in (
        "\nFiles saved:",
        "- model1_210_features.csv: Main dataset with 210 features",
        "- model1_feature_structure.csv: Feature structure reference",
    ):
        print(message)

    return features_df

# Script entry point: build and save the dataset, then print a short report.
if __name__ == "__main__":
    # `model1_data` and `feature_cols` stay module-level names so that code
    # run after this block can still refer to them.
    model1_data = save_model1_data()

    # Headline numbers
    print("\nBasic Statistics:")
    print(f"Total students: {model1_data.shape[0]}")
    print(f"Dropout rate: {model1_data['dropout'].mean():.2%}")
    print("Features per student: 210 (7 events × 30 days)")

    # Preview: id, the first ten feature columns, and the label
    print("\nSample of organized data:")
    feature_cols = list(
        filter(lambda name: name.startswith('day_'), model1_data.columns)
    )
    preview_cols = ['enrollment_id', *feature_cols[:10], 'dropout']
    print(model1_data[preview_cols].head())