In [1]:
# initialization
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Root folder of the KDD Cup 2015 data. Defaults to the original local path;
# set the KDD_DATA_DIR environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get('KDD_DATA_DIR', 'C:\\Users\\Jorda\\KDDCup 2015\\data')

enrollment = pd.read_csv(os.path.join(DATA_DIR, 'train', 'enrollment_train.csv'))
logs = pd.read_csv(os.path.join(DATA_DIR, 'train', 'log_train.csv'))
truth = pd.read_csv(os.path.join(DATA_DIR, 'train', 'truth_train.csv'))
dates = pd.read_csv(os.path.join(DATA_DIR, 'ObjectData', 'date.csv'))

# Feature table: starts with one row per row of truth, holding only enrollment_id.
# Later cells merge their feature columns onto it by enrollment_id.
data = truth[['enrollment_id']].copy()

In [2]:
# check columns
# Print the column index of every frame created so far, one labelled line each.
frames_by_label = {
    'enrollment': enrollment,
    'logs': logs,
    'truth': truth,
    'dates': dates,
    'data': data,
}
for label, frame in frames_by_label.items():
    print(f'{label} ', frame.columns)

enrollment  Index(['enrollment_id', 'username', 'course_id'], dtype='object')
logs  Index(['enrollment_id', 'time', 'source', 'event', 'object'], dtype='object')
truth  Index(['enrollment_id', 'dropped_out'], dtype='object')
dates  Index(['course_id', 'from', 'to'], dtype='object')
data  Index(['enrollment_id'], dtype='object')


In [3]:
# avg_timestamp (normalized)
# Mean event time per enrollment, expressed as a fraction of the course window:
# (mean time - from) / (to - from), both measured in seconds.
logs['time'] = pd.to_datetime(logs['time'], errors='coerce')
# errors='coerce' turns unparseable times into NaT; those log rows are discarded here
# and therefore also missing from every later feature derived from logs.
logs = logs.dropna(subset=['time'])

dates['from'] = pd.to_datetime(dates['from'], errors='coerce')
dates['to'] = pd.to_datetime(dates['to'], errors='coerce')

# One row per enrollment: mean log time, then the course and its from/to dates.
avg_timestamp = (
    logs.groupby('enrollment_id')['time'].mean()
    .reset_index()
    .rename(columns={'time': 'avg_timestamp'})
    .merge(enrollment[['enrollment_id', 'course_id']], on='enrollment_id', how='left')
    .merge(dates[['course_id', 'from', 'to']], on='course_id', how='left')
)

avg_timestamp['course_duration'] = (avg_timestamp['to'] - avg_timestamp['from']).dt.total_seconds()
avg_timestamp['time_since_start'] = (avg_timestamp['avg_timestamp'] - avg_timestamp['from']).dt.total_seconds()
avg_timestamp['normalized_avg_timestamp'] = avg_timestamp['time_since_start'] / avg_timestamp['course_duration']

# Drop the column left by a previous run of this cell before merging. Without this,
# a re-run yields normalized_avg_timestamp_x / _y and later column selections by
# name raise KeyError.
data = data.drop(columns=['normalized_avg_timestamp'], errors='ignore').merge(
    avg_timestamp[['enrollment_id', 'normalized_avg_timestamp']],
    on='enrollment_id',
    how='left',
)

print(data)

       enrollment_id  normalized_avg_timestamp
0                  1                  0.633229
1                  4                  0.389164
2                  5                  0.480187
3                  7                  0.404621
4                 13                  0.752272
...              ...                       ...
72390         200888                  0.729606
72391         200895                  0.740215
72392         200897                  0.741361
72393         200901                  0.744485
72394         200904                  0.745877

[72395 rows x 2 columns]


In [4]:
# active_days (normalized)
# Number of distinct calendar dates with at least one log row per enrollment,
# divided by the course length in days.

# Re-parsing is a no-op when the avg_timestamp cell has already run; it is kept so
# this cell does not depend on that cell for dtype conversion.
logs['time'] = pd.to_datetime(logs['time'], errors='coerce')
logs['date'] = logs['time'].dt.date

active_days = logs.groupby('enrollment_id')['date'].nunique().reset_index()
active_days.columns = ['enrollment_id', 'active_days']

dates['from'] = pd.to_datetime(dates['from'], errors='coerce')
dates['to'] = pd.to_datetime(dates['to'], errors='coerce')
# NOTE(review): (to - from).dt.days is the difference between the two dates. If both
# endpoint dates can contain activity, the number of possible active dates is one more
# than this and the ratio below can exceed 1 - confirm whether an inclusive count (+1)
# is intended.
dates['course_duration'] = (dates['to'] - dates['from']).dt.days

# Drop a course_duration column left by an earlier run before merging. Otherwise pandas
# suffixes the duplicates (_x/_y) and the column selection below raises KeyError.
enrollment = enrollment.drop(columns=['course_duration'], errors='ignore').merge(
    dates[['course_id', 'course_duration']], on='course_id', how='left'
)

active_days = active_days.merge(enrollment[['enrollment_id', 'course_duration']], on='enrollment_id', how='left')

active_days['normalized_active_days'] = active_days['active_days'] / active_days['course_duration']

# Same re-run guard for the feature table.
data = data.drop(columns=['normalized_active_days'], errors='ignore').merge(
    active_days[['enrollment_id', 'normalized_active_days']],
    on='enrollment_id',
    how='left',
)

print(data)


       enrollment_id  normalized_avg_timestamp  normalized_active_days
0                  1                  0.633229                0.482759
1                  4                  0.389164                0.310345
2                  5                  0.480187                0.379310
3                  7                  0.404621                0.344828
4                 13                  0.752272                0.275862
...              ...                       ...                     ...
72390         200888                  0.729606                0.034483
72391         200895                  0.740215                0.034483
72392         200897                  0.741361                0.034483
72393         200901                  0.744485                0.034483
72394         200904                  0.745877                0.034483

[72395 rows x 3 columns]


In [5]:
# event_count
# Total number of log rows per enrollment.
event_counts = logs.groupby('enrollment_id').size().reset_index(name='event_count')

# Drop the column left by a previous run of this cell before merging. Without this,
# a re-run yields event_count_x / _y and later selections of 'event_count' raise KeyError.
data = data.drop(columns=['event_count'], errors='ignore').merge(
    event_counts, on='enrollment_id', how='left'
)

data

Unnamed: 0,enrollment_id,normalized_avg_timestamp,normalized_active_days,event_count
0,1,0.633229,0.482759,314
1,4,0.389164,0.310345,99
2,5,0.480187,0.379310,633
3,7,0.404621,0.344828,479
4,13,0.752272,0.275862,463
...,...,...,...,...
72390,200888,0.729606,0.034483,11
72391,200895,0.740215,0.034483,1
72392,200897,0.741361,0.034483,1
72393,200901,0.744485,0.034483,8


In [6]:
# individual event types
# One column per distinct value of logs['event'], holding the number of log rows of
# that type for each enrollment (0 where an enrollment has none of that type).
# NOTE: this rebinds event_counts, which until here held the per-enrollment totals.
event_counts = logs.pivot_table(index='enrollment_id', columns='event', aggfunc='size', fill_value=0)

# Drop event-type columns left by a previous run of this cell before merging. Without
# this, a re-run yields suffixed duplicates (_x/_y) of every event-type column.
data = data.drop(columns=list(event_counts.columns), errors='ignore').merge(
    event_counts, left_on='enrollment_id', right_index=True, how='left'
)

data

Unnamed: 0,enrollment_id,normalized_avg_timestamp,normalized_active_days,event_count,access,discussion,navigate,page_close,problem,video,wiki
0,1,0.633229,0.482759,314,107,0,25,66,87,29,0
1,4,0.389164,0.310345,99,64,0,15,10,6,4,0
2,5,0.480187,0.379310,633,226,34,30,87,170,86,0
3,7,0.404621,0.344828,479,203,33,20,60,94,69,0
4,13,0.752272,0.275862,463,200,4,15,25,150,69,0
...,...,...,...,...,...,...,...,...,...,...,...
72390,200888,0.729606,0.034483,11,2,1,5,1,0,1,1
72391,200895,0.740215,0.034483,1,0,0,1,0,0,0,0
72392,200897,0.741361,0.034483,1,0,0,1,0,0,0,0
72393,200901,0.744485,0.034483,8,3,0,2,2,0,1,0


In [7]:
# dropped_out && drop enrollment_id from data
# Attach the dropped_out label from truth, then discard the join key so that only
# the feature columns and the target remain in data.
labels = truth[['enrollment_id', 'dropped_out']]
data = data.merge(labels, on='enrollment_id', how='left').drop(columns='enrollment_id')

data

Unnamed: 0,normalized_avg_timestamp,normalized_active_days,event_count,access,discussion,navigate,page_close,problem,video,wiki,dropped_out
0,0.633229,0.482759,314,107,0,25,66,87,29,0,0
1,0.389164,0.310345,99,64,0,15,10,6,4,0,0
2,0.480187,0.379310,633,226,34,30,87,170,86,0,0
3,0.404621,0.344828,479,203,33,20,60,94,69,0,1
4,0.752272,0.275862,463,200,4,15,25,150,69,0,0
...,...,...,...,...,...,...,...,...,...,...,...
72390,0.729606,0.034483,11,2,1,5,1,0,1,1,1
72391,0.740215,0.034483,1,0,0,1,0,0,0,0,1
72392,0.741361,0.034483,1,0,0,1,0,0,0,0,1
72393,0.744485,0.034483,8,3,0,2,2,0,1,0,1


In [8]:
# logistic regression modeling
# Alternative feature set (not used): 'normalized_avg_timestamp', 'event_count',
# 'access', 'discussion', 'navigate', 'page_close', 'problem', 'video', 'wiki'
feature_columns = ['normalized_active_days', 'event_count', 'normalized_avg_timestamp']
X = data[feature_columns]  # feature matrix
y = data['dropped_out']    # target variable

# hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# fit a logistic regression with default settings and predict the held-out rows
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[ 1470  1544]
 [  395 11070]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.49      0.60      3014
           1       0.88      0.97      0.92     11465

    accuracy                           0.87     14479
   macro avg       0.83      0.73      0.76     14479
weighted avg       0.86      0.87      0.85     14479



In [9]:
# correlation coefficient per feature
# Pairwise correlations between all columns of data; the dropped_out column is then
# ranked by absolute value, largest first.
correlation_matrix = data.corr()
target_correlations = correlation_matrix['dropped_out']
correlation_with_target = target_correlations.abs().sort_values(ascending=False)

print(correlation_with_target)


dropped_out                 1.000000
normalized_active_days      0.563802
page_close                  0.492389
access                      0.482125
event_count                 0.477518
navigate                    0.458621
video                       0.421997
problem                     0.328497
discussion                  0.202237
normalized_avg_timestamp    0.201780
wiki                        0.165261
Name: dropped_out, dtype: float64


In [10]:
# support vector machine
# Same split parameters and seed as the logistic-regression cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMs are not scale-invariant. The three-feature X mixes a raw count (event_count)
# with ratio features, and fitting a linear-kernel SVC on unscaled data of this size
# is extremely slow. Standardise inside a pipeline so the scaler is fitted on the
# training rows only and re-applied automatically at predict time.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear'),  # You can choose different kernels like 'poly' or 'rbf'
)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# gradient boosting machine
# Uses every column of data except the target as a feature.
y = data['dropped_out']
X = data.drop('dropped_out', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# hyperparameters gathered in one place; the seed makes the fit repeatable
gbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
gbm = GradientBoostingClassifier(**gbm_params).fit(X_train, y_train)

y_pred = gbm.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
