<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [22]:

# Load the files
enrollment_df = pd.read_csv('enrollment_train.csv')
log_df = pd.read_csv('log_train spliting.csv')
truth_df = pd.read_csv('truth_train.csv', header=None, names=['enrollment_id', 'dropout'])



In [23]:
# Preprocess log file
log_df['time'] = pd.to_datetime(log_df['time'])
min_time_per_enrollment = log_df.groupby('enrollment_id')['time'].min().reset_index().rename(columns={'time': 'start_time'})
log_df = log_df.merge(min_time_per_enrollment, on='enrollment_id')
log_df['day'] = (log_df['time'] - log_df['start_time']).dt.days + 1
log_df.drop(columns=['start_time'], inplace=True)



In [24]:
log_df

Unnamed: 0,enrollment_id,time,source,event,object,day
0,51,2014-06-24 00:26:25,server,access,svhJgT21v1mgUHbZDB1sDNhENCbVJza3,15
1,1309,2014-07-09 16:09:23,server,discussion,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl,26
2,295,2014-07-03 15:44:36,browser,page_close,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl,17
3,1019,2014-06-25 23:03:15,server,navigate,Oj6eQgzrdqBMlaCtaq1IkY6zruSrb71b,19
4,591,2014-07-03 10:18:12,browser,problem,OOcnSY6usL1VMgr8YVRXnwldNEIPpdcj,23
...,...,...,...,...,...,...
45275,1019,2014-07-03 12:59:18,browser,page_close,3T6XwoiMKgol57cm29Rjy8FXVFcIomxl,27
45276,1352,2014-06-13 06:21:17,server,access,LPxTHNUu4a88wyAL6vY8ebFrjaaySuKo,1
45277,692,2014-07-11 01:56:39,server,access,8HaEkId8KHyhJ0pzybn2H5EUUSuzMoOW,22
45278,1423,2014-06-30 15:49:25,server,access,YCs2MV77WBlYakfZZ5oUslaHb0gzWHUC,18


In [25]:
# Create event count per day event day
log_df['count'] = 1
pivot_df = log_df.pivot_table(index=['enrollment_id', 'day'], columns='event', values='count', aggfunc='sum', fill_value=0).reset_index()
pivot_wide_df = pivot_df.pivot_table(index='enrollment_id', columns='day')
pivot_wide_df.columns = [f"{event}_day{day}" for (event, day) in pivot_wide_df.columns]
pivot_wide_df = pivot_wide_df.reset_index()



In [26]:
pivot_wide_df

Unnamed: 0,enrollment_id,access_day1,access_day2,access_day3,access_day4,access_day5,access_day6,access_day7,access_day8,access_day9,...,wiki_day21,wiki_day22,wiki_day23,wiki_day24,wiki_day25,wiki_day26,wiki_day27,wiki_day28,wiki_day29,wiki_day30
0,1,0.0,,,,0.0,,,2.0,,...,0.0,,,,,0.0,0.0,,,
1,3,1.0,,,,,,4.0,,,...,,,0.0,,,,,0.0,,
2,4,2.0,,,,,,,,1.0,...,,,,,,,,,,
3,5,0.0,3.0,4.0,,,,,,,...,,0.0,0.0,,,,,,0.0,0.0
4,6,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,1998,5.0,,,2.0,0.0,,,,,...,,,,0.0,0.0,,,,,
1128,2001,10.0,,1.0,,7.0,0.0,3.0,,1.0,...,,,,,,,,,,
1129,2003,3.0,1.0,,,1.0,0.0,,,,...,0.0,,2.0,,5.0,,,,,
1130,2004,2.0,,0.0,,,,,1.0,0.0,...,,0.0,,,,,,,,


In [27]:
pivot_wide_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132 entries, 0 to 1131
Columns: 211 entries, enrollment_id to wiki_day30
dtypes: float64(210), int64(1)
memory usage: 1.8 MB


In [28]:
# Merge all data
merged_df = enrollment_df.merge(truth_df, on='enrollment_id', how='left')
final_df = merged_df.merge(pivot_wide_df, on='enrollment_id', how='left')



In [29]:
final_df

Unnamed: 0,enrollment_id,username,course_id,dropout,access_day1,access_day2,access_day3,access_day4,access_day5,access_day6,...,wiki_day21,wiki_day22,wiki_day23,wiki_day24,wiki_day25,wiki_day26,wiki_day27,wiki_day28,wiki_day29,wiki_day30
0,1,9Uee7oEuuMmgPx2IzPfFkWgkHZyPbWr0,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,0,0.0,,,,0.0,,...,0.0,,,,,0.0,0.0,,,
1,3,1qXC7Fjbwp66GPQc6pHLfEuO8WKozxG4,7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx,0,1.0,,,,,,...,,,0.0,,,,,0.0,,
2,4,FIHlppZyoq8muPbdVxS44gfvceX9zvU7,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,0,2.0,,,,,,...,,,,,,,,,,
3,5,p1Mp7WkVfzUijX0peVQKSHbgd5pXyl4c,7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx,0,0.0,3.0,4.0,,,,...,,0.0,0.0,,,,,,0.0,0.0
4,6,dpK33RH9yepUAnyoywRwBt1AJzxGlaja,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,0,3.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120537,200898,mAAuxtPoEddxQ2jPmf6WwqHOgdvhqrk2,9zpXzW9zCfU8KGBWkhlsGH8B8czISH4J,1,,,,,,,...,,,,,,,,,,
120538,200900,y6i20DJpIul6LihHwgZcWxbdAap5GcBP,9zpXzW9zCfU8KGBWkhlsGH8B8czISH4J,1,,,,,,,...,,,,,,,,,,
120539,200901,PPEydg4GXh52QanXuUYKbv8ENUSViBbd,9zpXzW9zCfU8KGBWkhlsGH8B8czISH4J,1,,,,,,,...,,,,,,,,,,
120540,200904,7k4xZXZirLFjbh80SpWNqJdTubgGglfv,9zpXzW9zCfU8KGBWkhlsGH8B8czISH4J,1,,,,,,,...,,,,,,,,,,


In [30]:
# Prepare X and y
X = final_df.drop(columns=['enrollment_id', 'username', 'course_id', 'dropout']).fillna(0)
y = final_df['dropout']



In [31]:
X

Unnamed: 0,access_day1,access_day2,access_day3,access_day4,access_day5,access_day6,access_day7,access_day8,access_day9,access_day10,...,wiki_day21,wiki_day22,wiki_day23,wiki_day24,wiki_day25,wiki_day26,wiki_day27,wiki_day28,wiki_day29,wiki_day30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [33]:
# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)



In [34]:
# Define and train a simple ANN (MLPClassifier)
model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=100, random_state=42)
model.fit(X_train, y_train)



In [35]:
# Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  124  4937]
 [   36 19012]]
              precision    recall  f1-score   support

           0       0.78      0.02      0.05      5061
           1       0.79      1.00      0.88     19048

    accuracy                           0.79     24109
   macro avg       0.78      0.51      0.47     24109
weighted avg       0.79      0.79      0.71     24109

