<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/pivotetable.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas as pd

# Load data
enroll_train = pd.read_csv('enrollment_train.csv')
log_train = pd.read_csv('log_train.csv')
truth_train = pd.read_csv('truth_train.csv',header=None)
truth_train.columns=['enrollment_id', 'label']
# Merge label with enrollment
enroll_train = enroll_train.merge(truth_train, on='enrollment_id')

# Quick inspection
print(enroll_train.head())
print(log_train.head())


   enrollment_id                          username  \
0              1  9Uee7oEuuMmgPx2IzPfFkWgkHZyPbWr0   
1              3  1qXC7Fjbwp66GPQc6pHLfEuO8WKozxG4   
2              4  FIHlppZyoq8muPbdVxS44gfvceX9zvU7   
3              5  p1Mp7WkVfzUijX0peVQKSHbgd5pXyl4c   
4              6  dpK33RH9yepUAnyoywRwBt1AJzxGlaja   

                          course_id  label  
0  DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila      0  
1  7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx      0  
2  DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila      0  
3  7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx      0  
4  AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd      0  
   enrollment_id                 time   source     event  \
0              1  2014-06-14T09:38:29   server  navigate   
1              1  2014-06-14T09:38:39   server    access   
2              1  2014-06-14T09:38:39   server    access   
3              1  2014-06-14T09:38:48   server    access   
4              1  2014-06-14T09:41:49  browser   problem   

                             object  
0  Oj6

In [58]:
log_train['time'] = pd.to_datetime(log_train['time'])
log_train.sort_values(['enrollment_id', 'time'], inplace=True)
log_train['date'] = log_train['time'].dt.date

ValueError: time data "2014-06-10T" doesn't match format "%Y-%m-%dT%H:%M:%S", at position 3257606. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
print(log_train.isnull().sum())
log_train.time.info()

In [None]:
event_counts = log_train.pivot_table(index='enrollment_id', columns='date',values='event', aggfunc='size', fill_value=0)


In [None]:
event_counts


In [None]:
source_counts = log_train.pivot_table(index='enrollment_id', columns='source', aggfunc='size', fill_value=0)


In [None]:
source_counts

In [None]:
click_freq = log_train.groupby('enrollment_id').size().rename('total_clicks')


In [None]:
click_freq

In [None]:
features = pd.concat([event_counts, source_counts, session_stats['duration'], click_freq], axis=1)
features = features.fillna(0)

# Merge with labels
features['label'] = enroll_train.set_index('enrollment_id')['label']


In [None]:
features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

corr = features.corr()
sns.heatmap(corr, annot=True)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

X = features.drop('label', axis=1)
y = features['label']
selector = SelectKBest(score_func=f_classif, k=20)
X_selected = selector.fit_transform(X, y)
selected_columns = X.columns[selector.get_support()]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=10)
rfe = rfe.fit(X, y)
X_rfe = X.loc[:, rfe.support_]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

X_train, X_val, y_train, y_val = train_test_split(X[selected_columns], y, test_size=0.2, random_state=42)

model = GradientBoostingClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, preds))
print("AUC:", roc_auc_score(y_val, preds))
