# Anomaly Detection

In [2]:
#install dask for distributed computing
!pip install -q dask[complete] scikit-learn xgboost pandas

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Reading the data

In [3]:
import pandas as pd
import dask.dataframe as dd

# Set your Google Drive paths here
info_path = '/content/drive/MyDrive/Anomaly Data/studentInfo.csv'
assess_path = '/content/drive/MyDrive/Anomaly Data/studentAssessment.csv'
vle_path = '/content/drive/MyDrive/Anomaly Data/studentVle.csv'

# Load smaller datasets with pandas
student_info = pd.read_csv(info_path)
student_assessment = pd.read_csv(assess_path)

# Load large file using Dask (for parallel/distributed processing)
student_vle = dd.read_csv(vle_path)

student_info.head()


Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


## Distributed Aggregation of Clicks using Dask

In [4]:
# Aggregate total clicks per student using Dask (parallel computation)
vle_agg = student_vle.groupby("id_student")["sum_click"].sum().reset_index()
vle_agg = vle_agg.rename(columns={"sum_click": "total_clicks"})

# Convert from Dask to Pandas (triggers distributed computation)
vle_agg_df = vle_agg.compute()
vle_agg_df.head()

Unnamed: 0,id_student,total_clicks
0,6516,2791
1,11391,934
2,23629,161
3,23798,590
4,24734,499


## Merge Datasets

In [5]:
# Get average assessment score per student
assessment_avg = student_assessment.groupby("id_student")["score"].mean().reset_index()

# Merge all data: info + total_clicks + average_score
data = pd.merge(student_info, vle_agg_df, on="id_student", how="left")
data = pd.merge(data, assessment_avg, on="id_student", how="left")
data.head()


Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,total_clicks,score
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,934.0,82.0
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,1435.0,66.4
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,281.0,
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,2158.0,76.0
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,1034.0,54.4


## Feature Engineering and Target Creation

In [6]:
# Fill missing total_clicks with 0
data["total_clicks"] = data["total_clicks"].fillna(0)

# Create binary label: 1 = Pass, 0 = Fail or Withdrawn
data["label"] = data["final_result"].apply(lambda x: 1 if x == "Pass" else 0)


## Encode Categorical Features

In [7]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ["gender", "region", "highest_education", "imd_band", "age_band", "disability"]
encoders = {col: LabelEncoder().fit(data[col]) for col in categorical_cols}

# Apply encoders
for col, encoder in encoders.items():
    data[col] = encoder.transform(data[col])


## Prepare Features for Modeling

In [8]:
# Define input features
features = categorical_cols + ["num_of_prev_attempts", "studied_credits", "total_clicks", "score"]

# Drop rows with missing data
model_data = data[features + ["label"]].dropna()

# Split X and y
X = model_data[features]
y = model_data["label"]


## Training and Evaluate XGBoost Classifier

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use DMatrix for efficient training
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict
y_pred_prob = bst.predict(dtest)
y_pred = [1 if p > 0.5 else 0 for p in y_pred_prob]

# Classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.73      0.69      0.71      4335
           1       0.66      0.71      0.68      3684

    accuracy                           0.70      8019
   macro avg       0.70      0.70      0.70      8019
weighted avg       0.70      0.70      0.70      8019

