In [None]:
import pandas as pd

# Load the dataset
visits = pd.read_csv('visits.csv')

In [None]:
# Check the structure
print(visits.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   patient_id                316 non-null    object
 1   site_id                   316 non-null    object
 2   visit_name                316 non-null    object
 3   scheduled_date            316 non-null    object
 4   actual_date               278 non-null    object
 5   visit_status              316 non-null    object
 6   medication_adherence_pct  316 non-null    int64 
 7   diary_submitted           316 non-null    object
dtypes: int64(1), object(7)
memory usage: 19.9+ KB
None


In [None]:
# Aggregate at patient level
patient_data = visits.groupby('patient_id').agg({
    'visit_status': lambda x: (x == 'Missed').sum(),
    'medication_adherence_pct': 'mean',
    'patient_id': 'count'  # total visits
}).rename(columns={'visit_status': 'missed_visits', 'patient_id': 'total_visits'})

In [None]:
# Create dropout label
patient_data['dropout'] = (patient_data['missed_visits'] > 2).astype(int)

# Show data
print(patient_data.head())

            missed_visits  medication_adherence_pct  total_visits  dropout
patient_id                                                                
PAT0001                 0                     61.25             4        0
PAT0002                 0                     84.25             4        0
PAT0005                 1                     60.00             4        0
PAT0006                 0                     66.25             4        0
PAT0008                 1                     62.25             4        0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Assuming patient_data is your aggregated dataframe
# Create dropout label: patients who missed at least 1 visit are labeled dropout (1)
patient_data['dropout'] = (patient_data['missed_visits'] > 0).astype(int)


In [None]:
# Check class distribution
print(patient_data['dropout'].value_counts())


dropout
0    45
1    34
Name: count, dtype: int64


In [None]:
# Define features and target
X = patient_data[['missed_visits', 'medication_adherence_pct', 'total_visits']]
y = patient_data['dropout']

In [None]:
# Split data into train/test (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Initialize and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Predict on test data
y_pred = model.predict(X_test)


In [None]:
# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        10

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24


Confusion Matrix:
 [[14  0]
 [ 0 10]]


In [None]:
import pandas as pd

In [None]:
# Ensure visit dates are datetime
visits['actual_date'] = pd.to_datetime(visits['actual_date'])

In [None]:
# Max gap between consecutive visits per patient
visits_sorted = visits.sort_values(['patient_id', 'actual_date'])
visits_sorted['prev_date'] = visits_sorted.groupby('patient_id')['actual_date'].shift(1)
visits_sorted['gap_days'] = (visits_sorted['actual_date'] - visits_sorted['prev_date']).dt.days
max_gap = visits_sorted.groupby('patient_id')['gap_days'].max().fillna(0)


In [None]:
# Completed visit ratio per patient
completed_visits = visits.groupby('patient_id').apply(lambda x: (x['visit_status'] == 'Completed').sum())
total_visits = visits.groupby('patient_id').size()
completed_ratio = completed_visits / total_visits

  completed_visits = visits.groupby('patient_id').apply(lambda x: (x['visit_status'] == 'Completed').sum())


In [None]:
# Add features to patient_data
patient_data = patient_data.join(max_gap.rename('max_gap_days'))
patient_data = patient_data.join(completed_ratio.rename('completed_visit_ratio'))

# View updated patient data
print(patient_data.head())

            missed_visits  medication_adherence_pct  total_visits  dropout  \
patient_id                                                                   
PAT0001                 0                     61.25             4        0   
PAT0002                 0                     84.25             4        0   
PAT0005                 1                     60.00             4        1   
PAT0006                 0                     66.25             4        0   
PAT0008                 1                     62.25             4        1   

            max_gap_days  completed_visit_ratio  
patient_id                                       
PAT0001              7.0                   0.75  
PAT0002              9.0                   1.00  
PAT0005             10.0                   0.75  
PAT0006              8.0                   0.75  
PAT0008             15.0                   0.75  


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Features including new engineered features
X = patient_data[['missed_visits', 'medication_adherence_pct', 'total_visits', 'max_gap_days', 'completed_visit_ratio']]
y = patient_data['dropout']


In [None]:
# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
# Initialize, train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = model.predict(X_test)


In [None]:
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        10

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24


Confusion Matrix:
 [[14  0]
 [ 0 10]]


In [None]:
patient_data = patient_data.reset_index()
patient_data['dropout_prediction'] = model.predict(X)  # X is the feature DataFrame used for prediction
patient_data[['patient_id', 'dropout_prediction']].to_csv('dropout_predictions.csv', index=False)


In [None]:
from google.colab import files
files.download('dropout_predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>