In [29]:
# Jupyter Notebook cell #1: Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    mean_squared_error, r2_score, mean_absolute_error,
    accuracy_score, f1_score, classification_report
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load your dataset (adjust path as needed)
df_amsterdam = pd.read_csv("Data/amsterdam_weekdays.csv")

# Convert booleans/strings to 0/1 for host_is_superhost, room_private, room_shared
if 'host_is_superhost' in df_amsterdam.columns:
    df_amsterdam['host_is_superhost'] = (
        df_amsterdam['host_is_superhost']
        .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})
        .astype(int)
    )

if 'room_private' in df_amsterdam.columns:
    df_amsterdam['room_private'] = (
        df_amsterdam['room_private']
        .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})
        .astype(int)
    )

if 'room_shared' in df_amsterdam.columns:
    df_amsterdam['room_shared'] = (
        df_amsterdam['room_shared']
        .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})
        .astype(int)
    )

# One-Hot encode 'room_type' if it exists 
#    (handles 'Private room', 'Entire home/apt', 'Shared room', etc.)
if 'room_type' in df_amsterdam.columns:
    df_amsterdam = pd.get_dummies(df_amsterdam, columns=['room_type'], prefix='room_type')

# Force any remaining boolean columns to int (if any were created by get_dummies)
bool_cols = df_amsterdam.select_dtypes(include='bool').columns
for col in bool_cols:
    df_amsterdam[col] = df_amsterdam[col].astype(int)

# Define target column
target_col = "realSum"

# Separate features (X) and target (y)
X_amsterdam = df_amsterdam.drop(columns=[target_col])
y_amsterdam = df_amsterdam[target_col]

# Train/Test split
X_train_amst, X_test_amst, y_train_amst, y_test_amst = train_test_split(
    X_amsterdam, y_amsterdam, test_size=0.2, random_state=42
)

# Baseline Linear Regression Model
lin_reg = LinearRegression()
lin_reg.fit(X_train_amst, y_train_amst)

# Predictions & Metrics
y_pred_amst = lin_reg.predict(X_test_amst)

r2 = r2_score(y_test_amst, y_pred_amst)
mse = mean_squared_error(y_test_amst, y_pred_amst)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_amst, y_pred_amst)

print("=== Baseline Regression (Amsterdam) ===")
print(f"R²   : {r2:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"MAE  : {mae:.4f}")
print("---------------------------------------\n")

# Save cleaned data to CSV
df_amsterdam.to_csv("Data/amsterdam_weekdays_clean.csv", index=False)



=== Baseline Regression (Amsterdam) ===
R²   : 0.5513
RMSE : 209.1149
MAE  : 153.7245
---------------------------------------

Data saved as 'amsterdam_weekdays_clean.csv'.


  .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})
  .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})
  .replace({False: 0, True: 1, 'FALSE': 0, 'TRUE': 1})


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
student_data = pd.read_csv("Data/StudentPerformanceFactors.csv")

# Create a 'Passed' column from 'Exam_Score' (1 = Pass if >= 60, else 0)
student_data['Passed'] = (student_data['Exam_Score'] >= 60).astype(int)

# Convert boolean/string columns to 0/1
boolean_columns = ['Internet_Access', 'Learning_Disabilities', 'Extracurricular_Activities']
for col in boolean_columns:
    if col in student_data.columns:
        # If column is already boolean dtype
        if student_data[col].dtype == bool:
            student_data[col] = student_data[col].astype(int)  # True -> 1, False -> 0
        else:
            # Convert strings like "True"/"False"/"Yes"/"No" to 1/0
            student_data[col] = (
                student_data[col]
                .astype(str)
                .str.lower()
                .replace({'true': 1, 'false': 0, 'yes': 1, 'no': 0})
                .fillna(0)
                .astype(int)
            )

# Identify other categorical columns for One-Hot Encoding
categorical_cols = [
    'Parental_Involvement', 'Access_to_Resources', 'Motivation_Level', 'Family_Income',
    'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Parental_Education_Level',
    'Distance_from_Home', 'Gender'
]

# Apply One-Hot Encoding to these columns
student_data = pd.get_dummies(student_data, columns=categorical_cols, drop_first=True)

# Force any remaining boolean columns to int (e.g., from get_dummies)
bool_cols = student_data.select_dtypes(include="bool").columns
for col in bool_cols:
    student_data[col] = student_data[col].astype(int)

# Show first 10 rows to verify all are now 0/1 or numeric
print("\nFirst 10 rows after forcing booleans to 0/1:\n")
print(student_data.head(10).to_string())

# Define target column
target_col = "Passed"

# Separate features (X) and target (y)
X_student = student_data.drop(columns=["Exam_Score", target_col])
y_student = student_data[target_col]

# Train/Test split
X_train_student, X_test_student, y_train_student, y_test_student = train_test_split(
    X_student, y_student, test_size=0.2, random_state=42
)

# Train a baseline Logistic Regression model
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_student, y_train_student)

# Make predictions
y_pred_student = log_reg.predict(X_test_student)

# Calculate performance metrics
accuracy = accuracy_score(y_test_student, y_pred_student)
precision = precision_score(y_test_student, y_pred_student)
recall = recall_score(y_test_student, y_pred_student)
f1 = f1_score(y_test_student, y_pred_student)

print("\n=== Baseline Classification (Student Performance) ===")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")
print("---------------------------------------\n")

# Save the processed dataset
student_data.to_csv("Data/StudentPerformanceFactors_clean.csv", index=False)
print("Preprocessing complete. Cleaned dataset saved.")


  .replace({'true': 1, 'false': 0, 'yes': 1, 'no': 0})
  .replace({'true': 1, 'false': 0, 'yes': 1, 'no': 0})
  .replace({'true': 1, 'false': 0, 'yes': 1, 'no': 0})



First 10 rows after forcing booleans to 0/1:

   Hours_Studied  Attendance  Extracurricular_Activities  Sleep_Hours  Previous_Scores  Internet_Access  Tutoring_Sessions  Physical_Activity  Learning_Disabilities  Exam_Score  Passed  Parental_Involvement_Low  Parental_Involvement_Medium  Access_to_Resources_Low  Access_to_Resources_Medium  Motivation_Level_Low  Motivation_Level_Medium  Family_Income_Low  Family_Income_Medium  Teacher_Quality_Low  Teacher_Quality_Medium  School_Type_Public  Peer_Influence_Neutral  Peer_Influence_Positive  Parental_Education_Level_High School  Parental_Education_Level_Postgraduate  Distance_from_Home_Moderate  Distance_from_Home_Near  Gender_Male
0             23          84                           0            7               73                1                  0                  3                      0          67       1                         1                            0                        0                           0                     1