In [None]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB

In [None]:
def clean_numeric(value):
    """Coerce a raw cell value to a number, or NaN when none can be recovered.

    Parameters
    ----------
    value : object
        A single cell value: a number, a string such as ``"85%"`` or
        ``" 12.5 hrs"``, or a missing marker.

    Returns
    -------
    int or float
        * ``value`` unchanged when it is already an ``int`` or ``float``;
        * the first numeric token found in ``str(value)``, as a ``float``;
        * ``np.nan`` when the value is missing, is one of the textual
          missing markers below, or contains no digits.
    """
    if pd.isna(value):
        return np.nan

    if isinstance(value, (int, float)):
        return value

    value_str = str(value).strip()

    # Textual spellings of "missing", compared case-insensitively.
    if value_str.lower() in {'nan', 'null', '?', '', 'none', 'na'}:
        return np.nan

    # Requires the module-level `import re`. Previously `re` was never
    # imported and a bare `except:` swallowed the NameError, so every string
    # input silently came back as NaN.
    numeric_match = re.search(r'[-+]?\d*\.?\d+', value_str)
    if numeric_match is None:
        return np.nan

    try:
        return float(numeric_match.group())
    except ValueError:
        # Only a failed float conversion is treated as "not a number";
        # any other error should surface instead of being hidden.
        return np.nan


def handle_outliers(df, column):
    """Cap values of ``df[column]`` that fall outside the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence and
    values above ``Q3 + 1.5 * IQR`` by that upper fence. The column is
    rewritten on the frame that was passed in, and only when at least one
    value lies outside the fences. The same frame object is returned.
    """
    series = df[column]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    floor = q1 - 1.5 * spread
    ceiling = q3 + 1.5 * spread

    too_low = series < floor
    too_high = series > ceiling

    if (too_low | too_high).sum() > 0:
        # The two masks cannot overlap (floor <= ceiling), so one nested
        # selection gives the same result as two sequential replacements.
        df[column] = np.where(too_low, floor, np.where(too_high, ceiling, series))

    return df

In [None]:
# Load the raw records; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='pass_fail.csv')

In [None]:
# Column dtypes and non-null counts; .info() prints its report itself.
df.info()

# A cell renders only its last expression automatically, so the preview and
# the summary statistics are passed to display() explicitly. As bare
# expressions in the middle of the cell they were computed and discarded.
# The full-frame dump that used to end the cell is dropped: head() plus
# describe() already give the overview.
display(df.head())
display(df.describe())

## Data Cleaning

In [None]:
# Identifier column removed before modelling.
cols_to_drop = ['Student ID']

# Continuous features that get range checks, mean imputation and IQR capping.
numeric_cols = [
    'Study Hours per Week',
    'Attendance Rate',
    'Previous Grades',
]

# Every column kept for modelling: the numeric features followed by the
# target and the two encoded categorical columns.
all_cols = numeric_cols + [
    "Passed",
    "Participation in Extracurricular Activities",
    "Parent Education Level",
]

In [None]:
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone, and a plain drop() would raise KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')

### Consistency
- Replacing Yes with 1 and No with 0
- Converting Education Level to numeric scale
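- Filling missing Parent Education Level values with the most common level, and removing rows where `Passed` or `Participation in Extracurricular Activities` is missing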

In [None]:
# Ordinal encoding for parent education: a larger code means more schooling.
education_mapping = {
    'High School': 1,
    'Associate': 2,
    'Bachelor': 3,
    'Master': 4,
    'Doctorate': 5
}

# Encode first, impute second. Series.map() yields NaN both for entries that
# were missing and for labels that are not keys of education_mapping. Filling
# after the mapping therefore leaves no NaN in this column; filling before it
# left unmapped labels as NaN, and no later step imputes this column.
parent_edu_codes = df['Parent Education Level'].map(education_mapping)
if parent_edu_codes.isna().all():
    # Typical cause: the cell was run twice, so the column already holds
    # codes rather than labels. Fail with a clear message instead of
    # silently turning the whole column into NaN.
    raise ValueError(
        "No 'Parent Education Level' value matches education_mapping; "
        "the column may already be encoded or may use unexpected labels."
    )
mode_parent_edu = parent_edu_codes.mode()[0]
df['Parent Education Level'] = parent_edu_codes.fillna(mode_parent_edu)

# Rows without a target value cannot be used for training or scoring.
df.dropna(subset=['Passed'], inplace=True)

# LabelEncoder numbers the distinct labels in sorted order, so a column that
# holds exactly 'No' / 'Yes' becomes 0 / 1 (assumes no other spellings occur;
# verify against the data).
label_encoder = preprocessing.LabelEncoder()
df['Passed'] = label_encoder.fit_transform(df['Passed'])

df.dropna(subset=['Participation in Extracurricular Activities'], inplace=True)
# The same encoder object is re-fitted here, so afterwards it only remembers
# the classes of this column, not those of 'Passed'.
df['Participation in Extracurricular Activities'] = label_encoder.fit_transform(df['Participation in Extracurricular Activities'])

### Removing Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate = df.duplicated().sum()

# Rebuild the frame only when there is something to remove.
df = df.drop_duplicates() if duplicate > 0 else df

### Cleaning Numeric Columns
- Study hours must be non-negative (negative values are treated as missing) and are capped at 80 hours per week
- Attendance rate is clamped to the range 0–100
- Previous grades are clamped to the range 0–100

In [None]:
# Per-column validity rules: (lower limit, replacement for values below it,
# upper limit, which also replaces values above it). Negative study hours
# become missing; out-of-range rates and grades are clamped.
range_rules = {
    'Study Hours per Week': (0, np.nan, 80),
    'Attendance Rate': (0, 0, 100),
    'Previous Grades': (0, 0, 100),
}

for col in all_cols:
    # Every retained column is first coerced to a number (or NaN).
    df[col] = df[col].apply(clean_numeric)

    rule = range_rules.get(col)
    if rule is None:
        # No range constraint for this column.
        continue

    low, low_fill, high = rule
    df.loc[df[col] < low, col] = low_fill
    df.loc[df[col] > high, col] = high

### Handling Missing Values

In [None]:

# Fill the remaining NaNs in the numeric features with each column's mean.
# SimpleImputer comes from sklearn.impute and must be imported in the
# notebook's import cell.
numeric_imputer = SimpleImputer(strategy='mean')
numeric_imputer.fit(df[numeric_cols])
df[numeric_cols] = numeric_imputer.transform(df[numeric_cols])


### Handling Outliers

In [None]:
# Cap IQR outliers one numeric feature at a time; handle_outliers returns the
# frame it was given, so df is carried through the loop.
for col in numeric_cols:
    df = handle_outliers(df, column=col)

### Handling Negative Values

In [None]:
# Any column that still holds a negative value is replaced by its absolute
# values; columns without negatives are left untouched.
for col in all_cols:
    neg_values = df[col].lt(0).sum()
    if neg_values == 0:
        continue
    df[col] = df[col].abs()

### Type Conversion

In [None]:
# Cast every modelling column to float in one pass; columns that are not
# listed in all_cols keep their dtype.
df = df.astype({col: float for col in all_cols})

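### Removing Duplicates (After Cleaning)

Clamping, imputation and outlier capping can make previously distinct rows identical, so duplicates are removed a second time.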

In [None]:
# Count rows that repeat an earlier row exactly, then drop them if any exist.
duplicates = df.duplicated().sum()
df = df.drop_duplicates() if duplicates > 0 else df

## Classification

In [None]:
# Target is the encoded pass/fail flag; features are every other column.
Y = df['Passed']
X = df.drop(columns='Passed')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train, y_train)

# Score on the held-out rows; as the cell's last expression it is rendered.
model.score(X_test, y_test)