In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

### Necessary functions

In [None]:
# First signed decimal number in a string. The optional exponent lets values
# written in scientific notation (e.g. "1e3") be read as a whole instead of
# being cut off at the mantissa.
_NUMERIC_PATTERN = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

# Lower-cased strings that are treated as a missing value.
_MISSING_TOKENS = ('nan', 'null', '?', '', 'none', 'na')


def clean_numeric(value):
    """Coerce a raw cell value to a number, or NaN when that is not possible.

    Parameters
    ----------
    value : any scalar
        Raw value, typically read from a CSV column.

    Returns
    -------
    int or float
        * ``np.nan`` if ``value`` is missing (``pd.isna``), is one of the
          placeholder strings in ``_MISSING_TOKENS`` (case-insensitive, after
          stripping whitespace), or contains no number at all.
        * ``value`` unchanged if it is already an ``int`` or ``float``.
        * otherwise the first number found in ``str(value)`` as a ``float``
          (sign, decimal point and exponent are honoured).
    """
    if pd.isna(value):
        return np.nan

    # Already numeric: hand it back untouched.
    if isinstance(value, (int, float)):
        return value

    text = str(value).strip()

    if text.lower() in _MISSING_TOKENS:
        return np.nan

    # The pattern only matches text that float() accepts, so no exception
    # handling is needed around the conversion.
    match = _NUMERIC_PATTERN.search(text)
    return float(match.group()) if match else np.nan


def handle_outliers(df, column):
    """Cap the values of ``df[column]`` to the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that fence and values above
    ``Q3 + 1.5 * IQR`` are lowered to that fence. No rows are dropped. The
    column is overwritten on the passed-in frame, and that same frame is
    returned. When nothing lies outside the fences the column is not touched.
    """
    series = df[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    too_low = series < low_fence
    too_high = series > high_fence

    if (too_low | too_high).any():
        # Same two-step capping as before: lower fence first, then upper.
        capped = np.where(too_low, low_fence, series)
        capped = np.where(capped > high_fence, high_fence, capped)
        df[column] = capped

    return df

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='performance.csv')

In [None]:
# df.info() prints its summary itself.
df.info()
# A cell renders only its last bare expression, so the previews are shown
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(df.head())
display(df.describe())

## Data Cleaning

In [None]:
# Columns removed before any processing.
cols_to_drop = ['StudentID']

# Columns that are cleaned, imputed and converted to float in the cells below.
all_cols = [
    'Age',
    'StudyTimeWeekly',
    'Absences',
    'ParentalEducation',
    'GPA',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Sports',
    'Gender',
    'Ethnicity',
    'Music',
    'Volunteering',
    'GradeClass',
]


### Dropping unnecessary columns

In [None]:

# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')

### Removing Duplicate rows

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicates = df.duplicated().sum()
# Rebuild the frame only when there is something to remove.
df = df.drop_duplicates() if duplicates > 0 else df

### Making numeric columns truly numeric

In [None]:
# Run every cell of every listed column through clean_numeric, so text such as
# placeholders or numbers with trailing characters becomes a number or NaN.
for col in all_cols:
    df[col] = df[col].map(clean_numeric)

### Handling missing values

 - We will fill missing values in numeric columns with the median of each column.

In [None]:
# Per-column count of missing entries before imputation (kept for inspection).
missing_values = df.isnull().sum()

# Fill gaps with each column's median. The median is less sensitive to extreme
# values than the mean, which matters here because outliers have not been
# treated yet at this point in the notebook.
numeric_imputer = SimpleImputer(strategy='median')
df[all_cols] = numeric_imputer.fit_transform(df[all_cols])

### Capping outliers using the IQR method

In [None]:
# Apply the IQR capping helper to each of these columns in turn.
for col in ('Age', 'StudyTimeWeekly', 'Absences', 'GPA'):
    df = handle_outliers(df, col)

### Handling unrealistic combinations
1. Students with very low study time but very high GPA.
2. Students with very high study time but very low GPA.


In [None]:
# Case 1: study time below 1 together with a GPA above 3.5.
mask = (df['StudyTimeWeekly'] < 1) & (df['GPA'] > 3.5)
unrealistic = mask.sum()
if unrealistic > 0:
    # Replace the implausible study time with the column median.
    df.loc[mask, 'StudyTimeWeekly'] = df['StudyTimeWeekly'].median()

# Case 2: study time above 15 together with a GPA below 2.0.
# The median used here already reflects any replacements made in case 1.
mask = (df['StudyTimeWeekly'] > 15) & (df['GPA'] < 2.0)
unrealistic = mask.sum()
if unrealistic > 0:
    df.loc[mask, 'StudyTimeWeekly'] = df['StudyTimeWeekly'].median()

### Handling negative values

In [None]:
# Any column that contains at least one negative entry is replaced by its
# absolute values; columns without negatives are left untouched.
for col in all_cols:
    neg_values = df[col].lt(0).sum()
    if neg_values:
        df[col] = df[col].abs()

### Type conversion

In [None]:
# Store every listed column as 64-bit float.
for col in all_cols:
    df[col] = df[col].astype(np.float64)

## Regression

In [None]:
# Target is GPA; every other remaining column is used as a feature.
# NOTE(review): GradeClass stays in the feature set. If it is derived from
# GPA, it leaks the target into the model - confirm against the dataset
# description.
Y = df['GPA']
X = df.drop('GPA', axis=1)

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit ordinary least squares and report R^2 on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)