In [456]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

### Necessary functions

In [457]:
def clean_numeric(value):
    if pd.isna(value):
        return np.nan
    
    if isinstance(value, (int, float)):
        return value
    
    value = str(value).strip()
    
    if value.lower() in ['nan', 'null', '?', '', 'none', 'na', 'n/a', 'zero', 'empty', 'missing', '-', 'blank', '_', 'nothing']:
        return np.nan
    
    try:
        numeric_match = re.search(r'[-+]?\d*\.?\d+', value)
        if numeric_match:
            return float(numeric_match.group())
        else:
            return np.nan
    except:
        return np.nan


def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
    
    if outliers > 0:
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    
    return df

In [458]:
df = pd.read_csv('performance.csv')

In [459]:
df.info()
df.head()
df.describe()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2397 entries, 0 to 2396
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2397 non-null   int64  
 1   Age                2397 non-null   int64  
 2   Gender             2397 non-null   int64  
 3   Ethnicity          2391 non-null   float64
 4   ParentalEducation  2397 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   object 
 6   Absences           2397 non-null   int64  
 7   Tutoring           2397 non-null   int64  
 8   ParentalSupport    2396 non-null   float64
 9   Extracurricular    2387 non-null   object 
 10  Sports             2397 non-null   int64  
 11  Music              2397 non-null   int64  
 12  Volunteering       2397 non-null   int64  
 13  GPA                2397 non-null   object 
 14  GradeClass         2397 non-null   int64  
dtypes: float64(2), int64(10), object(3)
memory usage: 281.0+ KB


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0.0,2,19.8337228078547,7,1,2.0,0,0,1,0,2.92919559166768,2
1,1002,18,0,0.0,1,15.4087560558467,0,0,1.0,0,0,0,0,3.04291483343638,1
2,1003,15,0,2.0,3,4.21056976881226,26,0,2.0,0,0,0,0,0.112602254466182,4
3,1004,17,1,0.0,3,10.0288294739582,14,0,3.0,1,0,0,0,2.05421813970295,3
4,1005,17,1,0.0,2,"""4.67249527297133""",17,1,3.0,0,0,0,0,1.28806118179539,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2392,3388,18,1,0.0,3,10.6805546076944,2,0,4.0,1,0,0,0,3.45550941103439,0
2393,3389,17,0,0.0,1,7.58321727959887,4,1,4.0,0,1,0,0,3.27914972702503,4
2394,3390,16,1,0.0,2,6.80549964468062,20,0,2.0,0,0,0,1,1.14233287952616,2
2395,3391,16,1,1.0,0,12.4166526554839,17,0,2.0,0,1,1,0,1.80329676262921,1


## Data Cleaning

In [460]:
cols_to_drop = ['StudentID','GradeClass']

all_cols = ['Age', 'StudyTimeWeekly', 'ParentalEducation', 'GPA', 'Tutoring', 'ParentalSupport', 'Extracurricular', "Sports", 'Gender', 'Ethnicity', 'Music', 'Volunteering', 'Absences']


### Dropping unnecessary columns

In [461]:

df = df.drop(columns=cols_to_drop)

### Removing Duplicate rows

In [462]:
duplicates = df.duplicated().sum()
if duplicates > 0:
    df = df.drop_duplicates()

### Making numeric columns truly numeric

In [463]:
for col in all_cols:
    df[col] = df[col].apply(clean_numeric)

### Handling missing values

 - We will fill missing values in numeric columns with the mean of each column.

In [464]:
missing_values = df.isnull().sum()

numeric_imputer = SimpleImputer(strategy='mean')
df[all_cols] = numeric_imputer.fit_transform(df[all_cols])

### Removing outliers, using IQR method

In [465]:
for col in ['Age', 'StudyTimeWeekly', 'Absences', 'GPA']:
    df = handle_outliers(df, col)

### Handling unrealistic combinations
1. Students with very low study time but very high GPA.
2. Students with very high study time but very low GPA.


In [466]:
unrealistic = ((df['StudyTimeWeekly'] < 1) & (df['GPA'] > 3.5)).sum()
if unrealistic > 0:
    mask = (df['StudyTimeWeekly'] < 1) & (df['GPA'] > 3.5)
    df.loc[mask, 'StudyTimeWeekly'] = df['StudyTimeWeekly'].median()

unrealistic = ((df['StudyTimeWeekly'] > 15) & (df['GPA'] < 2.0)).sum()
if unrealistic > 0:
    mask = (df['StudyTimeWeekly'] > 15) & (df['GPA'] < 2.0)
    df.loc[mask, 'StudyTimeWeekly'] = df['StudyTimeWeekly'].median()

### Handling negetive values

In [467]:
for col in  all_cols:
    neg_values = (df[col] < 0).sum()
    if neg_values > 0:
        df[col] = df[col].abs()

### Type conversion

In [468]:
for col in all_cols:
    df[col] = df[col].astype(float)

In [469]:
X = df.drop(columns=['GPA'])
Y = df['GPA']

In [470]:
corr_matrix = X.corr()
upper= corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))
print(upper)

                   Age    Gender  Ethnicity  ParentalEducation  \
Age                NaN  0.044895   0.008164           0.025099   
Gender             NaN       NaN  -0.020117           0.006771   
Ethnicity          NaN       NaN        NaN          -0.013650   
ParentalEducation  NaN       NaN        NaN                NaN   
StudyTimeWeekly    NaN       NaN        NaN                NaN   
Absences           NaN       NaN        NaN                NaN   
Tutoring           NaN       NaN        NaN                NaN   
ParentalSupport    NaN       NaN        NaN                NaN   
Extracurricular    NaN       NaN        NaN                NaN   
Sports             NaN       NaN        NaN                NaN   
Music              NaN       NaN        NaN                NaN   
Volunteering       NaN       NaN        NaN                NaN   

                   StudyTimeWeekly  Absences  Tutoring  ParentalSupport  \
Age                      -0.011755 -0.011511 -0.028824         0.0

In [None]:
Y_original = df["GPA"].astype(float)

#  Label encode GPA only for ANOVA
le = LabelEncoder()
Y_encoded = le.fit_transform(df["GPA"])

# Select best 8 features
anova_selector = SelectKBest(score_func=f_classif, k=8)
anova_selector.fit(X, Y_encoded)

selected_features = X.columns[anova_selector.get_support()]
print("Selected features:", selected_features.tolist())

# SUse only selected features for regression
X = df[selected_features]

# Use REAL GPA for regression
Y = Y_original


Selected features: ['Ethnicity', 'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Volunteering']


In [472]:
# PCA for dimentionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

df_pca = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
df_pca['Target'] = Y
df_pca


Unnamed: 0,PC1,PC2,Target
0,-0.574531,-1.257985,2.929196
1,-0.549505,-1.220667,3.042915
2,0.955564,-0.255944,0.112602
3,-0.597565,-1.349789,2.054218
4,-0.607480,-1.393112,1.288061
...,...,...,...
2387,-0.554171,-1.257433,1.191929
2388,-1.163670,-0.486621,1.810038
2389,-0.619108,-1.406343,2.539113
2390,-0.414142,0.038875,1.395631


## Regression

In [473]:
# X = df.drop(columns=['GPA'])
# Y = df['GPA']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9274183137649517