<a href="https://colab.research.google.com/github/YusufAbdurrahmann/feature-engineering-2/blob/main/feature_engineering_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

# **Data importing**

In [40]:
df = pd.read_csv("./dataset/student_performance_prediction.csv")
df.head()

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,,75.0,Yes,Master,Yes
1,S00002,9.3,95.3,60.6,No,High School,No
2,S00003,13.2,,64.0,No,Associate,No
3,S00004,17.6,76.8,62.4,Yes,Bachelor,No
4,S00005,8.8,89.3,72.7,No,Master,No


# **Data Understanding**

In [41]:
df.shape

(40000, 7)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Student ID                                   40000 non-null  object 
 1   Study Hours per Week                         38005 non-null  float64
 2   Attendance Rate                              38008 non-null  float64
 3   Previous Grades                              38006 non-null  float64
 4   Participation in Extracurricular Activities  38000 non-null  object 
 5   Parent Education Level                       38000 non-null  object 
 6   Passed                                       38000 non-null  object 
dtypes: float64(3), object(4)
memory usage: 2.1+ MB


In [45]:
df.describe()

Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades
count,38005.0,38008.0,38006.0
mean,9.962744,75.276323,65.440107
std,5.031154,20.393418,16.503119
min,-12.3,-14.3,8.3
25%,6.6,61.6,55.1
50%,10.0,75.3,65.2
75%,13.4,88.8,75.2
max,32.4,150.2,200.0


In [46]:
df.isnull().sum()

Unnamed: 0,0
Student ID,0
Study Hours per Week,1995
Attendance Rate,1992
Previous Grades,1994
Participation in Extracurricular Activities,2000
Parent Education Level,2000
Passed,2000


In [47]:
df.duplicated().sum()

np.int64(0)

# **Data cleaning**

In [50]:
df = df.dropna(subset = "Study Hours per Week")

In [51]:
df.isnull().sum()

Unnamed: 0,0
Student ID,0
Study Hours per Week,0
Attendance Rate,1890
Previous Grades,1893
Participation in Extracurricular Activities,1884
Parent Education Level,1898
Passed,1910


In [52]:
for col in df.columns:
  if df[col].dtypes == "object":
    df[col] = df[col].fillna(df[col].mode()[0])
  else:
    df[col] = df[col].fillna(df[col].mean())


In [53]:
df = df.drop_duplicates()

In [54]:
df.sample(3)

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
11604,S11605,7.3,83.9,62.0,Yes,Associate,No
26961,S26962,5.2,103.5,70.6,Yes,High School,No
29123,S29124,10.5,33.7,59.1,No,Bachelor,No


In [35]:
df["Previous Grades"] = pd.to_datetime(df["Previous Grades"])

In [60]:
df["Previous Grades"] = df["Previous Grades"].astype(float)

In [61]:
df

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,75.278015,75.0,Yes,Master,Yes
1,S00002,9.3,95.300000,60.0,No,High School,No
2,S00003,13.2,75.278015,64.0,No,Associate,No
3,S00004,17.6,76.800000,62.0,Yes,Bachelor,No
4,S00005,8.8,89.300000,72.0,No,Master,No
...,...,...,...,...,...,...,...
39995,S39996,15.6,93.800000,51.0,Yes,Master,No
39996,S39997,11.3,66.400000,64.0,No,Doctorate,Yes
39997,S39998,13.1,65.600000,38.0,No,Bachelor,No
39998,S39999,14.1,74.900000,65.0,Yes,Master,No


In [69]:
df.loc[df.duplicated()]

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed


In [74]:
df = df.rename(columns = {"Student ID" : "Student_ID"})

In [75]:
df.sample(3)

Unnamed: 0,Student_ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
2660,S02661,4.1,108.7,45.0,No,Master,Yes
15933,S15934,13.4,73.1,68.0,Yes,High School,Yes
26611,S26612,13.9,41.5,57.0,Yes,Associate,No


In [79]:
df.query('Student_ID == "S00005"')

Unnamed: 0,Student_ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
4,S00005,8.8,89.3,72.0,No,Master,No


In [80]:
df.sample(3)

Unnamed: 0,Student_ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
27927,S27928,1.6,59.1,76.0,Yes,Associate,Yes
7117,S07118,8.2,72.1,73.0,No,Bachelor,Yes
13335,S13336,16.3,65.0,69.0,Yes,High School,Yes


# **Exploratory Data Analysis (EDA)**