In [15]:
import pandas as pd
import numpy as np
from scipy import stats

In [16]:
# Read the student habits/performance CSV from the working directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='student_habits_performance.csv')
df.head(n=5)

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [17]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1000, 16)

In [18]:
# Keep the column index in `column`; the inspection cells below iterate over it.
column = df.columns
print('columns : {}'.format(column))

columns : Index(['student_id', 'age', 'gender', 'study_hours_per_day',
       'social_media_hours', 'netflix_hours', 'part_time_job',
       'attendance_percentage', 'sleep_hours', 'diet_quality',
       'exercise_frequency', 'parental_education_level', 'internet_quality',
       'mental_health_rating', 'extracurricular_participation', 'exam_score'],
      dtype='object')


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,attendance_percentage,sleep_hours,exercise_frequency,mental_health_rating,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,3.5501,2.5055,1.8197,84.1317,6.4701,3.042,5.438,69.6015
std,2.3081,1.46889,1.172422,1.075118,9.399246,1.226377,2.025423,2.847501,16.888564
min,17.0,0.0,0.0,0.0,56.0,3.2,0.0,1.0,18.4
25%,18.75,2.6,1.7,1.0,78.0,5.6,1.0,3.0,58.475
50%,20.0,3.5,2.5,1.8,84.4,6.5,3.0,5.0,70.5
75%,23.0,4.5,3.3,2.525,91.025,7.3,5.0,8.0,81.325
max,24.0,8.3,7.2,5.4,100.0,10.0,6.0,10.0,100.0


In [20]:
# Report the number of missing (null) entries in every column.
for col in column:
    n_missing = df[col].isnull().sum()
    print(f"{col}: {n_missing}")


student_id: 0
age: 0
gender: 0
study_hours_per_day: 0
social_media_hours: 0
netflix_hours: 0
part_time_job: 0
attendance_percentage: 0
sleep_hours: 0
diet_quality: 0
exercise_frequency: 0
parental_education_level: 91
internet_quality: 0
mental_health_rating: 0
extracurricular_participation: 0
exam_score: 0


In [21]:
# Report the storage dtype of every column.
for col in column:
    col_dtype = df[col].dtype
    print(f"{col}: {col_dtype}")

student_id: object
age: int64
gender: object
study_hours_per_day: float64
social_media_hours: float64
netflix_hours: float64
part_time_job: object
attendance_percentage: float64
sleep_hours: float64
diet_quality: object
exercise_frequency: int64
parental_education_level: object
internet_quality: object
mental_health_rating: int64
extracurricular_participation: object
exam_score: float64


In [22]:
# Report how many distinct (non-null) values each column holds.
for col in column:
    n_distinct = df[col].nunique()
    print(f"{col}: {n_distinct}")

student_id: 1000
age: 8
gender: 3
study_hours_per_day: 78
social_media_hours: 60
netflix_hours: 51
part_time_job: 2
attendance_percentage: 320
sleep_hours: 68
diet_quality: 3
exercise_frequency: 7
parental_education_level: 3
internet_quality: 3
mental_health_rating: 10
extracurricular_participation: 2
exam_score: 480


In [23]:
# Report, per column, whether pandas treats its dtype as numeric.
for name in column:
    is_numeric = pd.api.types.is_numeric_dtype(df[name])
    print(f"{name}: {is_numeric}")

student_id: False
age: True
gender: False
study_hours_per_day: True
social_media_hours: True
netflix_hours: True
part_time_job: False
attendance_percentage: True
sleep_hours: True
diet_quality: False
exercise_frequency: True
parental_education_level: False
internet_quality: False
mental_health_rating: True
extracurricular_participation: False
exam_score: True


In [24]:
# Print the correlation of every numeric feature with exam_score.
for col in column:
    # Skip the target itself and anything that is not numeric.
    if col == "exam_score" or not pd.api.types.is_numeric_dtype(df[col]):
        continue
    feature_corr = df["exam_score"].corr(df[col])
    print(f"corr of {col} = {feature_corr}")

corr of age = -0.00890687186398476
corr of study_hours_per_day = 0.825418509396044
corr of social_media_hours = -0.1667328851086167
corr of netflix_hours = -0.17177923845531573
corr of attendance_percentage = 0.0898356017699274
corr of sleep_hours = 0.12168291063767982
corr of exercise_frequency = 0.16010746437908213
corr of mental_health_rating = 0.32152293065514614


In [25]:
# Build a table of the Pearson correlation and its p-value between each
# numeric feature and exam_score, indexed by feature name.
# Rows are collected first and the frame is constructed once, rather than
# enlarging an empty DataFrame one `.loc` assignment at a time (each such
# assignment copies the frame and leaves the dtype to enlargement inference).
feature_names = []
rows = []
for col in column:
    # Skip the target itself and anything that is not numeric.
    if col == "exam_score" or not pd.api.types.is_numeric_dtype(df[col]):
        continue
    corr, P_value = stats.pearsonr(df.exam_score, df[col])
    # Both values are rounded to 3 decimals, so any p-value below 0.0005 is
    # stored as 0.0; read such entries as "p < 0.001", not as exactly zero.
    feature_names.append(col)
    rows.append([round(corr, 3), round(P_value, 3)])

corr_df = pd.DataFrame(rows, index=feature_names, columns=["corr", "P_value"])


In [26]:
# Display the correlation / p-value table built in the previous cell.
corr_df

Unnamed: 0,corr,P_value
age,-0.009,0.778
study_hours_per_day,0.825,0.0
social_media_hours,-0.167,0.0
netflix_hours,-0.172,0.0
attendance_percentage,0.09,0.004
sleep_hours,0.122,0.0
exercise_frequency,0.16,0.0
mental_health_rating,0.322,0.0
