In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.stats as stats


**Data source:** obtained from the UCI Machine Learning Repository (Student Performance dataset): https://archive.ics.uci.edu/ml/datasets/Student+Performance

In [3]:
# Path (relative to the notebook) of the semicolon-delimited student file.
url = '../../Datasets/student/student-mat.csv'

# `delimiter` is pandas' alias for `sep`; the file uses ';' between fields.
students = pd.read_csv(url, delimiter=';')

# Preview the first ten rows.
students.head(n=10)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,10,15,15,15
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,12,12,11
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,6,6,5,6
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,16,18,19
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,14,15,15


<h3><span style="color:red"> Pearson Correlation </span></h3>
<br>
Select the numeric columns, then compute the <b>Pearson correlation</b> between G3 and each numeric attribute.

In [4]:
# Keep only the numeric-dtype columns; 'number' is the string spelling of np.number.
num_students = students.select_dtypes(include='number')
num_students

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18,4,4,2,2,0,4,3,4,1,1,3,6,5,6,6
1,17,1,1,1,2,0,5,3,3,1,1,3,4,5,5,6
2,15,1,1,1,2,3,4,3,2,2,3,3,10,7,8,10
3,15,4,2,1,3,0,3,2,2,1,1,5,2,15,14,15
4,16,3,3,1,2,0,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,1,2,2,5,5,4,4,5,4,11,9,9,9
391,17,3,1,2,1,0,2,4,5,3,4,2,3,14,16,16
392,21,1,1,1,1,3,5,5,3,3,3,3,3,10,8,7
393,18,3,2,3,1,0,4,4,1,3,4,5,0,11,12,10


In [5]:
# Column labels of the numeric frame as a plain NumPy array.
np.asarray(num_students.columns)

array(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
       'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
       'absences', 'G1', 'G2', 'G3'], dtype=object)

**Computing the Pearson correlation of every numeric attribute with G3**
<br>
Loop over all numeric columns and calculate the Pearson correlation coefficient and its p-value for each.

In [6]:
# Pearson correlation (and its p-value) between G3 and every numeric column.
# G3 is itself one of the columns, so its own row is trivially r = 1.0.
column_pearson_corr = []
column_p_value = []
for column_name in num_students.columns:
    # pearsonr returns (correlation coefficient, p-value).
    r_value, p_value = stats.pearsonr(num_students['G3'], num_students[column_name])
    column_pearson_corr.append(r_value)
    column_p_value.append(p_value)

# One row per numeric column, in the same order as num_students.columns.
p_corr_df = {'Column Name': num_students.columns.to_numpy(),
             'Pearson Correlation with G3': column_pearson_corr,
             'Pearson P-Value': column_p_value}

pears_df = pd.DataFrame(
    p_corr_df,
    columns=['Column Name', 'Pearson Correlation with G3', 'Pearson P-Value'],
)

# Strongest positive correlations first; this expression is the cell's output.
pears_df.sort_values(by='Pearson Correlation with G3', ascending=False)

Unnamed: 0,Column Name,Pearson Correlation with G3,Pearson P-Value
15,G3,1.0,0.0
14,G2,0.904868,7.625719e-148
13,G1,0.801468,9.00143e-90
1,Medu,0.217147,1.336107e-05
2,Fedu,0.152457,0.002379973
4,studytime,0.09782,0.05206115
6,famrel,0.051363,0.308552
12,absences,0.034247,0.4973318
7,freetime,0.011307,0.8227402
10,Walc,-0.051939,0.3031521


<h3><span style="color:red"> Spearman Rank Correlation </span></h3>
<br>
Select the numeric columns, then compute the <b>Spearman rank correlation</b> between G3 and each numeric attribute.

In [7]:
# Spearman rank correlation (and its p-value) between G3 and every numeric column.
# Results are collected in their own fresh lists: appending to the Pearson lists
# would make them longer than the number of columns and would keep growing them
# each time this cell is re-run.
column_spearman_corr = []
column_spearman_p_value = []
for column_name in num_students.columns:
    # spearmanr returns (rank correlation coefficient, p-value) for two 1-D inputs.
    spearman_rho, spearman_p = stats.spearmanr(num_students['G3'], num_students[column_name])
    column_spearman_corr.append(spearman_rho)
    column_spearman_p_value.append(spearman_p)