In [10]:
import os
import pandas as pd
import numpy as np
import scipy.stats as stats
from pandas import DataFrame

import warnings
# Suppress every warning for the rest of the session; no category, message or
# module filter is given, so nothing is exempt.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above -- consider narrowing the filter if that is not intended.
warnings.filterwarnings("ignore")

#### Define paths and helper functions

In [11]:
# Base directory for the data files used below.
# The notebook is run via Docker, so this has to be the absolute path as the
# container sees it; it is built from the HOME environment variable.
pwd: str = f"{os.environ['HOME']}/work/assignment/assignment-3"

In [12]:
def run_pearson(col_name: str, x: DataFrame, y: DataFrame) -> None:
    """Print a label followed by the Pearson correlation result for x and y.

    Args:
        col_name: Text printed in front of the result to identify the pair.
        x: First set of observations, handed to ``scipy.stats.pearsonr`` as is.
            Annotated as DataFrame, although the calls in this notebook pass
            single columns.
        y: Second set of observations, same handling as ``x``.

    Returns:
        None. The result is only written to stdout.
    """
    print(col_name, stats.pearsonr(x, y))

#### Read dataset from file

In [13]:
# Load the tab-separated raw data file from the working directory and
# report its (rows, columns) shape.
master: DataFrame = pd.read_csv(
    f'{pwd}/Chamorro-Premuzic.txt',
    sep='\t',
    header='infer',
)
print(master.shape)

(430, 12)


#### Remove rows with missing values

In [14]:
# Cells that hold exactly one space are treated as missing: map them to NaN,
# then drop every row that has at least one missing value. The chained form
# builds the cleaned frame in one step without mutating anything in place.
master_clean: DataFrame = master.replace(' ', np.nan).dropna()
print(master_clean.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
master_clean.info(verbose=True)

(265, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 140 to 429
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Age       265 non-null    object
 1   Gender    265 non-null    object
 2   studentN  265 non-null    object
 3   studentE  265 non-null    object
 4   studentO  265 non-null    object
 5   studentA  265 non-null    object
 6   studentC  265 non-null    object
 7   lectureN  265 non-null    object
 8   lecturE   265 non-null    object
 9   lecturO   265 non-null    object
 10  lecturA   265 non-null    object
 11  lecturC   265 non-null    object
dtypes: object(12)
memory usage: 26.9+ KB
None


#### Fix datatype -> convert from string to numeric

In [15]:
# The ten trait columns (five student*, five lectur*); reused further down
# for the correlation analysis.
neaoc_cols = ['studentN', 'studentE', 'studentO', 'studentA', 'studentC',
              'lectureN', 'lecturE', 'lecturO', 'lecturA', 'lecturC']

# Convert the trait columns and Age to a numeric dtype. Gender is left as is.
# pd.to_numeric uses its default error handling, so an unparsable value raises.
for col in [*neaoc_cols, 'Age']:
    master_clean[col] = pd.to_numeric(master_clean[col])

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the cell output.
master_clean.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265 entries, 140 to 429
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Age       265 non-null    int64 
 1   Gender    265 non-null    object
 2   studentN  265 non-null    int64 
 3   studentE  265 non-null    int64 
 4   studentO  265 non-null    int64 
 5   studentA  265 non-null    int64 
 6   studentC  265 non-null    int64 
 7   lectureN  265 non-null    int64 
 8   lecturE   265 non-null    int64 
 9   lecturO   265 non-null    int64 
 10  lecturA   265 non-null    int64 
 11  lecturC   265 non-null    int64 
dtypes: int64(11), object(1)
memory usage: 26.9+ KB
None


#### Store as pickle file

In [16]:
# Persist the cleaned frame as a pickle in the same working directory the
# raw data was read from, then confirm on stdout.
master_clean.to_pickle(f'{pwd}/chamorro.pickle')
print('DataFrame stored in pickle file...')

DataFrame stored in pickle file...


#### Run Pearson correlation

In [17]:
# Keep only the trait columns listed in neaoc_cols, then compute the
# pairwise Pearson correlation matrix between all of them.
master_neaoc: DataFrame = master_clean.loc[:, neaoc_cols]
corr = master_neaoc.corr(method='pearson', min_periods=1)
print(corr)

          studentN  studentE  studentO  studentA  studentC  lectureN  \
studentN  1.000000 -0.392148 -0.057693  0.042576 -0.228927  0.011883   
studentE -0.392148  1.000000  0.116627  0.045729  0.151575 -0.096558   
studentO -0.057693  0.116627  1.000000 -0.076873 -0.104455 -0.061201   
studentA  0.042576  0.045729 -0.076873  1.000000  0.518161  0.026070   
studentC -0.228927  0.151575 -0.104455  0.518161  1.000000 -0.145901   
lectureN  0.011883 -0.096558 -0.061201  0.026070 -0.145901  1.000000   
lecturE  -0.087296  0.153388  0.067395  0.078499  0.111401  0.024300   
lecturO  -0.040738  0.053201  0.173611  0.072520 -0.022182  0.154833   
lecturA   0.142149 -0.047464 -0.190644  0.143845  0.099064  0.107552   
lecturC   0.052130 -0.009177 -0.082115  0.224247  0.210154 -0.208605   

           lecturE   lecturO   lecturA   lecturC  
studentN -0.087296 -0.040738  0.142149  0.052130  
studentE  0.153388  0.053201 -0.047464 -0.009177  
studentO  0.067395  0.173611 -0.190644 -0.082115  
stu

#### Run Pearson correlation with p-values

In [18]:
# One (label, student column, lecturer column) entry per trait, in the order
# the results are reported. Note the lecturer columns are spelled 'lectureN'
# but 'lecturE' / 'lecturA' / 'lecturO' / 'lecturC' in the data.
for label, student_col, lecturer_col in (
    ('studentN <-> lectureN', 'studentN', 'lectureN'),
    ('studentE <-> lectureE', 'studentE', 'lecturE'),
    ('studentA <-> lectureA', 'studentA', 'lecturA'),
    ('studentO <-> lectureO', 'studentO', 'lecturO'),
    ('studentC <-> lectureC', 'studentC', 'lecturC'),
):
    run_pearson(label, master_neaoc[student_col], master_neaoc[lecturer_col])

studentN <-> lectureN (0.011883373654499547, 0.8473195262857787)
studentE <-> lectureE (0.15338769731985202, 0.012420080123099518)
studentA <-> lectureA (0.14384530043469418, 0.019141745433299356)
studentO <-> lectureO (0.17361073541892263, 0.004591568765627554)
studentC <-> lectureC (0.21015383212162062, 0.0005744117789631553)
