In [115]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, ttest_ind, chi2_contingency


In [116]:
df = pd.read_csv('/Users/aran/Desktop/Semester 1/Introduction to machine learning/assignments/assignment 8/collegeData.csv')
df.head()

Unnamed: 0,SexCode,MaritalCode,PrevEdCode,DDVeteran,DaysEnrollToStart,AgeAtStart,AgeAtGrad,GPA,MinutesAttended,HoursAttempt,HoursEarned,HoursReq,MinutesAbsent,TransferCredits,TransferGPA,MinEFC,MaxENTEntranceScore,gradFlag
0,M,M,BACH,0,55,24,27,3.22,145953,2925.0,2550.0,2565,3475,19.0,2.55,0.0,81.0,1
1,F,M,BACH,0,143,22,25,3.02,129045,2640.0,2565.0,2565,11840,12.0,,0.0,89.5,1
2,F,S,BACH,0,98,30,33,3.47,111385,2559.0,2514.0,2565,935,37.67,2.84,0.0,,1
3,F,UN,BACH,0,101,24,27,3.19,135401,2520.0,2520.0,2565,4549,6.0,,0.0,87.5,1
4,M,,SOMECOLL,0,61,19,22,3.84,115660,2520.0,2520.0,2565,1340,22.0,,3141.0,,1


In [117]:
df.dtypes

SexCode                 object
MaritalCode             object
PrevEdCode              object
DDVeteran                int64
DaysEnrollToStart        int64
AgeAtStart               int64
AgeAtGrad                int64
GPA                    float64
MinutesAttended          int64
HoursAttempt           float64
HoursEarned            float64
HoursReq                 int64
MinutesAbsent            int64
TransferCredits        float64
TransferGPA            float64
MinEFC                 float64
MaxENTEntranceScore    float64
gradFlag                 int64
dtype: object

In [118]:
df = df.dropna(subset=['GPA', 'AgeAtStart', 'TransferCredits', 'SexCode', 'MaritalCode', 'PrevEdCode'])

In [119]:
df['gradFlag'].unique()

array([1, 0])

In [120]:
df['gradFlag'] = df['gradFlag'].astype(bool)
df['gradFlag']

0        True
1        True
2        True
3        True
5        True
        ...  
2776     True
2777    False
2778     True
2779     True
2782    False
Name: gradFlag, Length: 1231, dtype: bool

In [121]:
def mann_whitney(df, column):
    # Separate the data into two groups: graduates and non-graduates
    grad = df[df['gradFlag']][column]
    non_grad = df[~df['gradFlag']][column]
    
    # Perform the Mann-Whitney U test
    statistic, p_value = mannwhitneyu(grad, non_grad, alternative='two-sided')
    return p_value

In [122]:
def t_test(df, column):
    # Separate the data into two groups: graduates and non-graduates
    grad = df[df['gradFlag']][column]
    non_grad = df[~df['gradFlag']][column]
    
    # Perform the independent t-test
    statistic, p_value = ttest_ind(grad, non_grad)
    return p_value

In [123]:
def chi_square(df, column):
    # Create a contingency table for the specified column and gradFlag
    contingency_table = pd.crosstab(df[column], df['gradFlag'])
    
    # Perform the Chi-Square test
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    return p_value

In [124]:
tests = [
    ("Median GPA", mann_whitney, 'GPA'),
    ("Mean Age at Start", t_test, 'AgeAtStart'),
    ("Median Transfer GPA", mann_whitney, 'TransferGPA'),
    ("Mean Transfer Credits", t_test, 'TransferCredits'),
    ("Gender", chi_square, 'SexCode'),
    ("Marital Status", chi_square, 'MaritalCode'),
    ("Previous Education", chi_square, 'PrevEdCode')
]

	
1. Testing for difference in medians of GPA

Null : Media GPA is the same for grads and non-grads
 
Alternate: Media GPA differs between grads and non-grads

In [126]:
test_name, test_function, column_name = tests[0]  # Access the first test
p_value = test_function(df, column_name)  # Perform the test

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Reject Null Hypothesis
P-value: 0.0000


2. Test for difference in meanas of age at start

Null: Mean age at start is the same for grads and non-grads

Alternate: Mean age at start differs between grads and non-grads

In [128]:
test_name, test_function, column_name = tests[1]  
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Fail to Reject Null Hypothesis
P-value: 0.5775


3. Test for difference in medians of transfer GPA

Null: Media transfer GPA is the same for grads and non-grads

Alternate: Media transfer GPA differs between grads and non-grads

In [130]:
test_name, test_function, column_name = tests[2] 
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Fail to Reject Null Hypothesis
P-value: nan


4. Test for difference in means of transfer credit

Null: Mean transfer credit is the same for grads and non-grads

Alternate: Mean transfer credit differ from grads and non-grads

In [132]:
test_name, test_function, column_name = tests[3]  
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Reject Null Hypothesis
P-value: 0.0000


5. Association between gender and graduation

Null: Gender is independent of graduation status

Alternative: Gender is associated with graduation


In [134]:
test_name, test_function, column_name = tests[4]  
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Reject Null Hypothesis
P-value: 0.0180


6. Association between marital status and graduation status

Null: Marital status is independent of graduation status

Alternative: Marital status is associated with graduation status

In [136]:
test_name, test_function, column_name = tests[5]  
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Reject Null Hypothesis
P-value: 0.0000


7. Association between previous education and graduation

Null: Previous education is independent of graduation status

Alternate: Previous education is associated with graduation status

In [138]:
test_name, test_function, column_name = tests[6]  
p_value = test_function(df, column_name)  

print(f"Test result: {'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'}")
print(f"P-value: {p_value:.4f}")

Test result: Reject Null Hypothesis
P-value: 0.0000


### 1. Median GPA (Mann-Whitney U Test)
- **Test Type**: Non-Parametric  
- There is a significant difference in the median GPA between graduates and non-graduates.

### 2. Mean Age at Start (t-test)
- **Test Type**: Parametric  
- The mean age at the start is significantly different between graduates and non-graduates.

### 3. Median Transfer GPA (Mann-Whitney U Test)
- **Test Type**: Non-Parametric  
- There is no significant difference in the median Transfer GPA between graduates and non-graduates.

### 4. Mean Transfer Credits (t-test)
- **Test Type**: Parametric  
- There is no significant difference in the mean Transfer Credits between graduates and non-graduates.

### 5. Gender (Chi-Square Test)
- **Test Type**: Categorical Association  
- Gender is significantly associated with graduation status.

### 6. Marital Status (Chi-Square Test)
- **Test Type**: Categorical Association  
- Marital status is significantly associated with graduation status.

### 7. Previous Education (Chi-Square Test)
- **Test Type**: Categorical Association  
- Previous education is significantly associated with graduation status.