In [1]:
# Mount Google Drive at /content/drive so files stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Directory on the mounted Drive from which the dataset CSV is read.
# Kept with a trailing slash because file names are appended by string concatenation.
data_path = '/content/drive/MyDrive/DSc_Project_Sem7/data/'

In [3]:
import pandas as pd
from scipy import stats
import numpy as np

In [4]:
# Load the pre-processed dataset; `df` is the DataFrame every test cell below reads from.
df = pd.read_csv(data_path + 'pre_processed_data.csv')

In [5]:
def make_conclusion(alpha, phase = '', significance = 0.05):
  """Print whether the null hypothesis is rejected for a given p-value.

  Args:
    alpha: The p-value produced by a statistical test. (The parameter is
      named `alpha` for backward compatibility; callers in this notebook
      pass the p-value returned by the scipy.stats test functions.)
    phase: Label inserted into the printed message (e.g. 'testing').
    significance: Significance level the p-value is compared against.
      Defaults to 0.05, which reproduces the original messages exactly.

  Note:
    "ACCEPTED" in the printed message means the test failed to reject the
    null hypothesis at the given significance level; it is not evidence
    that the null hypothesis is true.
  """
  if alpha < significance:
    print(f'{alpha} < {significance}: NULL hypothesis REJECTED in {phase} phase')
  else:
    print(f'{alpha} >= {significance}: NULL hypothesis ACCEPTED in {phase} phase')

Correlation Testing

In [6]:
def df_correlation_testing(feature_1, feature_2):
  """Print and return the correlation between two pandas Series.

  Args:
    feature_1: Object exposing a `.corr(other)` method (a pandas Series).
    feature_2: The second Series, passed to `feature_1.corr`.

  Returns:
    The value returned by `feature_1.corr(feature_2)`.
  """
  # Compute once and reuse for both the printout and the return value.
  corr_val = feature_1.corr(feature_2)
  print(corr_val)

  return corr_val

In [7]:
import math

In [8]:
def stats_correlation_testing(feature_1, feature_2):
  """Run a Pearson correlation test and print the result and its t-statistic.

  Prints the correlation coefficient and p-value from `stats.pearsonr`,
  then the equivalent t-statistic r * sqrt(n - 2) / sqrt(1 - r^2) together
  with its degrees of freedom (n - 2).

  Args:
    feature_1: First sequence of observations.
    feature_2: Second sequence of observations, paired with `feature_1`.

  Returns:
    Tuple `(corr_val, corr_alpha)`: the Pearson r and its p-value (the
    second element is a p-value despite the "alpha" naming).
  """
  corr_val, corr_alpha = stats.pearsonr(feature_1, feature_2)
  print(f'Correlation Value: {corr_val}, Alpha: {corr_alpha}')

  dof = len(feature_1) - 2
  unexplained = 1 - corr_val ** 2
  if unexplained == 0:
    # |r| == 1: the formula would divide by zero; the statistic is
    # unbounded, so report a signed infinity instead.
    t = math.copysign(math.inf, corr_val)
  else:
    t = (corr_val * math.sqrt(dof)) / math.sqrt(unexplained)
  print(t, dof)

  return corr_val, corr_alpha

In [9]:
def perform_correlation_testing(feature_1, feature_2, sample_size = 3500, seed = None):
  """Run the Pearson correlation test on a random subsample and on the full data.

  The same randomly chosen positions are taken from both features so the
  observations stay paired. The subsample result is reported as the
  'testing' phase and the full-data result as the 'validation' phase.

  Args:
    feature_1: First array-like of observations.
    feature_2: Second array-like of observations, paired with `feature_1`.
    sample_size: Number of pairs to draw without replacement. Clamped to
      the number of available pairs so small inputs do not raise.
    seed: Optional seed for a dedicated random generator, making the
      subsample reproducible. When None, NumPy's global random state is
      used, as before.

  Raises:
    ValueError: If the two features have different lengths.
  """
  # Positional indexing on plain arrays; a pandas Series would otherwise be
  # indexed by label.
  feature_1 = np.asarray(feature_1)
  feature_2 = np.asarray(feature_2)
  if len(feature_1) != len(feature_2):
    raise ValueError('feature_1 and feature_2 must have the same length')

  sample_size = min(sample_size, len(feature_1))
  rng = np.random if seed is None else np.random.default_rng(seed)
  indices = rng.choice(len(feature_1), sample_size, replace=False)
  test_set_feature_1 = feature_1[indices]
  test_set_feature_2 = feature_2[indices]

  _, test_stat_corr_alpha = stats_correlation_testing(test_set_feature_1, test_set_feature_2)
  make_conclusion(test_stat_corr_alpha, 'testing')

  _, validate_stat_corr_alpha = stats_correlation_testing(feature_1, feature_2)
  make_conclusion(validate_stat_corr_alpha, 'validation')



*   NULL Hypothesis: corr val = 0 --> UNCORRELATED (no linear relationship; this alone does not prove independence)
*   Alternate Hypothesis: corr val =/= 0 --> CORRELATED (linearly dependent)



In [10]:
# Correlation test between the AVE_LOANAMT and AVE_PROPVAL columns
# (random subsample first, then the full data).
perform_correlation_testing(df.AVE_LOANAMT.values, df.AVE_PROPVAL.values)

Correlation Value: 0.9767609461995553, Alpha: 0.0
269.5329673696587 3498
0.0 < 0.05: NULL hypothesis REJECTED in testing phase
Correlation Value: 0.9770737986403297, Alpha: 0.0
888.0454526278678 37443
0.0 < 0.05: NULL hypothesis REJECTED in validation phase


In [11]:
# Replace missing TOT_ORIG values with the column mean before it is used in the tests below.
# This overwrites the column in `df`, so the original missing-value pattern is lost from here on.
df['TOT_ORIG'] = df['TOT_ORIG'].fillna(df['TOT_ORIG'].mean())

In [12]:
# Correlation test between the AVE_INTRATE and TOT_ORIG columns.
# The subsample is drawn at random, so the 'testing' result varies between runs.
perform_correlation_testing(df.AVE_INTRATE.values, df.TOT_ORIG.values)

Correlation Value: 0.024649968270261734, Alpha: 0.1448371244946625
1.45833819505892 3498
0.1448371244946625 >= 0.05: NULL hypothesis ACCEPTED in testing phase
Correlation Value: 0.027364049475720357, Alpha: 1.183973808640816e-07
5.296980141841564 37443
1.183973808640816e-07 < 0.05: NULL hypothesis REJECTED in validation phase


In [13]:
# Same pair of columns with the arguments swapped: the full-data ('validation') result is
# unchanged, and only the freshly drawn random subsample differs.
perform_correlation_testing(df.TOT_ORIG.values, df.AVE_INTRATE.values)

Correlation Value: 0.0047083353131727405, Alpha: 0.7806662368289622
0.27847236348156945 3498
0.7806662368289622 >= 0.05: NULL hypothesis ACCEPTED in testing phase
Correlation Value: 0.027364049475720357, Alpha: 1.183973808640816e-07
5.296980141841564 37443
1.183973808640816e-07 < 0.05: NULL hypothesis REJECTED in validation phase


Mann-Whitney U Test

In [14]:
def stats_mannwhitneyU_testing(feature_1, feature_2):
  """Run a two-sided Mann-Whitney U test and print the result.

  Args:
    feature_1: First sample of observations.
    feature_2: Second sample of observations.

  Returns:
    Tuple `(val, alpha)`: the U statistic and its p-value (the second
    element is a p-value despite the "alpha" naming).
  """
  # State the alternative explicitly: the default for `alternative` has
  # differed between SciPy releases, and the conclusions drawn from this
  # p-value assume a two-sided test.
  val, alpha = stats.mannwhitneyu(feature_1, feature_2, alternative='two-sided')
  print(f'MannWhitneyU Value: {val}, Alpha: {alpha}')

  return val, alpha

In [15]:
def perform_mannwhitneyU_testing(feature_1, feature_2, sample_size = 3400, seed = None):
  """Run the Mann-Whitney U test on a random subsample and on the full data.

  The same randomly chosen positions are taken from both features. The
  subsample result is reported as the 'testing' phase and the full-data
  result as the 'validation' phase.

  Args:
    feature_1: First array-like of observations.
    feature_2: Second array-like of observations.
    sample_size: Number of positions to draw without replacement. Clamped
      to the length of the shorter feature so the draw and the indexing
      cannot go out of range.
    seed: Optional seed for a dedicated random generator, making the
      subsample reproducible. When None, NumPy's global random state is
      used, as before.
  """
  # Positional indexing on plain arrays; a pandas Series would otherwise be
  # indexed by label.
  feature_1 = np.asarray(feature_1)
  feature_2 = np.asarray(feature_2)

  population = min(len(feature_1), len(feature_2))
  sample_size = min(sample_size, population)
  rng = np.random if seed is None else np.random.default_rng(seed)
  indices = rng.choice(population, sample_size, replace=False)
  test_set_feature_1 = feature_1[indices]
  test_set_feature_2 = feature_2[indices]

  _, test_stat_alpha = stats_mannwhitneyU_testing(test_set_feature_1, test_set_feature_2)
  make_conclusion(test_stat_alpha, 'testing')

  _, validate_stat_alpha = stats_mannwhitneyU_testing(feature_1, feature_2)
  make_conclusion(validate_stat_alpha, 'validation')


T-Test

In [16]:
def stats_t_testing(feature, p0):
  """Run a one-sample t-test of `feature` against the hypothesised mean `p0`.

  Prints the t-statistic and p-value, then returns them as a
  `(t_value, p_value)` pair (the printed "Alpha" is the p-value).
  """
  outcome = stats.ttest_1samp(feature, p0)
  statistic, p_value = outcome
  report = f'T Value: {statistic}, Alpha: {p_value}'
  print(report)

  return statistic, p_value

In [17]:
def perform_t_testing(feature, p0 = 50, sample_size = 3500, seed = None):
  """Run one-sample t-tests on a random subsample and on the full data.

  Three tests are reported:
    * 'testing': the subsample mean against `p0`.
    * 'validation': the full-data mean against `p0`.
    * 'sample vs pop': the subsample mean against the full-data mean.
  The subsample mean and the full-data mean are printed between the last
  two.

  Args:
    feature: Array-like of observations (e.g. a DataFrame column).
    p0: Hypothesised mean for the first two tests.
    sample_size: Number of observations to draw. Clamped to the number of
      observations available.
    seed: Optional seed for a dedicated random generator, making the
      subsample reproducible. When None, NumPy's global random state is
      used, as before.
  """
  values = np.asarray(feature)
  sample_size = min(sample_size, len(values))
  rng = np.random if seed is None else np.random.default_rng(seed)
  # Draw without replacement, matching the other perform_*_testing helpers,
  # so the subsample never contains the same observation twice.
  test_set_feature = rng.choice(values, sample_size, replace=False)

  _, test_stat_alpha = stats_t_testing(test_set_feature, p0)
  make_conclusion(test_stat_alpha, 'testing')

  _, validate_stat_alpha = stats_t_testing(feature, p0)
  make_conclusion(validate_stat_alpha, 'validation')

  population_mean = np.mean(feature)
  print(np.mean(test_set_feature), population_mean)

  # Separate name: this p-value belongs to a different test from the
  # validation phase above.
  _, sample_vs_pop_alpha = stats_t_testing(test_set_feature, population_mean)
  make_conclusion(sample_vs_pop_alpha, 'sample vs pop')

In [22]:
# One-sample t-tests on the AVE_LTV column against a hypothesised mean of 75.
perform_t_testing(df.AVE_LTV, 75)

T Value: 1.6855948615908338, Alpha: 0.09196302323982113
0.09196302323982113 >= 0.05: NULL hypothesis ACCEPTED in testing phase
T Value: 7.629662690193031, Alpha: 2.4093454201861012e-14
2.4093454201861012e-14 < 0.05: NULL hypothesis REJECTED in validation phase
75.26950208712405 75.37621748902923
T Value: -0.6674491282183207, Alpha: 0.5045293098464224
0.5045293098464224 >= 0.05: NULL hypothesis ACCEPTED in sample vs pop phase


In [19]:
# One-sample t-tests on the AVE_AGE_BORROWER column against a hypothesised mean of 43.
perform_t_testing(df.AVE_AGE_BORROWER, 43)

T Value: 20.706300044159544, Alpha: 6.0956128474371725e-90
6.0956128474371725e-90 < 0.05: NULL hypothesis REJECTED in testing phase
T Value: 67.87150582237888, Alpha: 0.0
0.0 < 0.05: NULL hypothesis REJECTED in validation phase
44.451192061283464 44.45317349887617
T Value: -0.028272096029123606, Alpha: 0.9774467481665218
0.9774467481665218 >= 0.05: NULL hypothesis ACCEPTED in sample vs pop phase


In [20]:
# One-sample t-tests on the AVE_LOANAMT column; no p0 is passed, so the function's default of 50 is used.
perform_t_testing(df.AVE_LOANAMT)

T Value: 56.23194124991072, Alpha: 0.0
0.0 < 0.05: NULL hypothesis REJECTED in testing phase
T Value: 180.2044185311693, Alpha: 0.0
0.0 < 0.05: NULL hypothesis REJECTED in validation phase
242.20505411232241 242.11489885475754
T Value: 0.026376024138241052, Alpha: 0.9789589215761864
0.9789589215761864 >= 0.05: NULL hypothesis ACCEPTED in sample vs pop phase
