In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import t
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
sns.set()

In [None]:
# Load the two cohorts, then trim each frame to its first 99 columns
# (every column from position 99 onward is discarded).
df_nlu = pd.read_csv('non-lock_up.csv')
df_nlu = df_nlu.drop(columns=df_nlu.columns[99:])

df_lu = pd.read_csv('lock_up.csv')
df_lu = df_lu.drop(columns=df_lu.columns[99:])

# Show both trimmed frames, non-lock-up first.
display(df_nlu, df_lu)

Calculating p-values (Welch's t-test per column)

In [None]:
def calculate_p_values(dataframe1, dataframe2):
    """Run Welch's t-test on every numeric column shared by two dataframes.

    A column is tested only when it is present in both frames, has a numeric
    dtype in both, and retains more than one value in each frame once NaNs
    are dropped. All other columns are silently skipped.

    Parameters
    ----------
    dataframe1, dataframe2 : pandas.DataFrame
        The two groups to compare, one variable per column.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'p_value'`` with one column per tested
        variable. Columns follow the order in which they appear in
        ``dataframe1``, so the layout is identical on every run.
    """
    p_values = {}

    # Walk dataframe1's columns in their original order rather than iterating
    # a set: set ordering of strings changes between interpreter sessions,
    # which made the output column order non-reproducible.
    for column in dataframe1.columns:
        if column not in dataframe2.columns:
            continue

        series1 = dataframe1[column]
        series2 = dataframe2[column]
        if not (pd.api.types.is_numeric_dtype(series1)
                and pd.api.types.is_numeric_dtype(series2)):
            continue

        # Missing values are dropped independently per group.
        numeric_values1 = series1.dropna()
        numeric_values2 = series2.dropna()

        # A t-test needs at least two observations per group.
        if len(numeric_values1) > 1 and len(numeric_values2) > 1:
            # equal_var=False selects Welch's test (no equal-variance assumption).
            _, p_value = stats.ttest_ind(numeric_values1, numeric_values2, equal_var=False)
            p_values[column] = p_value

    result_df = pd.DataFrame.from_dict(p_values, orient='index', columns=['p_value'])

    # Transpose so variables are columns and 'p_value' is the single row.
    return result_df.T

# Compare the non-lock-up cohort against the lock-up cohort, column by column.
p_values = calculate_p_values(dataframe1=df_nlu, dataframe2=df_lu)
display(p_values)

Filtering for Statistically Significant p-values (p < 0.05)

In [None]:
# Keep only the statistically significant results.
SIGNIFICANCE_LEVEL = 0.05

# One boolean per column: True when every entry in that column is below the
# threshold. NaN p-values compare as False, so they are excluded.
is_significant = (p_values < SIGNIFICANCE_LEVEL).all()

# Select all significant columns in a single step instead of growing a
# DataFrame one column at a time. Despite the name, nothing is sorted here:
# the column order is inherited from p_values.
sorted_p_values = p_values.loc[:, is_significant].copy()

display(sorted_p_values)

Calculating Hedges' g Values

In [None]:
def hedges_g(dataframe1, dataframe2):
    """Compute the absolute Hedges' g effect size for each shared numeric column.

    For every column that is present in both frames, numeric in both, and has
    more than one non-NaN value in each, the effect size is

        g = |mean1 - mean2| / pooled_sd * J,    J = 1 - 3 / (4 * dof - 1)

    where ``pooled_sd`` is the pooled sample standard deviation (ddof=1),
    ``dof = n1 + n2 - 2`` and ``J`` is the small-sample bias correction.

    The previous implementation also divided by ``sqrt(1/n1 + 1/n2)``, which
    yields the pooled t-statistic (it grows with sample size) rather than an
    effect size, and never applied the correction that ``dof`` was computed for.

    Parameters
    ----------
    dataframe1, dataframe2 : pandas.DataFrame
        The two groups to compare, one variable per column.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'hedges_g'`` with one column per evaluated
        variable, in the order the columns appear in ``dataframe1``. Values
        are non-negative. When the pooled standard deviation is zero the
        value is NaN if the means are equal and ``inf`` otherwise.
    """
    hedges_g_values = {}

    # Iterate in dataframe1's column order (not over a set) so the output
    # layout is the same on every run.
    for column in dataframe1.columns:
        if column not in dataframe2.columns:
            continue

        series1 = dataframe1[column]
        series2 = dataframe2[column]
        if not (pd.api.types.is_numeric_dtype(series1)
                and pd.api.types.is_numeric_dtype(series2)):
            continue

        # Missing values are dropped independently per group.
        numeric_values1 = series1.dropna()
        numeric_values2 = series2.dropna()

        n1 = len(numeric_values1)
        n2 = len(numeric_values2)
        # A sample standard deviation needs at least two observations per group.
        if n1 <= 1 or n2 <= 1:
            continue

        s1 = np.std(numeric_values1, ddof=1)
        s2 = np.std(numeric_values2, ddof=1)

        dof = n1 + n2 - 2
        pooled_sd = np.sqrt(((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / dof)
        mean_difference = np.mean(numeric_values1) - np.mean(numeric_values2)

        if pooled_sd == 0:
            # No within-group spread: the ratio is undefined (0/0) or unbounded.
            # Set it explicitly instead of relying on a float division warning.
            g_value = np.nan if mean_difference == 0 else np.inf
        else:
            # Small-sample bias correction that turns Cohen's d into Hedges' g.
            correction = 1 - 3 / (4 * dof - 1)
            g_value = abs(mean_difference / pooled_sd) * correction

        hedges_g_values[column] = g_value

    result_df = pd.DataFrame.from_dict(hedges_g_values, orient='index', columns=['hedges_g'])

    # Transpose so variables are columns and 'hedges_g' is the single row.
    return result_df.T

# Effect size of the non-lock-up vs. lock-up difference for every shared column.
hedges_results = hedges_g(dataframe1=df_nlu, dataframe2=df_lu)
display(hedges_results)

Grouping Hedges' g Values by Effect Size

In [11]:
# Bucket every column of hedges_results by the magnitude of its effect size:
#   small  : |g| < 0.2
#   medium : 0.2 <= |g| < 0.8
#   large  : everything else with a defined value
small_effect = []
medium_effect = []
large_effect = []
# Columns whose effect size is NaN. Previously these fell through the final
# `else` and were reported as large effects.
undefined_effect = []

for column in hedges_results.columns:
    magnitudes = hedges_results[column].abs()

    if magnitudes.isna().any():
        # NaN fails every comparison, so it must be caught before bucketing.
        undefined_effect.append(column)
    elif (magnitudes < 0.2).all():
        small_effect.append(column)
    elif ((magnitudes >= 0.2) & (magnitudes < 0.8)).all():
        medium_effect.append(column)
    else:
        large_effect.append(column)

In [None]:
# Column header -> variables in that effect-size bucket.
# Named `effect_groups` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
effect_groups = {
    'Large Effect': large_effect,
    'Medium Effect': medium_effect,
    'Small Effect': small_effect,
}

max_length = max(len(members) for members in effect_groups.values())

# DataFrame columns must have equal length, so pad the shorter lists with NaN.
# List concatenation builds new lists; the *_effect lists are left untouched.
padded_groups = {
    label: members + [float('nan')] * (max_length - len(members))
    for label, members in effect_groups.items()
}

hedges_g_results = pd.DataFrame(padded_groups)

display(hedges_g_results)