In [None]:
import polars as pl
from scipy.stats import pearsonr, stats

In [None]:
# Load "data_full.csv" (path relative to the working directory) into a Polars DataFrame.
df = pl.read_csv("data_full.csv")

In [None]:
# Pearson correlation (coefficient and p-value) of every numeric column
# against 'total_pimples'.

# Identify numerical columns (only Int64 / Float64 dtypes are considered)
numerical_columns = [col for col, dtype in df.schema.items() if dtype in [pl.Int64, pl.Float64]]

# Maps column name -> {'correlation_coef': ..., 'p_value': ...}
correlation_results = {}

# The target series is the same for every comparison, so extract it once
# instead of rebuilding the list on each iteration.
pimples_data = df['total_pimples'].to_list()

# Calculate correlations
# NOTE(review): assumes the compared columns contain no nulls -- confirm against the data.
for column in numerical_columns:
    if column == 'total_pimples':  # Avoid self-correlation
        continue

    # Extract the column data as a list
    col_data = df[column].to_list()

    # Use the `pearsonr` imported from the public scipy.stats namespace;
    # going through `stats.pearsonr` relies on the deprecated
    # scipy.stats.stats module.
    correlation_coef, p_value = pearsonr(col_data, pimples_data)
    correlation_results[column] = {'correlation_coef': correlation_coef, 'p_value': p_value}

# Sort results by p-value in descending order (largest p-value first)
sorted_results = sorted(correlation_results.items(), key=lambda x: x[1]['p_value'], reverse=True)

# Print sorted results
for column, result in sorted_results:
    print(f"Column: {column}, Correlation Coefficient: {result['correlation_coef']:.4f}, p-value: {result['p_value']:.4f}")


In [None]:
def calculate_pearson_correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both arguments are handed straight to ``pearsonr``. Only the first
    element of its result (the coefficient) is returned; the second
    element (the p-value) is discarded.
    """
    result = pearsonr(x, y)
    return result[0]

# Calculate Pearson correlation coefficients with total_pimples.
# Note: this rebinds `correlation_results` (and `sorted_results` below); here
# each value is a bare coefficient rather than a dict with a p-value.

# The target column is identical for every comparison, so extract it once
# instead of rebuilding the list on each iteration.
pimples_data = df['total_pimples'].to_list()

correlation_results = {}
for column in df.columns:
    # Skip the target itself and anything that is not Int64 / Float64
    if column != 'total_pimples' and df[column].dtype in [pl.Int64, pl.Float64]:
        correlation_results[column] = calculate_pearson_correlation(df[column].to_list(), pimples_data)

# Sort correlation results in descending order of absolute values
# (strongest correlation first, regardless of sign)
sorted_results = sorted(correlation_results.items(), key=lambda x: abs(x[1]), reverse=True)

# Print sorted results
print("Correlation with total_pimples:")
for column, correlation_coef in sorted_results:
    print(f"Column: {column}, Correlation Coefficient: {correlation_coef:.4f}")