In [None]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [42]:
# Read the CSV file into `df`, the DataFrame every later cell works from.
csv_path = "tmp/PUSHdata_Dev_4.14.24.csv"
df = pd.read_csv(csv_path)

In [43]:
# --- Columns read from `df` -------------------------------------------------
# Two predictor columns (each regressed separately) and one outcome column.
x_data_column = "whrs_mc"
x2_data_column = "blng_mc"
y_data_column = "psqitotal"

# --- Text shown on the figure -----------------------------------------------
figure_title = "Hours Worked and Belongingness as Predictors of Subjective Sleep"
x_axis_title = "Mean Centered Hours Worked and Belongingness"
y_axis_title = "Subjective Sleep"

# Value printed as "b*" in the legend entries and in the on-plot annotation.
# NOTE(review): entered by hand, not computed anywhere in this notebook —
# confirm where it comes from.
beta = 0.09

# --- Appearance -------------------------------------------------------------
point_color = "#f6b26b"  # used for the scatter markers
line_color = "#ab99e2"   # used for the regression lines and the annotation text
width, height = 800, 550  # handed to fig.update_layout(width=..., height=...)

In [44]:
# Helper for coercing the analysis columns to floats; values that cannot be
# converted become None.
def safe_float(val):
    """Coerce ``val`` to ``float``; give back ``None`` when that is impossible.

    A ``ValueError`` (e.g. non-numeric text) or ``TypeError`` (e.g. ``None``)
    raised by ``float()`` is treated as "not convertible". Any other exception
    propagates to the caller.
    """
    try:
        result = float(val)
    except (ValueError, TypeError):
        result = None
    return result

# Coerce both predictor columns and the outcome column element-wise with
# safe_float; entries that cannot be converted come back as None.
for column_name in (x_data_column, x2_data_column, y_data_column):
    df[column_name] = df[column_name].apply(safe_float)


In [None]:
# Single-predictor linear fit of y_data_column on x_data_column.
# NOTE(review): X, y, reg and r2 are generic names that the next regression
# cell assigns again; only df1_clean, x1_line and y1_line are specific to
# this fit.

# Keep only the rows where both the predictor and the outcome are present.
df1_clean = df[[x_data_column, y_data_column]].dropna()

# Double brackets keep X two-dimensional (one column); y stays one-dimensional.
X = df1_clean[[x_data_column]].values
y = df1_clean[y_data_column].values

# fit() returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X, y)

# Two endpoints (smallest and largest observed x) are enough to draw the line.
x1_observed = df1_clean[x_data_column]
x1_line = np.array([x1_observed.min(), x1_observed.max()])
y1_line = reg.predict(x1_line[:, np.newaxis])

# R-squared of the fit, evaluated on the same rows it was trained on.
r2 = r2_score(y, reg.predict(X))

In [None]:
# Single-predictor linear fit of y_data_column on x2_data_column.
# NOTE(review): this cell reassigns X, y, reg and r2, so after it runs those
# names describe this fit only. The plot cell reads `reg`.

# Rows missing either the second predictor or the outcome are dropped.
df2_clean = df[[x2_data_column, y_data_column]].dropna()

# Feature matrix with a single column, and the matching target vector.
X = df2_clean[[x2_data_column]].values
y = df2_clean[y_data_column].values

# fit() returns the fitted estimator, so it can be assigned directly.
reg = LinearRegression().fit(X, y)

# The line is drawn between the minimum and maximum observed predictor values.
x2_observed = df2_clean[x2_data_column]
x2_line = np.array([x2_observed.min(), x2_observed.max()])
y2_line = reg.predict(x2_line[:, np.newaxis])

# R-squared of this fit on its own training rows.
r2 = r2_score(y, reg.predict(X))

In [None]:
# Build the combined figure: one marker trace and one fitted line per
# predictor, both plotted against the same outcome column.
fig = go.Figure()

# Marker traces. Both use `point_color`; only the marker symbol and the legend
# name distinguish the two predictors.
scatter_layers = [
    (df1_clean[x_data_column], df1_clean[y_data_column], 'circle', 'Hours Worked'),
    (df2_clean[x2_data_column], df2_clean[y_data_column], 'square', 'Belongingness'),
]
for xs, ys, symbol, label in scatter_layers:
    fig.add_trace(go.Scatter(
        x=xs,
        y=ys,
        mode='markers',
        marker=dict(color=point_color, symbol=symbol),
        name=label
    ))

# Fitted-line traces, added after the markers.
# NOTE(review): both legend entries (and the annotation below) print the same
# `beta`, and both lines share `line_color`, so the two lines can only be told
# apart by position — confirm a single coefficient is intended for both.
line_layers = [
    (x1_line, y1_line, 'Hours Worked'),
    (x2_line, y2_line, 'Belongingness'),
]
for xs, ys, label in line_layers:
    fig.add_trace(go.Scatter(
        x=xs,
        y=ys,
        mode='lines',
        line=dict(color=line_color, width=5),
        name=f'{label} LR (b* = {beta})'
    ))

# Label the max-x end of the Belongingness line with the b* value.
# The text is anchored by its bottom-right corner and nudged upward.
fig.add_annotation(
    x=x2_line[1],
    y=y2_line[1],
    text=f"<b>b* = {beta}</b>",
    showarrow=False,
    xanchor="right",
    yanchor="bottom",
    font=dict(color=line_color, size=26),
    yshift=12  # shift annotation upward
)

# Title, axis titles, figure size, legend box and transparent backgrounds.
fig.update_layout(
    title=dict(
        text=f"<b>{figure_title}</b>",
        font=dict(color='black', size=28),
        x=0.5
    ),
    xaxis_title=x_axis_title,
    yaxis_title=y_axis_title,
    width=width,
    height=height,
    showlegend=True,
    legend=dict(
        x=0.02,
        y=0.98,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='black',
        borderwidth=1
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)'
)

# Both axes get the same styling: no grid, no zero line, a black axis line,
# and black title/tick fonts.
axis_style = dict(
    showgrid=False,
    showline=True,
    zeroline=False,
    linecolor='black',
    title_font=dict(color='black', size=24),
    tickfont=dict(color='black', size=20)
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Show the plot
fig.show()

In [None]:
# Debug: report how much usable data each analysis column holds.
def _report_column(heading, column):
    """Print ``heading``, then the name, non-null count and min/max of ``df[column]``."""
    values = df[column]
    print(heading)
    print(f"  Column: {column}")
    print(f"  Non-null values: {values.notna().sum()}")
    print(f"  Range: {values.min():.2f} to {values.max():.2f}")


# Headings after the first start with a newline so the sections are separated
# by a blank line.
for heading, column in (
    ("Hours Worked data:", x_data_column),
    ("\nBelongingness data:", x2_data_column),
    ("\nSleep data:", y_data_column),
):
    _report_column(heading, column)

# Row counts left in each regression frame after dropna().
print(f"\nClean Hours Worked data points: {len(df1_clean)}")
print(f"Clean Belongingness data points: {len(df2_clean)}")
