In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the writing-center dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='writing_center.csv')

In [None]:
# Preview the first 15 columns (positions 0-14) of the frame.
df.iloc[:, 0:15]

In [None]:
# Preview the following block of columns (positions 15-28).
df.iloc[:, slice(15, 29)]

## Success is predicted by instructor

In [None]:
# One-hot encode the instructor identifier; any other column handed to the
# transformer would be passed through unchanged.
instructor_onehot = ('instructor_encoder', OneHotEncoder(), ['Instructor_ID'])
preprocessor = ColumnTransformer(
    transformers=[instructor_onehot],
    remainder='passthrough',
)

# Chain the encoding step and the logistic regression into one estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on every row, with the instructor as the only predictor of the success flag.
model.fit(df[['Instructor_ID']], df['Main_Course_SuccessFlag'])

# Keep the fitted parameters around so the next cell can inspect them.
fitted_classifier = model.named_steps['classifier']
coefficients = fitted_classifier.coef_
intercept = fitted_classifier.intercept_


In [None]:
# Display the coefficient array taken from the fitted instructor-only classifier.
coefficients

## Success predicted by writing center is different for each instructor

In [None]:
def model_coefs(features, data=None):
    """Fit a separate logistic regression for each instructor (no pooling).

    For every distinct ``Instructor_ID`` the rows belonging to that instructor
    are used to fit ``Main_Course_SuccessFlag ~ features``; the fit is then
    scored on those same rows with log loss.

    Parameters
    ----------
    features : list of str
        Feature column names. Only the coefficient of the FIRST listed feature
        is collected, so the feature of interest must come first.
    data : pandas.DataFrame, optional
        Frame holding ``Instructor_ID``, ``Main_Course_SuccessFlag`` and the
        feature columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    coefficients : list of float
        Coefficient of ``features[0]`` for each fitted instructor.
    errors : list of float
        In-sample log loss for each fitted instructor.
    sizes : list of int
        Number of rows for each fitted instructor.

    The three lists are parallel (same length, same instructor order).

    Notes
    -----
    Instructors whose rows contain fewer than two distinct outcome values
    (including instructors with no matching rows, e.g. a missing ID) are
    skipped with a ``RuntimeWarning``: a classifier cannot be fitted on a
    single class, and one such instructor would otherwise abort the whole run.
    """
    if data is None:
        # Backward-compatible default: use the frame loaded at the top of the notebook.
        data = df

    coefficients = []
    errors = []
    sizes = []

    # Loop through unique instructor IDs
    for ID in data['Instructor_ID'].unique().tolist():
        df_in = data[data['Instructor_ID'] == ID]
        X = df_in[features]
        y = df_in['Main_Course_SuccessFlag']

        if y.nunique() < 2:
            warnings.warn(
                f"Instructor {ID!r} skipped: outcome has fewer than two classes "
                f"in {df_in.shape[0]} rows",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        model = LogisticRegression()
        model.fit(X, y)

        # In-sample predicted class probabilities, scored with log loss.
        probabilities = model.predict_proba(X)
        current_log_loss = log_loss(y, probabilities)

        # Record everything only after a successful fit so the lists stay aligned.
        coefficients.append(model.coef_[0][0])
        errors.append(current_log_loss)
        sizes.append(df_in.shape[0])

    return coefficients, errors, sizes

In [None]:
def bootstrap(data, n_boot=10000, seed=None):
    """Bootstrap the mean and standard deviation of a 1-D sample.

    Parameters
    ----------
    data : sequence of numbers
        The observed sample; must be one-dimensional and non-empty.
    n_boot : int, default 10000
        Number of bootstrap replicates to draw.
    seed : int, optional
        When given, resampling uses an independent ``np.random.default_rng(seed)``
        so the result is reproducible. When omitted, draws come from NumPy's
        global random state, so a prior ``np.random.seed`` call still applies.

    Returns
    -------
    bootstrapped_means : numpy.ndarray, shape (n_boot,)
        Mean of each resample.
    bootstrapped_std : numpy.ndarray, shape (n_boot,)
        Standard deviation of each resample (``np.std`` default, i.e. ddof=0).

    Raises
    ------
    ValueError
        If ``data`` is empty or not one-dimensional.
    """
    values = np.asarray(data)
    if values.ndim != 1 or values.size == 0:
        raise ValueError("bootstrap() needs a non-empty 1-D sequence of values")

    # Pick the sampler once: global stream by default, isolated generator when seeded.
    draw = np.random.choice if seed is None else np.random.default_rng(seed).choice

    bootstrapped_means = np.zeros(n_boot)
    bootstrapped_std = np.zeros(n_boot)

    for i in range(n_boot):
        # Resample with replacement, same size as the observed sample.
        resample = draw(values, size=values.size, replace=True)
        bootstrapped_means[i] = np.mean(resample)
        bootstrapped_std[i] = np.std(resample)

    return bootstrapped_means, bootstrapped_std

In [None]:
# Per-instructor fits: first with the writing-center column alone, then with
# the additional covariate columns. WR_Center stays first so that its
# coefficient is the one model_coefs collects.
covariate_features = [
    'WR_Center',
    'Age',
    'Military',
    'FirstGen',
    'FosterYouth',
    'DSPS',
    'FinAid',
    'K12_Student',
    'First_Time_College_Student',
    'International',
    'Nonresident',
]

coefficients_reg, errors_reg, size_reg = model_coefs(['WR_Center'])
coefficients_cov, errors_cov, size_cov = model_coefs(covariate_features)

In [None]:
# Overlay the per-instructor WR_Center coefficients from both specifications.
# Labels and a legend are needed to tell the two translucent histograms apart.
plt.hist(coefficients_reg, alpha=0.5, label='No covariates')
plt.hist(coefficients_cov, alpha=0.5, label='With covariates')
plt.xlabel('WR_Center coefficient (one per instructor)')
plt.ylabel('Number of instructors')
plt.title('Per-instructor WR_Center coefficients, no pooling')
plt.legend()

# Unweighted mean coefficient across instructors for each specification.
print('mean coefficient, no covariates:', np.mean(coefficients_reg))
print('mean coefficient, with covariates:', np.mean(coefficients_cov))

In [None]:
# Seed NumPy's global random state first: bootstrap() resamples through it, so
# without a seed every re-run of the notebook would give different distributions.
np.random.seed(0)

bootstrap_means_reg, bootstrap_std_reg = bootstrap(coefficients_reg)
bootstrap_means_cov, bootstrap_std_cov = bootstrap(coefficients_cov)

In [None]:
# Density of the bootstrapped mean coefficient, one curve per specification,
# drawn on a shared set of axes.
ax = sns.kdeplot(bootstrap_means_reg, label='No covariates', fill=True)
sns.kdeplot(bootstrap_means_cov, label='With covariates', fill=True, ax=ax)
ax.legend()
ax.set_title('Bootstrap Means of coefficient of score ~ writing_center, instructor no pooling')

In [None]:
# Density of the bootstrapped standard deviation of the coefficients, one
# curve per specification, drawn on a shared set of axes.
ax = sns.kdeplot(bootstrap_std_reg, label='No covariates', fill=True)
sns.kdeplot(bootstrap_std_cov, label='With covariates', fill=True, ax=ax)
ax.legend()
ax.set_title('Bootstrap STD of coefficient of score ~ writing_center, instructor no pooling')

In [None]:
# Overlay the per-instructor in-sample log losses from both specifications.
# Labels and a legend are needed to tell the two translucent histograms apart.
plt.hist(errors_reg, alpha=0.5, label='No covariates')
plt.hist(errors_cov, alpha=0.5, label='With covariates')
plt.xlabel('In-sample log loss (one per instructor)')
plt.ylabel('Number of instructors')
plt.title('Per-instructor log loss, no pooling')
plt.legend();

## Gender

In [None]:
# Integer-encode the gender labels in place; the pooled-model cell further
# down reads the encoded df['Gender'] column.
# NOTE(review): integer codes impose an ordering if Gender has more than two
# categories - confirm the column is binary, otherwise one-hot encode it.
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Per-instructor (no pooling) fits of success ~ gender. This reuses the
# model_coefs helper defined earlier rather than repeating its loop.
coefficients_gen, errors_gen, sizes_gen = model_coefs(['Gender'])

In [None]:
# Distribution of the per-instructor gender coefficients.
plt.hist(x=coefficients_gen)

In [None]:
# Complete pooling: a single logistic regression of success on gender, fitted
# on all rows at once. The feature is reshaped to a single-column 2-D array.
gender_X = df['Gender'].array.reshape(-1,1)
success_y = df['Main_Course_SuccessFlag']

model = LogisticRegression()
model.fit(gender_X, success_y)

# In-sample class probabilities and the resulting log loss of the pooled model.
probabilities = model.predict_proba(gender_X)
log_loss_grand = log_loss(success_y, probabilities)

# Gender coefficient of the pooled model.
coef_grand = model.coef_[0][0]

In [None]:
 # Per-instructor gender coefficients, with the pooled coefficient marked in red.
 plt.hist(x=coefficients_gen)
 plt.axvline(coef_grand, color='red')

In [None]:
# Per-instructor log losses, with the pooled model's log loss marked in red.
plt.hist(x=errors_gen)
plt.axvline(log_loss_grand, color='red')

In [None]:
# Compare the in-sample log loss of the single pooled gender model with that
# of the per-instructor gender models.
print('complete pooling loss:', log_loss_grand)
print('no pooling', np.mean(errors_gen))

# The plain mean above gives every instructor equal weight whatever the number
# of rows. Weighting each instructor's loss by its row count gives the per-row
# loss of the no-pooling models, which is what the pooled figure measures.
print('no pooling (weighted by instructor size):', np.average(errors_gen, weights=sizes_gen))