In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import t, pearsonr, spearmanr, kendalltau
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis 
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
sns.set()

# Read the two input tables; both CSV paths are resolved relative to the
# notebook's working directory.
df_lu, df_nlu = (
    pd.read_csv(csv_path)
    for csv_path in ('lu_selected.csv', 'nlu_selected.csv')
)

In [2]:
def bootstrap_on_extracted_columns(df, column_names, num_samples=1000, random_state=None):
    """Bootstrap the per-column means of a subset of columns.

    The selected columns are cast to float, then the rows are resampled with
    replacement ``num_samples`` times (each resample has as many rows as the
    input) and the mean of every selected column is recorded for each resample.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    column_names : list
        Labels of the columns to resample; each must be castable to float.
    num_samples : int, default 1000
        Number of bootstrap replicates to draw.
    random_state : int, numpy.random.RandomState or None, default None
        Source of randomness. ``None`` keeps the historical behaviour of
        drawing from NumPy's global stream (so ``np.random.seed`` still
        controls it); an int or ``RandomState`` makes the call reproducible
        on its own without touching global state.

    Returns
    -------
    pandas.DataFrame
        One row per replicate and one column per selected column, holding the
        mean of that column in the replicate.

    Raises
    ------
    KeyError
        If a label in ``column_names`` is not a column of ``df``.
    ValueError
        If a selected column cannot be converted to float.
    """
    # Selecting with a list and casting both produce new objects, so the
    # caller's frame is never modified.
    extracted_df = df[column_names].astype(float)

    if random_state is None:
        rng = np.random  # module-level functions share the global stream
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    num_rows = extracted_df.shape[0]

    bootstrap_statistics = []
    for _ in range(num_samples):
        # Row positions drawn with replacement; same size as the original sample.
        bootstrap_indices = rng.choice(num_rows, size=num_rows, replace=True)
        bootstrap_statistics.append(extracted_df.iloc[bootstrap_indices].mean())

    return pd.DataFrame(bootstrap_statistics)

In [3]:
# Columns that are bootstrapped and later regressed against one another.
selected_columns = [
    'Plasmid Size (bp)',
    'Total Paste Weight',
    'Specific Yield',
    'Percent Monomer',
    'Percent Dimer',
    'Percent Trimer+',
    'Total Thaw Time (min)',
    'LAD1-1 Conductivity (mS/cm)',
    'LAD2-1 Conductivity (mS/cm)',
    'LAD3-1 Conductivity (mS/cm)',
    'AEX1 Load Challenge (g pDNA/L resin)',
    'COA gDNA %',
    'gDNA % (CTAC)',
    'Final g (Delivered)',
]

# The bootstrap draws from NumPy's global random stream; seeding it here makes
# every downstream number identical after Restart Kernel -> Run All.
np.random.seed(42)

# Chained .dropna() instead of inplace=True so re-running the cell is idempotent.
lu_selected = bootstrap_on_extracted_columns(df_lu, selected_columns, num_samples=1000).dropna()
nlu_selected = bootstrap_on_extracted_columns(df_nlu, selected_columns, num_samples=1000).dropna()

In [4]:
def run_linear_regression(data, feature_columns):
    """Fit a one-predictor linear regression for every ordered pair of distinct columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every column named in ``feature_columns``.
    feature_columns : list
        Column labels; each one is used in turn as the target and regressed on
        each of the other columns individually.

    Returns
    -------
    pandas.DataFrame
        One row per (Target, Feature) pair with the columns 'Feature',
        'Target', 'Coefficient', 'Intercept' and 'R-squared'. The R-squared is
        the model's score on the same data it was fitted on.
    """
    # Ordered (target, predictor) pairs with the target varying slowest;
    # a column is never regressed on itself.
    column_pairs = [
        (target_name, predictor_name)
        for target_name in feature_columns
        for predictor_name in feature_columns
        if predictor_name != target_name
    ]

    fit_rows = []
    for target_name, predictor_name in column_pairs:
        pair_matrix = data[[predictor_name, target_name]].values
        predictor = pair_matrix[:, 0].reshape(-1, 1)  # the estimator expects a 2-D X
        response = pair_matrix[:, 1]

        fitted = LinearRegression().fit(predictor, response)
        fit_rows.append({
            'Feature': predictor_name,
            'Target': target_name,
            'Coefficient': fitted.coef_[0],
            'Intercept': fitted.intercept_,
            'R-squared': fitted.score(predictor, response),
        })

    return pd.DataFrame(fit_rows)

In [5]:
# Pairwise single-predictor regressions over the bootstrapped column means.
# Despite the `_corr` suffix these frames hold regression coefficients,
# intercepts and R-squared values.
df_lu_corr = run_linear_regression(data=lu_selected, feature_columns=selected_columns)
df_nlu_corr = run_linear_regression(data=nlu_selected, feature_columns=selected_columns)

In [6]:
# Re-key each regression table by (Target, Feature). The rows keep their
# existing order; no sorting happens here despite the `_sorted` names.
df_lu_sorted = df_lu_corr.set_index(keys=['Target', 'Feature'])
df_nlu_sorted = df_nlu_corr.set_index(keys=['Target', 'Feature'])

In [7]:
# Regressions whose target is 'COA gDNA %', keyed by (Target, Feature).
# Each frame is filtered with a mask built from that same frame, and
# set_index returns a new frame rather than mutating a filtered slice.
df_COA_lu = (
    df_lu_corr[df_lu_corr['Target'] == 'COA gDNA %']
    .set_index(['Target', 'Feature'])
)

df_COA_nlu = (
    df_nlu_corr[df_nlu_corr['Target'] == 'COA gDNA %']
    .set_index(['Target', 'Feature'])
)

# One workbook with a sheet per dataset: writing both frames straight to the
# same path leaves only the last one on disk. A dedicated file name is used
# because 'results.xlsx' is written again further down the notebook.
# reset_index() puts Target/Feature back as ordinary columns so the labels are
# kept in the sheet (index=False on the indexed frame would discard them).
with pd.ExcelWriter('results_coa.xlsx') as writer:
    df_COA_nlu.reset_index().to_excel(writer, sheet_name='nlu', index=False)
    df_COA_lu.reset_index().to_excel(writer, sheet_name='lu', index=False)


In [None]:
# Mean R-squared across all pairwise regressions: lu on the first line, nlu on the second.
print(
    df_lu_sorted['R-squared'].mean(),
    df_nlu_sorted['R-squared'].mean(),
    sep='\n',
)

In [9]:
def find_top_values(dataframe, column_name, n=20):
    """Return the rows holding the ``n`` largest values of one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to select from.
    column_name : str
        Column whose values are ranked.
    n : int, default 20
        Number of rows to keep. The default matches the previously
        hard-coded value, so existing two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows ordered from largest to smallest ``column_name``,
        keeping their original index labels.
    """
    return dataframe.nlargest(n, column_name)

# Highest-R-squared regressions for each dataset. The `_10` names refer to the
# size after every other row is removed downstream; at this point each frame
# holds the 20 rows returned by find_top_values.
df_nlu_10 = find_top_values(dataframe=df_nlu_corr, column_name='R-squared')
df_lu_10 = find_top_values(dataframe=df_lu_corr, column_name='R-squared')

In [10]:
def removes_duplicates(dataframe):
    """Keep every other row of ``dataframe`` (positions 0, 2, 4, ...).

    Rows are selected by position, so the result is correct even when index
    labels repeat; dropping by label would remove every row sharing a label
    with a dropped row. The surviving rows keep their original index labels.

    NOTE(review): this only removes "duplicates" if each row is immediately
    followed by its mirror (e.g. the A->B and B->A regressions sitting next to
    each other once ranked by R-squared) -- confirm that holds for the input.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose odd-positioned rows should be discarded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows at even positions.
    """
    # .copy() so the caller gets an independent frame rather than a slice view.
    return dataframe.iloc[::2].copy()

In [11]:
# Thin the top-R-squared tables down to every other row.
nlu_results = removes_duplicates(dataframe=df_nlu_10)
lu_results = removes_duplicates(dataframe=df_lu_10)

In [12]:
# One workbook, one sheet per dataset: writing both frames straight to the
# same path would leave only the last one (lu) on disk.
with pd.ExcelWriter('results.xlsx') as writer:
    nlu_results.to_excel(writer, sheet_name='nlu', index=False)
    lu_results.to_excel(writer, sheet_name='lu', index=False)

In [None]:
# Render both result tables, nlu first and then lu.
display(nlu_results, lu_results)