In [18]:
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
from docx import Document
import os

# Step 1: Load the dataset
# Replace with the absolute path to your dataset
dataset_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/cleaned_data.csv'
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")
cleaned_data = pd.read_csv(dataset_path)

# Step 2: Define the regression models

# Model 1: Baseline model
model_1_formula = 'SD_diff ~ Aristotle + Shock + Aristotle:Shock'
model_1 = ols(model_1_formula, data=cleaned_data).fit()

# Model 2: Add V_2 and V_diff_nonabs
model_2_formula = model_1_formula + ' + V_2 + V_diff_nonabs'
model_2 = ols(model_2_formula, data=cleaned_data).fit()

# Model 3: Add SD_1
model_3_formula = model_2_formula + ' + SD_1'
model_3 = ols(model_3_formula, data=cleaned_data).fit()

# Model 4: Add GPT_Usage and Algorithmic_Liking along with SD_1
model_4_formula = model_3_formula + ' + GPT_Usage + Algorithmic_liking'
model_4 = ols(model_4_formula, data=cleaned_data).fit()

# Model 5: Add Knowledge_Depth_1 and Knowledge_Depth_2, exclude GPT_Usage and Algorithmic_Liking
model_5_formula = model_3_formula + ' + Knowledge_Depth_1 + Knowledge_Depth_2'
model_5 = ols(model_5_formula, data=cleaned_data).fit()

# Collect models into a list
models = [model_1, model_2, model_3, model_4, model_5]
model_names = ['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5']

# Step 3: Format results for all models

def format_coef_pval(coef, pval):
    return f"{coef:.4f}{'*' if pval < 0.1 else ''}"  # Add asterisk for significance

formatted_results = pd.DataFrame()
for model_name, model in zip(model_names, models):
    model_data = {
        var: format_coef_pval(coef, model.pvalues.get(var, float('nan')))
        for var, coef in model.params.items()
    }
    model_df = pd.DataFrame.from_dict(model_data, orient='index', columns=[model_name])
    formatted_results = pd.concat([formatted_results, model_df], axis=1)

# Reorder variables to match the desired order for display
variable_order = [
    "Intercept", "Aristotle", "Shock", "Aristotle:Shock",
    "V_2", "V_diff_nonabs",
    "SD_1", "GPT_Usage", "Algorithmic_liking",
    "Knowledge_Depth_1", "Knowledge_Depth_2"
]
formatted_results = formatted_results.reindex(variable_order).reset_index()
formatted_results.columns = ['Variable'] + model_names

# Step 4: Perform Variance Inflation Factor (VIF) analysis for Model 5
vif_data = cleaned_data[['Aristotle', 'Shock', 'V_2', 'V_diff_nonabs',
                         'SD_1', 'Knowledge_Depth_1', 'Knowledge_Depth_2']]
vif_values = pd.DataFrame({
    "Variable": vif_data.columns,
    "VIF": [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
})

# Step 5: Add R-squared and Adjusted R-squared values to the formatted results
rsquared_values = [model.rsquared for model in models]
adjusted_rsquared_values = [model.rsquared_adj for model in models]

rsquared_data = pd.DataFrame({
    "Variable": ["R-squared", "Adjusted R-squared"],
    **{model_name: [f"{rsq:.4f}", f"{adj_rsq:.4f}"] for model_name, rsq, adj_rsq in zip(model_names, rsquared_values, adjusted_rsquared_values)}
})

formatted_results_with_rsquared = pd.concat([formatted_results, rsquared_data], ignore_index=True)

# Step 6: Save regression results and VIF analysis to a Word document
# Replace with the desired absolute path for saving the Word document
word_file_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/newnew'

doc = Document()
doc.add_heading('Regression Results and Analysis', level=1)

# Add regression results table to the document
doc.add_heading('Regression Results', level=2)
table = doc.add_table(rows=1, cols=formatted_results_with_rsquared.shape[1])
table.style = 'Table Grid'

# Add headers
header_cells = table.rows[0].cells
for i, column_name in enumerate(formatted_results_with_rsquared.columns):
    header_cells[i].text = column_name

# Add rows
for index, row in formatted_results_with_rsquared.iterrows():
    cells = table.add_row().cells
    for i, value in enumerate(row):
        cells[i].text = str(value)

# Add VIF analysis to the document
doc.add_heading('VIF Analysis for Model 5', level=2)
vif_table = doc.add_table(rows=1, cols=vif_values.shape[1])
vif_table.style = 'Table Grid'

# Add headers for VIF table
vif_header_cells = vif_table.rows[0].cells
for i, column_name in enumerate(vif_values.columns):
    vif_header_cells[i].text = column_name

# Add rows for VIF table
for index, row in vif_values.iterrows():
    vif_cells = vif_table.add_row().cells
    for i, value in enumerate(row):
        vif_cells[i].text = str(value)

# Save the document
doc.save(word_file_path)
print(f"Document saved at: {word_file_path}")


Document saved at: /Users/zey/Desktop/COMPUTER PROGRAMMING/newnew


In [18]:
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
from docx import Document
import os

# Step 1: Load the dataset
# Replace with the absolute path to your dataset
dataset_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/cleaned_data.csv'
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")
cleaned_data = pd.read_csv(dataset_path)

# Step 2: Define the regression models

# Model 1: Baseline model
model_1_formula = 'SD_diff ~ Aristotle + Shock + Aristotle:Shock'
model_1 = ols(model_1_formula, data=cleaned_data).fit()

# Model 2: Add V_2 and V_diff_nonabs
model_2_formula = model_1_formula + ' + V_2 + V_diff_nonabs'
model_2 = ols(model_2_formula, data=cleaned_data).fit()

# Model 3: Add SD_1
model_3_formula = model_2_formula + ' + SD_1'
model_3 = ols(model_3_formula, data=cleaned_data).fit()

# Model 4: Add GPT_Usage and Algorithmic_Liking along with SD_1
model_4_formula = model_3_formula + ' + GPT_Usage + Algorithmic_liking'
model_4 = ols(model_4_formula, data=cleaned_data).fit()

# Model 5: Add Knowledge_Depth_1 and Knowledge_Depth_2, exclude GPT_Usage and Algorithmic_Liking
model_5_formula = model_3_formula + ' + Knowledge_Depth_1 + Knowledge_Depth_2'
model_5 = ols(model_5_formula, data=cleaned_data).fit()

# Collect models into a list
models = [model_1, model_2, model_3, model_4, model_5]
model_names = ['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5']

# Step 3: Format results for all models

def format_coef_pval(coef, pval):
    return f"{coef:.4f}{'*' if pval < 0.1 else ''}"  # Add asterisk for significance

formatted_results = pd.DataFrame()
for model_name, model in zip(model_names, models):
    model_data = {
        var: format_coef_pval(coef, model.pvalues.get(var, float('nan')))
        for var, coef in model.params.items()
    }
    model_df = pd.DataFrame.from_dict(model_data, orient='index', columns=[model_name])
    formatted_results = pd.concat([formatted_results, model_df], axis=1)

# Reorder variables to match the desired order for display
variable_order = [
    "Intercept", "Aristotle", "Shock", "Aristotle:Shock",
    "V_2", "V_diff_nonabs",
    "SD_1", "GPT_Usage", "Algorithmic_liking",
    "Knowledge_Depth_1", "Knowledge_Depth_2"
]
formatted_results = formatted_results.reindex(variable_order).reset_index()
formatted_results.columns = ['Variable'] + model_names

# Step 4: Perform Variance Inflation Factor (VIF) analysis for Model 5
vif_data = cleaned_data[['Aristotle', 'Shock', 'V_2', 'V_diff_nonabs',
                         'SD_1', 'Knowledge_Depth_1', 'Knowledge_Depth_2']]
vif_values = pd.DataFrame({
    "Variable": vif_data.columns,
    "VIF": [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
})

# Step 5: Add R-squared and Adjusted R-squared values to the formatted results
rsquared_values = [model.rsquared for model in models]
adjusted_rsquared_values = [model.rsquared_adj for model in models]

rsquared_data = pd.DataFrame({
    "Variable": ["R-squared", "Adjusted R-squared"],
    **{model_name: [f"{rsq:.4f}", f"{adj_rsq:.4f}"] for model_name, rsq, adj_rsq in zip(model_names, rsquared_values, adjusted_rsquared_values)}
})

formatted_results_with_rsquared = pd.concat([formatted_results, rsquared_data], ignore_index=True)

# Step 6: Save regression results and VIF analysis to a Word document
# Replace with the desired absolute path for saving the Word document
word_file_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/newnew'

doc = Document()
doc.add_heading('Regression Results and Analysis', level=1)

# Add regression results table to the document
doc.add_heading('Regression Results', level=2)
table = doc.add_table(rows=1, cols=formatted_results_with_rsquared.shape[1])
table.style = 'Table Grid'

# Add headers
header_cells = table.rows[0].cells
for i, column_name in enumerate(formatted_results_with_rsquared.columns):
    header_cells[i].text = column_name

# Add rows
for index, row in formatted_results_with_rsquared.iterrows():
    cells = table.add_row().cells
    for i, value in enumerate(row):
        cells[i].text = str(value)

# Add VIF analysis to the document
doc.add_heading('VIF Analysis for Model 5', level=2)
vif_table = doc.add_table(rows=1, cols=vif_values.shape[1])
vif_table.style = 'Table Grid'

# Add headers for VIF table
vif_header_cells = vif_table.rows[0].cells
for i, column_name in enumerate(vif_values.columns):
    vif_header_cells[i].text = column_name

# Add rows for VIF table
for index, row in vif_values.iterrows():
    vif_cells = vif_table.add_row().cells
    for i, value in enumerate(row):
        vif_cells[i].text = str(value)

# Save the document
doc.save(word_file_path)
print(f"Document saved at: {word_file_path}")


Document saved at: /Users/zey/Desktop/COMPUTER PROGRAMMING/newnew


In [19]:
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import os

# Step 1: Load the dataset
# Replace with the absolute path to your dataset
dataset_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/cleaned_data.csv'
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")
cleaned_data = pd.read_csv(dataset_path)

# Step 2: Define the regression models

# Model 1: Baseline model
model_1_formula = 'SD_diff ~ Aristotle + Shock + Aristotle:Shock'
model_1 = ols(model_1_formula, data=cleaned_data).fit()

# Model 2: Add V_2 and V_diff_nonabs
model_2_formula = model_1_formula + ' + V_2 + V_diff_nonabs'
model_2 = ols(model_2_formula, data=cleaned_data).fit()

# Model 3: Add SD_1
model_3_formula = model_2_formula + ' + SD_1'
model_3 = ols(model_3_formula, data=cleaned_data).fit()

# Model 4: Add GPT_Usage and Algorithmic_Liking along with SD_1
model_4_formula = model_3_formula + ' + GPT_Usage + Algorithmic_liking'
model_4 = ols(model_4_formula, data=cleaned_data).fit()

# Model 5: Add Knowledge_Depth_1 and Knowledge_Depth_2, exclude GPT_Usage and Algorithmic_Liking
model_5_formula = model_3_formula + ' + Knowledge_Depth_1 + Knowledge_Depth_2'
model_5 = ols(model_5_formula, data=cleaned_data).fit()

# Collect models into a list
models = [model_1, model_2, model_3, model_4, model_5]
model_names = ['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5']

# Step 3: Print regression results
print("\n=== Regression Results ===")
for name, model in zip(model_names, models):
    print(f"\n{name}")
    print(model.summary())

# Step 4: Perform Variance Inflation Factor (VIF) analysis for Model 5
vif_data = cleaned_data[['Aristotle', 'Shock', 'V_2', 'V_diff_nonabs',
                         'SD_1', 'Knowledge_Depth_1', 'Knowledge_Depth_2']]
vif_values = pd.DataFrame({
    "Variable": vif_data.columns,
    "VIF": [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
})

# Step 5: Print VIF values
print("\n=== Variance Inflation Factor (VIF) ===")
print(vif_values)



=== Regression Results ===

Model 1
                            OLS Regression Results                            
Dep. Variable:                SD_diff   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                 -0.026
Method:                 Least Squares   F-statistic:                    0.4748
Date:                Thu, 28 Nov 2024   Prob (F-statistic):              0.701
Time:                        16:02:36   Log-Likelihood:                -78.495
No. Observations:                  63   AIC:                             165.0
Df Residuals:                      59   BIC:                             173.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Inter

In [20]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np

# Example: Assuming `data` is your DataFrame with independent variables
def calculate_vif_with_constant(data):
    """Calculate VIF, including a constant (intercept)."""
    # Add a constant column
    data_with_constant = data.copy()
    data_with_constant['Intercept'] = 1

    # Calculate VIF for all variables including the constant
    vif_data = pd.DataFrame()
    vif_data["Variable"] = data_with_constant.columns
    vif_data["VIF"] = [
        variance_inflation_factor(data_with_constant.values, i)
        for i in range(data_with_constant.shape[1])
    ]
    return vif_data

# Example Usage
# Assuming `cleaned_data` contains the independent variables for the regression
independent_vars = cleaned_data[['Aristotle', 'Shock', 'V_2', 'V_diff_nonabs', 
                                  'SD_1', 'Knowledge_Depth_1', 'Knowledge_Depth_2']]
vif_results_with_constant = calculate_vif_with_constant(independent_vars)

# Print the VIF results
print(vif_results_with_constant)

            Variable        VIF
0          Aristotle   1.093869
1              Shock   1.158148
2                V_2   1.550890
3      V_diff_nonabs   1.364199
4               SD_1   1.396106
5  Knowledge_Depth_1   1.407039
6  Knowledge_Depth_2   1.338436
7          Intercept  66.818094


In [22]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Function to calculate VIF
def calculate_vif(X):
    """
    Calculate Variance Inflation Factor (VIF) for a DataFrame of predictors, including the constant.
    """
    X = sm.add_constant(X)  # Add intercept (constant) for VIF calculation
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Step 1: Load the dataset
dataset_path = r'/Users/zey/Desktop/COMPUTER PROGRAMMING/cleaned_data.csv'
cleaned_data = pd.read_csv(dataset_path)

# Step 2: Define the regression models

# Model 1: Baseline model
model_1_formula = 'SD_diff ~ Aristotle + Shock + Aristotle:Shock'
model_1 = ols(model_1_formula, data=cleaned_data).fit()

# Model 2: Add V_2 and V_diff_nonabs
model_2_formula = model_1_formula + ' + V_2 + V_diff_nonabs'
model_2 = ols(model_2_formula, data=cleaned_data).fit()

# Model 3: Add SD_1
model_3_formula = model_2_formula + ' + SD_1'
model_3 = ols(model_3_formula, data=cleaned_data).fit()

# Model 4: Add GPT_Usage and Algorithmic_Liking along with SD_1
model_4_formula = model_3_formula + ' + GPT_Usage + Algorithmic_liking'
model_4 = ols(model_4_formula, data=cleaned_data).fit()

# Model 5: Add Knowledge_Depth_1 and Knowledge_Depth_2, exclude GPT_Usage and Algorithmic_Liking
model_5_formula = model_3_formula + ' + Knowledge_Depth_1 + Knowledge_Depth_2'
model_5 = ols(model_5_formula, data=cleaned_data).fit()

# Collect models into a list
models = [model_1, model_2, model_3, model_4, model_5]
model_names = ['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5']

# Step 3: Format results for all models
print("\n=== Regression Results ===")
for name, model in zip(model_names, models):
    print(f"\n{name}")
    print(model.summary())

# Step 4: Perform Variance Inflation Factor (VIF) analysis for Model 5
independent_vars = cleaned_data[['Aristotle', 'Shock', 'V_2', 'V_diff_nonabs',
                                  'SD_1', 'Knowledge_Depth_1', 'Knowledge_Depth_2']]
vif_results = calculate_vif(independent_vars)

# Print the VIF results
print("\n=== Variance Inflation Factor (VIF) with Intercept ===")
print(vif_results)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject