# What's being done here: step-by-step checks of the linear regression (OLS) assumptions
1. Set the Working Directory: Set and verify the working directory.
2. Load the Dataset: Load the dataset and display the first few rows.
3. Define Variables: Define the target variable, control variables, and independent variables.
4. Check for Linear Relationship: Use scatter plots and a correlation matrix.
5. Check for Multicollinearity: Calculate Variance Inflation Factor (VIF) for each feature, including control variables.
6. Check for Homoscedasticity: Plot residuals vs. fitted values.
7. Check for Normality of Residuals: Use histogram, Q-Q plot, and Shapiro-Wilk test.
8. Check for Autocorrelation: Use the Durbin-Watson test.
9. Check for Independence of Errors: Note the importance of random sampling.
10. Model Summary: Print the regression model summary.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats

# Working directory
# Relative paths opened after this point resolve against this folder.
working_directory = 'your_directory_path'  # Replace with your desired directory path
os.chdir(working_directory)

# Echo the directory the process is now in, so a wrong path is spotted immediately
active_directory = os.getcwd()
print("Current Working Directory:", active_directory)

In [None]:
# Load the dataset
# `df` is read by the steps below, so it must actually be assigned here.
dataset_path = 'your_dataset.csv'  # Replace with the path to your .csv, .xlsx or .xls file

# Pick the pandas reader from the file extension: Excel workbooks go through
# read_excel, anything else is read as CSV.
if dataset_path.lower().endswith(('.xlsx', '.xls')):
    df = pd.read_excel(dataset_path)
else:
    df = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

In [None]:
# Variable roles for the regression
target_variable = 'your_target_variable'  # Replace with your target variable
control_variables = ['control_var1', 'control_var2']  # Replace with your control variables

# Every column of df that is neither the target nor a control variable is
# treated as an independent variable.
reserved_columns = [target_variable, *control_variables]
independent_variables = [name for name in df.columns if name not in reserved_columns]

In [None]:
# Checking for Linear Relationship
# One scatter plot per independent variable, each plotted against the target
for feature in independent_variables:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=df[feature], y=df[target_variable], ax=ax)
    ax.set_title(f'{feature} vs {target_variable}')
    plt.show()

In [None]:
# Correlation matrix
# Correlations are only defined for numeric data. On pandas >= 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises ValueError when it
# meets one (e.g. a string column), so the frame is narrowed to numeric/boolean
# columns first. select_dtypes is used instead of corr(numeric_only=True) so the
# call also works on pandas versions that predate that keyword.
plt.figure(figsize=(12, 8))
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Checking for Multicollinearity
# VIF for every regressor (independent + control variables) plus the added constant column
X = sm.add_constant(df[independent_variables + control_variables])  # constant = intercept
design_values = X.values
vif_data = pd.DataFrame({
    'feature': X.columns,
    'VIF': [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

In [None]:
# Checking for Homoscedasticity
# Fit an OLS model of the target on all regressors plus an intercept; the fitted
# model and its residuals are reused by the later diagnostic steps.
X = sm.add_constant(df[independent_variables + control_variables])
y = df[target_variable]
model = sm.OLS(y, X).fit()
residuals = model.resid
fitted = model.fittedvalues

# Residuals against fitted values, with a dashed zero line as visual reference
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=fitted, y=residuals, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_title('Residuals vs Fitted Values')
ax.set_xlabel('Fitted Values')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
# Checking for Normality of Residuals
# Histogram of the residuals with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
plt.show()

# Q-Q plot of the residuals; qqplot opens its own figure, so the title is set
# on whatever axes are current afterwards.
sm.qqplot(residuals, line='s')
plt.gca().set_title('Q-Q Plot of Residuals')
plt.show()

# Shapiro-Wilk test for normality: the result unpacks to (W statistic, p-value)
shapiro_test = stats.shapiro(residuals)
w_statistic, p_value = shapiro_test
print(f"\nShapiro-Wilk test: W={w_statistic}, p-value={p_value}")

In [None]:
# Checking for Autocorrelation
# Durbin-Watson statistic computed from the OLS residuals
durbin_watson = sm.stats.durbin_watson(residuals)
dw_report = f"\nDurbin-Watson statistic: {durbin_watson}"
print(dw_report)

In [None]:
# Checking for Independence of Errors
# Independence is often domain-specific and harder to test statistically: it depends mainly on how
# the data were collected. Ensuring random sampling is good practice.

In [None]:
# Summary of the model
# Print a header, then the summary table of the fitted OLS model
print("\nRegression Model Summary:")
summary_table = model.summary()
print(summary_table)