In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import shapiro, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three input CSVs (West Bengal, Uttar Pradesh, Bihar) in one pass;
# the paths are relative to the current working directory.
wb_df, up_df, bihar_df = map(
    pd.read_csv,
    (
        'finalcompileddatawb.csv',
        'finalcompileddataup.csv',
        'finalcompileddatabihar.csv',
    ),
)

# Standardize column names: trim surrounding whitespace, then remove every
# space and parenthesis. A single character class with an explicit
# ``regex=True`` is used so the result does not depend on the pandas version's
# default for ``regex`` (a bare '(' is not a valid regular expression).
for df in [wb_df, up_df, bihar_df]:
    df.columns = df.columns.str.strip().str.replace(r'[ ()]', '', regex=True)

# Clean Year and Yield columns
def preprocess(df, state):
    """Return a cleaned copy of *df* with numeric ``Year`` and ``Yield`` columns.

    ``Year`` is replaced by the first run of four digits found in its string
    form (as a float); ``Yield`` is ``YieldofWheatkg/Hectare`` coerced to a
    number, with unparseable entries becoming NaN. Rows where either value is
    missing are dropped. The input frame is left untouched.

    Args:
        df: Frame containing ``Year`` and ``YieldofWheatkg/Hectare`` columns.
        state: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame holding only the rows with a valid Year and Yield.

    Raises:
        KeyError: If either required column is absent.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    cleaned = df.copy()
    # expand=False yields a Series (not a one-column DataFrame) for assignment.
    year_text = cleaned['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
    cleaned['Year'] = year_text.astype(float)
    cleaned['Yield'] = pd.to_numeric(cleaned['YieldofWheatkg/Hectare'], errors='coerce')
    return cleaned.dropna(subset=['Year', 'Yield'])

# Clean each state's frame and rebind the module-level names to the results.
wb_df, up_df, bihar_df = (
    preprocess(frame, label)
    for frame, label in (
        (wb_df, 'West Bengal'),
        (up_df, 'Uttar Pradesh'),
        (bihar_df, 'Bihar'),
    )
)

# OLS Diagnostic Function
def ols_diagnostics(df, state):
    """Fit ``Yield ~ const + Year`` by OLS and report assumption diagnostics.

    Prints the Pearson correlation, the Durbin-Watson statistic, the
    Breusch-Pagan test results and the Shapiro-Wilk test for the residuals,
    and saves three PNG figures in the current working directory
    (``<state>_linearity.png``, ``<state>_residuals_vs_fitted.png`` and
    ``<state>_qqplot.png``, where ``<state>`` is *state* lower-cased with
    spaces replaced by underscores).

    Args:
        df: Frame with numeric ``Year`` and ``Yield`` columns.
        state: Label used in the printed header, plot titles and file names.

    Returns:
        The fitted statsmodels OLS results object.
    """
    print(f"\n=== OLS Assumptions for {state} ===")

    # Durbin-Watson compares each residual with the previous one, so the rows
    # must be in time order. Sort a local view (stable sort keeps ties in
    # their original order); the caller's frame is not modified.
    df = df.sort_values('Year', kind='mergesort')

    # Shared prefix for all output files of this state.
    file_prefix = state.lower().replace(' ', '_')

    year = df['Year']
    X = sm.add_constant(year)
    y = df['Yield']
    model = sm.OLS(y, X).fit()
    residuals = model.resid
    fitted = model.fittedvalues

    # 1. Linearity: observed points with the fitted line overlaid
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=year, y=y, alpha=0.7, color='blue')
    plt.plot(year, fitted, color='red')
    plt.title(f"{state}: Linearity Check")
    plt.xlabel("Year")
    plt.ylabel("Yield")
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_linearity.png")
    plt.close()

    corr, pval = pearsonr(year, y)
    print(f"Linearity - Pearson Correlation: {corr:.3f}, p-value: {pval:.3f}")

    # 2. Independence - Durbin-Watson (on time-ordered residuals)
    dw_stat = durbin_watson(residuals)
    print(f"Independence - Durbin-Watson statistic: {dw_stat:.3f}")

    # 3. Homoscedasticity - Breusch-Pagan
    # The result unpacks as (LM stat, LM p-value, F stat, F p-value).
    lm_stat, lm_p, f_stat, f_p = het_breuschpagan(residuals, X)
    print("Homoscedasticity - Breusch-Pagan Test:")
    print(f"  LM stat = {lm_stat:.3f}, LM p = {lm_p:.3f}")
    print(f"  F stat = {f_stat:.3f}, F p = {f_p:.3f}")

    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=fitted, y=residuals, alpha=0.7)
    plt.axhline(0, color='red', linestyle='--')
    plt.title(f"{state}: Residuals vs. Fitted")
    plt.xlabel("Fitted Values")
    plt.ylabel("Residuals")
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_residuals_vs_fitted.png")
    plt.close()

    # 4. Normality - Shapiro-Wilk + Q-Q
    shapiro_stat, shapiro_p = shapiro(residuals)
    print(f"Normality - Shapiro-Wilk: stat={shapiro_stat:.3f}, p-value={shapiro_p:.3f}")

    # qqplot opens its own figure, which becomes the current one for the
    # title/save/close calls below.
    sm.qqplot(residuals, line='45', fit=True)
    plt.title(f"{state}: Q-Q Plot")
    plt.tight_layout()
    plt.savefig(f"{file_prefix}_qqplot.png")
    plt.close()

    return model

# Run the diagnostics once per state, in the order West Bengal, Uttar Pradesh,
# Bihar, keeping each fitted model under its own module-level name.
wb_model, up_model, bihar_model = (
    ols_diagnostics(frame, label)
    for frame, label in (
        (wb_df, "West Bengal"),
        (up_df, "Uttar Pradesh"),
        (bihar_df, "Bihar"),
    )
)



=== OLS Assumptions for West Bengal ===
Linearity - Pearson Correlation: 0.879, p-value: 0.000
Independence - Durbin-Watson statistic: 1.435
Homoscedasticity - Breusch-Pagan Test:
  LM stat = 2.026, LM p = 0.155
  F stat = 2.029, F p = 0.172
Normality - Shapiro-Wilk: stat=0.877, p-value=0.019

=== OLS Assumptions for Uttar Pradesh ===
Linearity - Pearson Correlation: 0.693, p-value: 0.001
Independence - Durbin-Watson statistic: 1.436
Homoscedasticity - Breusch-Pagan Test:
  LM stat = 0.000, LM p = 0.993
  F stat = 0.000, F p = 0.993
Normality - Shapiro-Wilk: stat=0.667, p-value=0.000

=== OLS Assumptions for Bihar ===
Linearity - Pearson Correlation: 0.874, p-value: 0.000
Independence - Durbin-Watson statistic: 2.353
Homoscedasticity - Breusch-Pagan Test:
  LM stat = 0.047, LM p = 0.828
  F stat = 0.042, F p = 0.840
Normality - Shapiro-Wilk: stat=0.964, p-value=0.644
