In [None]:
!pip install linearmodels



In [None]:
##Import necessary packages
import numpy as np  # Useful for math calculations
import pandas as pd  # Useful for data manipulation
import statsmodels.api as sm  # Useful for regression analysis
import statsmodels.formula.api as smf  # Required for regression using formulas
import matplotlib.pyplot as plt  # Useful for plotting
import linearmodels.panel as plm  # Required for panel data models

# --- Load the NLS panel and preview the first rows ---
nls_df = pd.read_csv("/content/nls_panel.csv")
print(nls_df.head())

# --- Regressors ---
# Quadratic experience term; it appears in every wage equation in this cell.
nls_df = nls_df.assign(exper2=nls_df["exper"] ** 2)

# --- (a) Pooled OLS ---
# Plain OLS on the stacked rows; the id/year panel structure is not used here.
pooled_ols_model = smf.ols(
    formula="lwage ~ exper + exper2 + south + union",
    data=nls_df,
).fit()
print("Pooled OLS Results:\n", pooled_ols_model.summary())

#(b) Fixed Effects via the within (mean-differencing) transformation
nls_df = nls_df.copy()

within_vars = ["lwage", "exper", "exper2", "south", "union"]
by_person = nls_df.groupby("id")

for name in within_vars:
    # Person-level average, broadcast back onto that person's rows ...
    person_mean = by_person[name].transform("mean")
    nls_df[name + "_mean"] = person_mean
    # ... and the deviation of each observation from that average.
    nls_df[name + "_fe"] = nls_df[name] - nls_df[name + "_mean"]

# "- 1" drops the intercept: every demeaned variable already averages zero
# within each person.
# NOTE(review): OLS on demeaned data does not subtract the absorbed person
# means from the residual degrees of freedom, so the reported standard errors
# are presumably smaller than those of a dedicated fixed-effects estimator.
fixed_effects_model = smf.ols(
    formula="lwage_fe ~ exper_fe + exper2_fe + south_fe + union_fe - 1",
    data=nls_df,
).fit()
print("\nFixed Effects Model Results:\n", fixed_effects_model.summary())

#(c) First Differencing for years 1987-1988
# The `year` column is stored with two digits (the printed head shows 82, 83,
# 85, 87, 88), so filtering on 1987/1988 would select no rows at all.
# Comparing `year % 100` matches both the two-digit and four-digit encodings.
fd_years = [87, 88]
nls_87_88 = nls_df[(nls_df["year"] % 100).isin(fd_years)].copy()
nls_87_88 = nls_87_88.sort_values(by=["id", "year"])

#Apply first differencing within each person (later year minus earlier year)
fd_vars = ["lwage", "exper", "exper2", "south", "union"]
for var in fd_vars:
    nls_87_88[f"{var}_fd"] = nls_87_88.groupby("id")[var].diff()
fd_cols = [f"{var}_fd" for var in fd_vars]

#Check the number of non-NaN observations
print("Before dropping NaNs:", len(nls_87_88))
# Each person's first row has no predecessor, so its differences are NaN.
# Restrict the check to the differenced columns so that missing values in
# unrelated columns do not discard otherwise usable observations.
nls_87_88 = nls_87_88.dropna(subset=fd_cols)
print("After dropping NaNs:", len(nls_87_88))

# Ensure there are enough observations before running regression
if not nls_87_88.empty:
    # No intercept ("- 1"), matching the original specification.
    fd_model = smf.ols("lwage_fd ~ exper_fd + exper2_fd + south_fd + union_fd - 1", data=nls_87_88).fit()
    print("\nFirst Differencing Model Results:\n", fd_model.summary())
else:
    print("\nError: No valid observations left after first differencing. Check data for missing values.")
# (d) Random Effects Estimation
# `plm` (linearmodels.panel) is already imported at the top of this cell.

# linearmodels reads the panel structure from a two-level (entity, time) index.
nls_df = nls_df.set_index(["id", "year"])

# Unlike statsmodels, linearmodels formulas do not add an intercept implicitly;
# "1 +" requests it so this model is comparable with the pooled OLS above.
re_model = plm.RandomEffects.from_formula(
    "lwage ~ 1 + exper + exper2 + south + union", data=nls_df
).fit()
# `summary` is a property on linearmodels results, so it is not called.
print("\nRandom Effects Model Results:\n", re_model.summary)




   id  year     lwage  hours  age  educ  collgrad  msp  nev_mar  not_smsa  \
0   1    82  1.808289     38   30    12         0    1        0         0   
1   1    83  1.863417     38   31    12         0    1        0         0   
2   1    85  1.789367     38   33    12         0    0        0         0   
3   1    87  1.846530     40   35    12         0    0        0         0   
4   1    88  1.856449     40   37    12         0    0        0         0   

   c_city  south  black  union      exper     exper2    tenure    tenure2  
0       1      0      1      1   7.666667   58.77777  7.666667  58.777770  
1       1      0      1      1   8.583333   73.67361  8.583333  73.673610  
2       1      0      1      1  10.179490  103.62200  1.833333   3.361111  
3       1      0      1      1  12.179490  148.33990  3.750000  14.062500  
4       1      0      1      1  13.621790  185.55330  5.250000  27.562500  
Pooled OLS Results:
                             OLS Regression Results          

Question 1

c) Pooled OLS and Fixed Effects differ in how they treat unobserved individual effects: Pooled OLS assumes that the unobserved individual effects are not correlated with the regressors, whereas the Fixed Effects model allows the individual effects to be correlated with the regressors.

d) Random Effects estimators should be used when unobserved effects are not correlated with regressors. Fixed Effects estimators should be used when unobserved effects are correlated with regressors. The Hausman test can be used to determine which estimator should be used.

In [None]:
## Import necessary packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load Data
smoking_df = pd.read_excel("/content/Smoking.xlsx")
print(smoking_df.head())

# (a) Linear Probability Model (OLS)
# With a 0/1 outcome the error variance is p(1 - p), which changes with the
# regressors, so the default homoskedastic OLS standard errors are not valid.
# HC1 gives heteroskedasticity-robust standard errors; the coefficient
# estimates themselves are unchanged.
lpm_model = smf.ols(
    "smoker ~ smkban + female + age + hsgrad + colgrad", data=smoking_df
).fit(cov_type="HC1")
print("\nLinear Probability Model (OLS) Results:\n", lpm_model.summary())

# (b) Probit Model
# Same regressors as the linear probability model, fitted by maximum likelihood.
probit_model = smf.probit(
    formula="smoker ~ smkban + female + age + hsgrad + colgrad",
    data=smoking_df,
).fit()
print("\nProbit Model Results:\n", probit_model.summary())

# (c) Logit Model
# Identical specification with a logistic link instead of the normal one.
logit_model = smf.logit(
    formula="smoker ~ smkban + female + age + hsgrad + colgrad",
    data=smoking_df,
).fit()
print("\nLogit Model Results:\n", logit_model.summary())



  warn(msg)


   smoker  smkban  age  hsdrop  hsgrad  colsome  colgrad  black  hispanic  \
0       1       1   41       0       1        0        0      0         0   
1       1       1   44       0       0        1        0      0         0   
2       0       0   19       0       0        1        0      0         0   
3       1       0   29       0       1        0        0      0         0   
4       0       1   28       0       0        1        0      0         0   

   female  
0       1  
1       1  
2       1  
3       1  
4       1  

Linear Probability Model (OLS) Results:
                             OLS Regression Results                            
Dep. Variable:                 smoker   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     60.23
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           5.03e-62
Time:                       

Question 2

a)

* B_1: 0.3382 is the predicted probability of smoking when all x variables are 0
* B_2: a smoking ban reduces the probability of smoking by 6.05 percentage points
* B_3: women are 2.8 percentage points less likely to smoke
* B_4: each additional year of age reduces the probability of smoking by 0.13 percentage points
* B_5: High school graduates are 7.65 percentage points more likely to smoke than non-graduates
* B_6: College graduates are 9.75 percentage points less likely to smoke than non-graduates



b)

* B_1: baseline effect on the latent smoking tendency
* B_2: negative coefficient meaning a smoking ban reduces the likelihood of smoking
* B_3: negative coefficient meaning women are less likely to smoke
* B_4: negative coefficient meaning older people are less likely to smoke
* B_5: positive coefficient meaning hs grads are more likely to smoke
* B_6: negative coefficient meaning college grads are less likely to smoke


c)

* B_1: Baseline log-odds of smoking
* B_2: negative coefficient meaning bans reduce likelihood of smoking
* B_3: negative coefficient meaning women are less likely to smoke
* B_4: negative coefficient meaning older people are less likely to smoke
* B_5: positive coefficient meaning hs grads are more likely to smoke
* B_6: negative coefficient meaning college grads are less likely to smoke
