# Difference in Differences

- Studying the relationship between full-time-equivalent (FTE) employment in New Jersey (NJ) before and after April 92, given the increase in the state's minimum wage.
- The question is: did the increase in the minimum wage reduce FTE employment (i.e., increase unemployment) in NJ?

### Importing libraries

In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Scikit-learn library
from sklearn.impute import SimpleImputer

# Statsmodels
import statsmodels.api as sm

# Other libraries
import warnings

### Settings

In [2]:
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

### Functions

In [3]:
def df_null_values(dataframe, filter_nulls=True):
    """
    Summary: This function reports, for each column of a dataframe, how many Null values it has (total and percentage)
    Args: 
        dataframe: the origin of the data as a Dataframe 
        filter_nulls: if True, only columns with Null values will be displayed
    Returns: Dataframe indexed by column name with '#Null' and '%Null' (both formatted as strings)
             and 'Type' (the column dtype), sorted by number of Null values in descending order
    """
    # Calculating (the null count is computed once and reused for the percentage)
    null_count = dataframe.isnull().sum(axis=0)
    n_rows = len(dataframe)

    # A frame without rows has no Null values: report 0% instead of the NaN that 0/0 would give
    if n_rows > 0:
        null_percentage = (null_count / n_rows) * 100
    else:
        null_percentage = null_count * 0.0

    null_values = pd.DataFrame(
        {'#Null': null_count,
         '%Null': null_percentage,
         'Type': dataframe.dtypes}
    ).sort_values('#Null', ascending=False)

    # Filtering values
    if filter_nulls:
        null_values = null_values[null_values['#Null'] != 0]

    # Formatting: assign() builds a new frame, so nothing is written into the filtered slice
    return null_values.assign(**{
        '#Null': null_values['#Null'].map(lambda x: f'{x:,.0f}'),
        '%Null': null_values['%Null'].map(lambda x: f'{x:.2f}%'),
    })

def calculate_statistics(numerical_dataframe):
    """
    Summary: builds a table of descriptive statistics with one row per column of the input
    Args: 
        numerical_dataframe: the DataFrame whose columns are summarised
    Returns: a DataFrame with the columns mean, std, min, Q1, median, Q3, max, range, skewness and kurtosis
    """
    def _spread(column):
        # Distance between the largest and the smallest value of a single column
        return column.max() - column.min()

    frame = numerical_dataframe

    # Keys are inserted in the order in which the columns appear in the result
    summary = {}
    summary['mean'] = frame.mean()
    summary['std'] = frame.std()
    summary['min'] = frame.min()
    summary['Q1'] = frame.quantile(0.25)
    summary['median'] = frame.median()
    summary['Q3'] = frame.quantile(0.75)
    summary['max'] = frame.max()
    summary['range'] = frame.apply(_spread)
    summary['skewness'] = frame.skew()
    summary['kurtosis'] = frame.kurtosis()

    return pd.DataFrame(summary)

### Importing data

In [4]:
# Location of the raw CSV, relative to the directory the notebook is run from
raw_csv_path = '../data/raw/njmin3.csv'
dataset = pd.read_csv(raw_csv_path)

### Data description

In [5]:
# Preview the loaded data; pandas elides the middle rows of a long frame when rendering it
dataset

Unnamed: 0,NJ,POST_APRIL92,NJ_POST_APRIL92,fte,bk,kfc,roys,wendys,co_owned,centralj,southj,pa1,pa2,demp
0,1,0,0,15.00,1,0,0,0,0,1,0,0,0,12.00
1,1,0,0,15.00,1,0,0,0,0,1,0,0,0,6.50
2,1,0,0,24.00,0,0,1,0,0,1,0,0,0,-1.00
3,1,0,0,19.25,0,0,1,0,1,0,0,0,0,2.25
4,1,0,0,21.50,1,0,0,0,0,0,0,0,0,13.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,0,1,0,12.50,0,0,1,0,1,0,0,0,1,-2.50
816,0,1,0,34.00,0,0,1,0,0,0,0,0,1,16.00
817,0,1,0,10.00,0,0,1,0,1,0,0,0,1,-10.25
818,0,1,0,14.00,1,0,0,0,0,0,0,1,0,-1.50


In [6]:
# Column dtypes and non-null counts, used to spot the columns that have missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NJ               820 non-null    int64  
 1   POST_APRIL92     820 non-null    int64  
 2   NJ_POST_APRIL92  820 non-null    int64  
 3   fte              794 non-null    float64
 4   bk               820 non-null    int64  
 5   kfc              820 non-null    int64  
 6   roys             820 non-null    int64  
 7   wendys           820 non-null    int64  
 8   co_owned         820 non-null    int64  
 9   centralj         820 non-null    int64  
 10  southj           820 non-null    int64  
 11  pa1              820 non-null    int64  
 12  pa2              820 non-null    int64  
 13  demp             768 non-null    float64
dtypes: float64(2), int64(12)
memory usage: 89.8 KB


In [7]:
# Descriptive statistics for every column, using the helper from the Functions section
calculate_statistics(dataset)

Unnamed: 0,mean,std,min,Q1,median,Q3,max,range,skewness,kurtosis
NJ,0.807317,0.394647,0.0,1.0,1.0,1.0,1.0,1.0,-1.561236,0.438522
POST_APRIL92,0.5,0.500305,0.0,0.0,0.5,1.0,1.0,1.0,0.0,-2.004896
NJ_POST_APRIL92,0.403659,0.49093,0.0,0.0,0.0,1.0,1.0,1.0,0.393445,-1.849718
fte,21.026511,9.422746,0.0,14.5,20.0,25.5,85.0,85.0,1.301996,4.341915
bk,0.417073,0.493376,0.0,0.0,0.0,1.0,1.0,1.0,0.336983,-1.891061
kfc,0.195122,0.396536,0.0,0.0,0.0,0.0,1.0,1.0,1.541465,0.377027
roys,0.241463,0.428232,0.0,0.0,0.0,0.0,1.0,1.0,1.210413,-0.536214
wendys,0.146341,0.353664,0.0,0.0,0.0,0.0,1.0,1.0,2.004859,2.024393
co_owned,0.343902,0.475299,0.0,0.0,0.0,1.0,1.0,1.0,0.658446,-1.570285
centralj,0.153659,0.360841,0.0,0.0,0.0,0.0,1.0,1.0,1.924327,1.707193


In [8]:
# Show only the columns that contain Null values (filter_nulls defaults to True)
df_null_values(dataset)

Unnamed: 0,#Null,%Null,Type
demp,52,6.34%,float64
fte,26,3.17%,float64


### Filling missing values

In [9]:
# Mean imputer: NaN entries are replaced by the mean of the observed values of the column
missing_values = SimpleImputer(missing_values=np.nan, strategy='mean')

# NOTE(review): fte is used as the regression outcome further down; mean-imputing a
# dependent variable may distort the estimates - confirm this is intended.
# The imputer is re-fitted for each column separately (demp first, then fte)
for column_name in ('demp', 'fte'):
    dataset[column_name] = missing_values.fit_transform(dataset[[column_name]])

## FTE Analysis

> Analysis of Full-Time Equivalent (FTE) employment in New Jersey (NJ)

### Isolating X and Y variables

In [10]:
# Select by column name rather than by position, so the split does not depend on column order
X = dataset.loc[:, ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92']].values  # regressors
Y = dataset.loc[:, 'fte'].values                                       # outcome: fte

### Creating first model

In [11]:
# Prepend a column of ones so the regression estimates an intercept
# (an already-present constant column is left alone, so re-running the cell is harmless)
X = sm.add_constant(X, prepend=True, has_constant='skip')

# Ordinary least squares of Y (endogenous) on X (exogenous)
ols_spec = sm.OLS(endog=Y, exog=X)
model = ols_spec.fit()

In [12]:
# Readable labels for the regression table: the constant first, then the three regressors
coefficient_labels = ['intercept', 'New Jersey', 'After April 92', 'NJ After April 92']

print(model.summary(yname='fte', xname=coefficient_labels))

                            OLS Regression Results                            
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.974
Date:                Tue, 02 Jul 2024   Prob (F-statistic):              0.116
Time:                        21:33:31   Log-Likelihood:                -2986.2
No. Observations:                 820   AIC:                             5980.
Df Residuals:                     816   BIC:                             5999.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
intercept            23.2728      1.04

Since the `p-value` is greater than 0.05 for both `After April 92` and `NJ After April 92`, neither coefficient is statistically significant at the 5% level in this first model. We therefore need to take further action, such as adding other explanatory variables.

In [13]:
# Second model: design matrix

# Same three regressors as the first model, extended with the bk, kfc and wendys columns
second_model_columns = ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92', 'bk', 'kfc', 'wendys']
X = dataset[second_model_columns].values

In [14]:
# Second model: fit and report

# Prepend the constant column that the 'intercept' label below refers to
X = sm.add_constant(X)

model2 = sm.OLS(Y, X).fit()

# Labels follow the column order of X: the constant first, then the six regressors
print(model2.summary(
    yname = 'fte', 
    xname = ['intercept', 'New Jersey', 'After April 92', 'NJ After April 92', 
             'Burger King', 'KFC', 'Wendys']))

                            OLS Regression Results                            
Dep. Variable:                    fte   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.185
Method:                 Least Squares   F-statistic:                     31.95
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           1.30e-34
Time:                        21:33:31   Log-Likelihood:                -2902.4
No. Observations:                 820   AIC:                             5819.
Df Residuals:                     813   BIC:                             5852.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
intercept            23.4055      1.08

In [15]:
# Third model: second-model regressors plus the co_owned, centralj and southj columns

X = dataset.loc[:, 
        ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92', 'bk', 'kfc', 'wendys',
         'co_owned', 'centralj', 'southj']].values

# Prepend the constant column that the 'intercept' label below refers to
X = sm.add_constant(X)

model3 = sm.OLS(Y, X).fit()

# Labels follow the column order of X: the constant first, then the nine regressors
print(model3.summary(
    yname = 'fte', 
    xname = ['intercept', 'New Jersey', 'After April 92', 'NJ After April 92', 
             'Burger King', 'KFC', 'Wendys', 'Co-owned', 'Central Jersey', 'South Jersey']))

                            OLS Regression Results                            
Dep. Variable:                    fte   R-squared:                       0.217
Model:                            OLS   Adj. R-squared:                  0.208
Method:                 Least Squares   F-statistic:                     24.89
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           6.45e-38
Time:                        21:33:31   Log-Likelihood:                -2889.1
No. Observations:                 820   AIC:                             5798.
Df Residuals:                     810   BIC:                             5845.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
intercept            23.9321      1.18