In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.panel import PanelOLS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# -------------------------------
# Step 1: Load & Prepare Data
# -------------------------------
df = pd.read_csv('glassdoor_reviews.csv', encoding='utf-8', on_bad_lines='skip'))
df['date_review'] = pd.to_datetime(df['date_review'])
df['year'] = df['date_review'].dt.year

# Clean country field
df['location'] = df['location'].astype(str)
df['Country'] = df['location'].apply(lambda x: x.split(',')[-1].strip() if ',' in x else 'Unknown')
df = df[df['Country'] != 'Unknown']  # Remove unknowns

# Define treatment: EU countries
eu_countries = ['England', 'Germany', 'France', 'Netherlands', 'Italy', 'Spain', 'Belgium', 'Austria',
                'Sweden', 'Denmark', 'Ireland', 'Finland', 'Portugal', 'Greece', 'Poland', 'Czech Republic']
df['treatment'] = df['Country'].apply(lambda x: 1 if x in eu_countries else 0)

# GDPR post indicator
df['post_gdpr'] = (df['date_review'] >= '2018-05-25').astype(int)
df['treatment_post'] = df['treatment'] * df['post_gdpr']

# -------------------------------
# Step 2: Parallel Trends Plot
# -------------------------------
parallel_trends = df.groupby(['year', 'treatment'])['overall_rating'].mean().reset_index()
pivot = parallel_trends.pivot(index='year', columns='treatment', values='overall_rating')
pivot.plot(marker='o', title='Parallel Trends in Employee Sentiment (Pre/Post GDPR)')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.show()

# -------------------------------
# Step 3: DiD Regression (OLS)
# -------------------------------
ols_model = ols('overall_rating ~ treatment + post_gdpr + treatment_post + C(Country)',
data=df).fit(cov_type='cluster', cov_kwds={'groups': df['firm']})
print(ols_model.summary())

# -------------------------------
# Step 4: Event Study Pre-Trends
# -------------------------------
# Create relative time dummies
df['relative_year'] = df['year'] - 2018
event_df = df[(df['relative_year'] >= -3) & (df['relative_year'] <= 3)]

for y in range(-3, 4):
    if y != 0:  # Skip base year
        event_df[f'year_{y}'] = ((event_df['relative_year'] == y) & (event_df['treatment'] == 1)).astype(int)

# Build regression formula
event_terms = ' + '.join([f'year_{y}' for y in range(-3, 4) if y != 0])
event_model = ols(
    f'overall_rating ~ {event_terms} + C(Country) + C(year)',
    data=event_df
).fit(cov_type='cluster', cov_kwds={'groups': event_df['firm']})
print(event_model.summary())

# -------------------------------
# Step 5: Two-Way Fixed Effects
# -------------------------------
df = df.set_index(['firm', 'date_review'])
df['year'] = df.index.get_level_values('date_review').year

fixed_effects_model = PanelOLS.from_formula(
    'overall_rating ~ treatment_post + EntityEffects + TimeEffects',
    data=df
)
fe_results = fixed_effects_model.fit()
print(fe_results.summary)




SyntaxError: unmatched ')' (2130190606.py, line 13)

In [None]:
# -------------------------------
# Step 6: Logistic Regression - Predict Compliance Failures
# -------------------------------
# Assuming compliance failure variable is binary: 1 if failure, 0 if not
# And assuming 'overall_rating' is a proxy for sentiment

# Lag sentiment by one period (assume 'year' column is available)
df_reset = df.reset_index()
df_reset = df_reset.sort_values(['firm', 'date_review'])
df_reset['lag_rating'] = df_reset.groupby('firm')['overall_rating'].shift(1)

# Drop NA from lag
logit_df = df_reset.dropna(subset=['lag_rating', 'compliance_failure'])

# Features and outcome
X = logit_df[['lag_rating']]
y = logit_df['compliance_failure']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit logistic regression
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)

# Predictions
y_pred = logit_model.predict(X_test)
print(classification_report(y_test, y_pred))