<a href="https://colab.research.google.com/github/aronyo24/Research-paper/blob/main/Impact_of_Digitalization_in_land_record_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import ttest_ind

In [None]:
# Load the survey export. The path points at a mounted Google Drive, so this cell
# expects to run in Colab with the drive already mounted.
df = pd.read_csv("/content/drive/MyDrive/Impact_of_Digitalization_lr.csv")

# Strip stray whitespace from the header names right after loading. A later cell
# renames 'cash_transaction ' (trailing space) to 'cash_transaction', but the
# regression formulas reference 'cash_transaction' before that cell runs, so a
# fresh top-to-bottom run needs the clean names from the start.
df.columns = df.columns.str.strip()

# Preview the first rows only instead of dumping the whole frame.
df.head()

In [None]:
# Step 1: Internet Access Summary
# Count respondents per internet_access code, then swap the 0/1 codes for labels.
internet_access_summary = (
    df['internet_access']
    .value_counts()
    .rename(index={0: "No Internet", 1: "Has Internet"})
)

# Step 2: Age Grouping
def age_group(age):
    """Return the age-bracket label for a single age value.

    Ages below 30 map to 'Young (<30)', ages from 30 through 50 (inclusive)
    map to 'Middle-aged (30-50)', and everything else falls through to
    'Older (>50)'. A value that fails both comparisons (e.g. NaN) also
    falls through to 'Older (>50)'.
    """
    if age < 30:
        return 'Young (<30)'
    # Reaching this point means the value is not below 30, so only the upper
    # bound still needs checking.
    if age <= 50:
        return 'Middle-aged (30-50)'
    return 'Older (>50)'

# Label every respondent with an age bracket and count each bracket.
df['age_group'] = df['Age'].apply(age_group)
age_group_summary = df['age_group'].value_counts()

# Step 3: Build full summary dictionary
# Report label -> source column, for the entries that are plain value counts.
counted_columns = {
    "Gender (0=Female, 1=Male)": 'Gender',
    "Location (Urban/Rural)": 'location',
    "Education Level (0=Uneducated, 1=Secondary, 2=Higher)": 'education',
    "Phone Access (0=No, 1=Yes)": 'phone_type',
    "Laptop Access (0=No, 1=Yes)": 'laptop_access',
    "Satisfaction (0=No, 1=Yes)": 'citizen_satisfaction',
}

full_summary = {
    "Total Respondents": len(df),
    "Internet Access": internet_access_summary.to_dict(),
    "Age Groups": age_group_summary.to_dict(),
}
# Appended in declaration order, so the report keeps a stable row order.
for label, column in counted_columns.items():
    full_summary[label] = df[column].value_counts().to_dict()

# Step 4: Convert summary to DataFrame for display or export
# One row per category; the Value cell holds either a count or a dict of counts.
summary_df = pd.DataFrame(list(full_summary.items()), columns=['Category', 'Value'])

In [None]:
# Render the summary table (one row per category) as this cell's output.
summary_df

In [None]:
# List every column name in the loaded frame.
list(df.columns)

In [None]:
# df.info() prints the schema (dtypes, non-null counts) and returns None, so it
# is called as its own statement rather than packed into a tuple.
df.info()

# Keep the preview as the last expression so the notebook renders it as a table
# instead of the plain-text repr of a (None, DataFrame) tuple.
df.head()

In [None]:
# Treatment indicator: a straight copy of the service_type column.
df['Treatment'] = df['service_type']

# Post indicator: 1 when service_days is at most 39, otherwise 0.
# NOTE(review): the "post-digitalization" period is derived from the outcome
# variable (service_days) rather than from a date column — confirm this is intended.
df['Post'] = df['service_days'].le(39).astype(int)

# DID Interaction Term: 1 only for rows that are both treated and post-period.
df['Treatment_Post'] = df['Treatment'].mul(df['Post'])





In [None]:

# Covariates entered alongside the difference-in-differences terms.
control_vars = [
    'Age', 'Gender', 'education', 'phone_type', 'laptop_access', 'internet_access',
    "app_submission_times", 'app_cost', "e_mutation_cost", 'gov_office_visits',
    'cash_transaction', 'awareness_circulars', 'awareness_complaints',
    'citizen_satisfaction',
]

# Join the covariates into the right-hand side of a patsy formula.
controls_formula = ' + '.join(control_vars)
did_terms = "Treatment + Post + Treatment_Post"
formula = f"service_days ~ {did_terms} + {controls_formula}"

# NOTE(review): Post is built from service_days in an earlier cell, so it is
# mechanically related to this model's dependent variable — confirm the design.
model = smf.ols(formula=formula, data=df).fit()

# Print the full regression table.
print(model.summary())

In [None]:
# Gather point estimates and confidence bounds from the fitted model.
coefs = model.params
conf_int = model.conf_int()
conf_int.columns = ['Lower Bound', 'Upper Bound']

# One row per regressor (Intercept excluded), largest coefficient first.
coef_df = (
    pd.DataFrame({
        'Coefficient': coefs,
        'Lower Bound': conf_int['Lower Bound'],
        'Upper Bound': conf_int['Upper Bound'],
    })
    .drop('Intercept')
    .sort_values(by="Coefficient", ascending=False)
)

# Strictly positive effects are red; everything else is blue.
colors = ['red' if is_positive else 'blue' for is_positive in coef_df['Coefficient'] > 0]

# Asymmetric error bars: distance from the estimate to each bound.
lower_err = coef_df['Coefficient'] - coef_df['Lower Bound']
upper_err = coef_df['Upper Bound'] - coef_df['Coefficient']

plt.figure(figsize=(10, 6))
bars = plt.barh(coef_df.index, coef_df['Coefficient'], color=colors, xerr=[lower_err, upper_err])
plt.xlabel("Effect on Service Days")
plt.title("Impact of Different Factors on Service Time")
plt.axvline(x=0, color="black", linestyle="dashed")
plt.gca().invert_yaxis()  # largest coefficient at the top

# Write each coefficient at the tip of its bar, on the side the bar points to.
for bar, coef in zip(bars, coef_df['Coefficient']):
    label_y = bar.get_y() + bar.get_height() / 2
    label_align = 'left' if coef > 0 else 'right'
    plt.text(bar.get_width(), label_y, f"{coef:.2f}", ha=label_align, va='center', fontsize=10)

plt.grid(True)

plt.savefig("Different_Fac_Service_time_1.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Step 1: Simple regression of office visits on service days.
# Note: this rebinds `model`; later cells refit before reading it.
visits_formula = 'gov_office_visits ~  service_days'
model = smf.ols(formula=visits_formula, data=df).fit()
print(model.summary())

# Step 2: Scatter plot with the fitted regression line.
scatter_style = {'alpha': 0.6, 'color': '#4682B4'}    # steelblue points
line_style = {"color": "#FF6347", "linewidth": 2}     # tomato-red regression line

plt.figure(figsize=(8, 5))
sns.set_style("whitegrid")  # clean background
sns.regplot(
    data=df,
    x='service_days',
    y='gov_office_visits',
    scatter_kws=scatter_style,
    line_kws=line_style,
)
plt.title(
    "Impact of Service Days on Government Office Visits",
    fontsize=14,
    fontweight='bold',
    color='#333333',
)
plt.xlabel("Service Days", fontsize=12)
plt.ylabel("Gov Office Visits", fontsize=12)
plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
plt.tight_layout()
plt.savefig("Impact_Service_time_on_Visit.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Average service time for each (system, period) cell of the DiD design.
did_plot = df.groupby(['Treatment', 'Post'])['service_days'].mean().reset_index()

# Readable labels for the 0/1 codes.
did_plot['System'] = did_plot['Treatment'].map({0: 'Manual', 1: 'Digital'})
did_plot['Period'] = did_plot['Post'].map({0: 'Before', 1: 'After'})

# Re-aggregate on the labels; groupby skips rows whose code had no label (NaN key).
did_plot = did_plot.groupby(['Period', 'System'])['service_days'].mean().reset_index()

# pivot() returns its index sorted alphabetically ('After' ahead of 'Before'),
# which would draw the time axis backwards and misplace the "Digitalization
# Begins" marker. Restore chronological order, keeping only periods present.
period_order = [p for p in ('Before', 'After') if p in did_plot['Period'].values]
did_pivot = (
    did_plot.pivot(index='Period', columns='System', values='service_days')
    .reindex(period_order)
)

bg_color = '#e9ede9'
grid_color = '#6d6e6d'

fig, ax = plt.subplots(figsize=(8, 5))
sns.set_style("white")  # only influences axes created after this call

ax.set_facecolor(bg_color)

# sort=False keeps the Before -> After order supplied by the pivot index.
sns.lineplot(data=did_pivot, markers=True, dashes=False, linewidth=2.2, sort=False, ax=ax)

# Print each mean just below its marker; skip cells with no observations.
for period in did_pivot.index:
    for system in did_pivot.columns:
        value = did_pivot.loc[period, system]
        if pd.isna(value):
            continue
        ax.text(x=period, y=value - 1.3, s=f"{value:.1f}", ha='center', va='bottom', fontsize=10)

ax.set_title("Effect of Digitalization on Service Time", fontsize=13, color='#333333')
ax.set_ylabel("Average Service Days")
ax.set_xlabel("Time Period (Before vs After Digitalization)")
# x=0.5 is the midpoint between the two categorical positions (0 and 1).
ax.axvline(x=0.5, color="black", linestyle="dashed", linewidth=1.2, label="Digitalization Begins")
ax.legend(title="System Type")

ax.grid(True, linestyle='--', linewidth=0.7, color=grid_color, alpha=0.9)

plt.tight_layout()
plt.savefig("Effect_dig_on_Service_time_3.pdf", format='pdf', bbox_inches='tight')

In [None]:

# The source header may carry a trailing space on this column; normalise the
# name (a no-op when the column is already clean).
df = df.rename(columns={'cash_transaction ': 'cash_transaction'})

df['Treatment'] = df['service_type']
df['Post'] = (df['service_days'] <= 39).astype(int)

# Average service days for each (cash_transaction, period) cell.
bribe_plot = df.groupby(['cash_transaction', 'Post'])['service_days'].mean().reset_index()

# Readable labels for the 0/1 codes.
bribe_plot['Bribe Payment Status'] = bribe_plot['cash_transaction'].map({
    0: 'Did Not Pay Bribe',
    1: 'Paid Bribe Before'
})
bribe_plot['Digitalization Period'] = bribe_plot['Post'].map({
    0: 'Before Digitalization',
    1: 'After Digitalization'
})

bg_color = '#e9ede9'       # light grey background
grid_color = '#6d6e6d'

# pivot() sorts its index alphabetically, which would put "After Digitalization"
# left of "Before Digitalization"; restore chronological order.
period_order = [
    p for p in ('Before Digitalization', 'After Digitalization')
    if p in bribe_plot['Digitalization Period'].values
]
bribe_pivot = (
    bribe_plot.pivot(index='Digitalization Period', columns='Bribe Payment Status', values='service_days')
    .reindex(period_order)
)

# Create this cell's own figure and axes so nothing below touches an `ax`
# left over from a previous cell.
sns.set(style="whitegrid", font_scale=1.1)
fig, ax = plt.subplots(figsize=(10, 6))

# sort=False keeps the Before -> After order supplied by the pivot index.
sns.lineplot(data=bribe_pivot, markers=True, dashes=False, linewidth=2.5, sort=False, ax=ax)

# Print each mean just above its marker; skip cells with no observations.
for period in bribe_pivot.index:
    for status in bribe_pivot.columns:
        value = bribe_pivot.loc[period, status]
        if pd.isna(value):
            continue
        ax.text(x=period, y=value + 0.7, s=f"{value:.1f}", ha='center', va='bottom', fontsize=10)

ax.set_facecolor(bg_color)
ax.set_title("Impact of Bribe Payments on Service Processing Time\nBefore and After Digitalization", fontsize=15, weight='bold')
ax.set_ylabel("Average Service Days", fontsize=12)
ax.set_xlabel("Time Period", fontsize=12)
# x=0.5 is the midpoint between the two categorical positions (0 and 1).
ax.axvline(x=0.5, color="black", linestyle="dashed", label="Digitalization Point")
ax.legend(title="Payment Type", loc='upper left')
ax.grid(True, linestyle='--', linewidth=0.7, color=grid_color, alpha=0.9)
fig.tight_layout()
fig.savefig("Effect_Bribe_Payments_on_Service_time.pdf", format='pdf', bbox_inches='tight')

In [None]:

# Prepare the data (recomputed here so the cell does not rely on earlier ones
# for these two columns).
df['Treatment'] = df['service_type']
df['Post'] = (df['service_days'] <= 39).astype(int)

# Average service days for each (satisfaction, period) cell.
satisfaction_plot = df.groupby(['citizen_satisfaction', 'Post'])['service_days'].mean().reset_index()

# Readable labels for the 0/1 codes.
satisfaction_plot['Satisfaction'] = satisfaction_plot['citizen_satisfaction'].map({
    0: 'Not Satisfied',
    1: 'Satisfied'
})
satisfaction_plot['Period'] = satisfaction_plot['Post'].map({
    0: 'Before Digitalization',
    1: 'After Digitalization'
})

# Background and grid colors
bg_color = '#e9ede9'       # light grey background
grid_color = '#6d6e6d'

# pivot() sorts its index alphabetically, which would put "After Digitalization"
# left of "Before Digitalization"; restore chronological order.
period_order = [
    p for p in ('Before Digitalization', 'After Digitalization')
    if p in satisfaction_plot['Period'].values
]
satisfaction_pivot = (
    satisfaction_plot.pivot(index='Period', columns='Satisfaction', values='service_days')
    .reindex(period_order)
)

# Plot setup
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid", font_scale=1.1)

# Line plot; sort=False keeps the Before -> After order supplied by the pivot index.
plot = sns.lineplot(data=satisfaction_pivot, markers=True, dashes=False, linewidth=2.5, sort=False)

# Data labels just above each marker; skip cells with no observations.
for period in satisfaction_pivot.index:
    for status in satisfaction_pivot.columns:
        value = satisfaction_pivot.loc[period, status]
        if pd.isna(value):
            continue
        plt.text(x=period, y=value + 0.7, s=f"{value:.1f}", ha='center', va='bottom', fontsize=10)

# Titles and labels
plot.set_facecolor(bg_color)
plt.title("Impact of Citizen Satisfaction on Service Time\nBefore and After Digitalization", fontsize=15, weight='bold')
plt.ylabel("Average Service Days", fontsize=12)
plt.xlabel("Time Period", fontsize=12)

# Digitalization line: x=0.5 is the midpoint between categorical positions 0 and 1.
plt.axvline(x=0.5, color="black", linestyle="dashed", label="Digitalization Point")

# Legend in the upper-left corner
plt.legend(title="Satisfaction Status", loc='upper left')

# Final touches
plt.grid(True, linestyle='--', linewidth=0.7, color=grid_color, alpha=0.9)
plt.tight_layout()
plt.savefig("ImpactCitizen_Satisfaction_on_Service_time.pdf", format='pdf', bbox_inches='tight')


In [None]:
# Covariates for the office-visits model (service_days enters as a control here).
control_vars = [
    'Age', 'Gender', 'education', 'phone_type', 'laptop_access', 'internet_access',
    'app_submission_times', 'app_cost', 'e_mutation_cost', 'service_days',
    'cash_transaction', 'awareness_circulars', 'awareness_complaints',
    'citizen_satisfaction',
]

# Join the covariates into the right-hand side of a patsy formula.
controls_formula = ' + '.join(control_vars)
did_terms = "Treatment + Post + Treatment_Post"
formula = f"gov_office_visits ~ {did_terms} + {controls_formula}"

# Fit by ordinary least squares and print the full regression table.
model = smf.ols(formula=formula, data=df).fit()
print(model.summary())

In [None]:
# Gather point estimates and confidence bounds from the fitted model.
coefs = model.params
conf_int = model.conf_int()
conf_int.columns = ['Lower Bound', 'Upper Bound']

# One row per regressor (Intercept excluded), largest coefficient first.
coef_df = (
    pd.DataFrame({
        'Coefficient': coefs,
        'Lower Bound': conf_int['Lower Bound'],
        'Upper Bound': conf_int['Upper Bound'],
    })
    .drop('Intercept')
    .sort_values(by="Coefficient", ascending=False)
)

# Strictly positive effects are red; everything else is blue.
colors = ['red' if is_positive else 'blue' for is_positive in coef_df['Coefficient'] > 0]

# Asymmetric error bars: distance from the estimate to each bound.
lower_err = coef_df['Coefficient'] - coef_df['Lower Bound']
upper_err = coef_df['Upper Bound'] - coef_df['Coefficient']

plt.figure(figsize=(10, 6))
bars = plt.barh(coef_df.index, coef_df['Coefficient'], color=colors, xerr=[lower_err, upper_err])
plt.xlabel("Effect on gov office visits")
plt.title("Impact of Different Factors on Visit")
plt.axvline(x=0, color="black", linestyle="dashed")
plt.gca().invert_yaxis()  # largest coefficient at the top

# Label each bar slightly beyond its tip, on the side the bar points to.
for bar, coef in zip(bars, coef_df['Coefficient']):
    if coef > 0:
        offset, align = 0.05, 'left'
    else:
        offset, align = -0.05, 'right'
    label_x = bar.get_width() + offset
    label_y = bar.get_y() + bar.get_height() / 2
    plt.text(label_x, label_y, f"{coef:.2f}", ha=align, va='center', fontsize=9)

plt.savefig("Different_Fac_visit.pdf", format='pdf', bbox_inches='tight')

In [None]:

# Mean office visits for each (system, period) cell. The period must come from
# `Post`: `Treatment_Post` is the Treatment x Post interaction, which is 0 for
# every manual-system row, so grouping on it would file all manual observations
# under "Before Digitalization".
df_analysis = df.groupby(["Treatment", "Post"])["gov_office_visits"].mean().reset_index()

# Readable labels for the 0/1 codes.
df_analysis["Service_Type"] = df_analysis["Treatment"].map({0: "Manual System", 1: "Digital System"})
df_analysis["Digitalization_Period"] = df_analysis["Post"].map({0: "Before Digitalization", 1: "After Digitalization"})

plt.figure(figsize=(8, 5))
ax = sns.barplot(
    x="Service_Type",
    y="gov_office_visits",
    hue="Digitalization_Period",
    # Fix the bar order so "Before" is always drawn ahead of "After".
    hue_order=["Before Digitalization", "After Digitalization"],
    data=df_analysis,
    palette={"Before Digitalization": "blue", "After Digitalization": "orange"},
    capsize=0.1
)

# Titles and labels
plt.title("Effect of Digital vs Manual System on Government Office Visits")
plt.xlabel("Service Type")
plt.ylabel("Average Number of Office Visits")

# Add a legend
plt.legend(title="Digitalization", fontsize=11)
plt.grid(True)

plt.savefig("gov_office_visits.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Step 1: Outcome variable — total cost is application cost plus e-mutation cost.
df['cost'] = df['app_cost'].add(df['e_mutation_cost'])

# Step 2: Covariates for the cost model (the two cost components are left out).
control_vars = [
    'Age', 'Gender', 'education', 'phone_type', 'gov_office_visits',
    'laptop_access', 'internet_access', 'app_submission_times', 'service_days',
    'cash_transaction', 'awareness_circulars', 'awareness_complaints',
    'citizen_satisfaction',
]

# Step 3: Join the covariates into the right-hand side of a patsy formula.
controls_formula = ' + '.join(control_vars)
did_terms = "Treatment + Post + Treatment_Post"

# Step 4: Final regression formula for cost
formula = f"cost ~ {did_terms} + {controls_formula}"

# Fit by ordinary least squares and print the full regression table.
model = smf.ols(formula=formula, data=df).fit()
print(model.summary())

In [None]:
# Gather point estimates and confidence bounds from the fitted model.
coefs = model.params
conf_int = model.conf_int()
conf_int.columns = ['Lower Bound', 'Upper Bound']

# One row per regressor (Intercept excluded), largest coefficient first.
coef_df = (
    pd.DataFrame({
        'Coefficient': coefs,
        'Lower Bound': conf_int['Lower Bound'],
        'Upper Bound': conf_int['Upper Bound'],
    })
    .drop('Intercept')
    .sort_values(by="Coefficient", ascending=False)
)

# Strictly positive effects are red; everything else is blue.
colors = ['red' if is_positive else 'blue' for is_positive in coef_df['Coefficient'] > 0]

# Asymmetric error bars: distance from the estimate to each bound.
lower_err = coef_df['Coefficient'] - coef_df['Lower Bound']
upper_err = coef_df['Upper Bound'] - coef_df['Coefficient']

plt.figure(figsize=(10, 6))
bars = plt.barh(coef_df.index, coef_df['Coefficient'], color=colors, xerr=[lower_err, upper_err])
plt.xlabel("Effect on cost")
plt.title("Impact of Different Factors on cost")
plt.axvline(x=0, color="black", linestyle="dashed")
plt.gca().invert_yaxis()  # largest coefficient at the top

# Write each coefficient at the tip of its bar, on the side the bar points to.
for bar, coef in zip(bars, coef_df['Coefficient']):
    label_y = bar.get_y() + bar.get_height() / 2
    label_align = 'left' if coef > 0 else 'right'
    plt.text(bar.get_width(), label_y, f"{coef:.2f}", ha=label_align, va='center', fontsize=10)

plt.grid(True)

plt.savefig("Different_Fac_Service_cost.pdf", format='pdf', bbox_inches='tight')

In [None]:
# Step 1: Total cost of completing the process (application + e-mutation cost).
df['cost'] = df['app_cost'] + df['e_mutation_cost']

# Step 2: Split by digitalization period
before_cost = df.loc[df['Post'] == 0, 'cost']
after_cost = df.loc[df['Post'] == 1, 'cost']

# Step 3: Independent two-sample t-test; equal_var=False selects Welch's
# variant, which does not assume equal variances in the two periods.
t_stat, p_value = ttest_ind(before_cost, after_cost, equal_var=False)

# Step 4: Per-period summary (mean, count, std) for the bar plot
cost_comparison = df.groupby('Post')['cost'].agg(['mean', 'count', 'std']).reset_index()
cost_comparison['Period'] = cost_comparison['Post'].map({0: 'Before Digitalization', 1: 'After Digitalization'})
cost_comparison = cost_comparison.drop(columns='Post')

# Step 5: Plot average cost to complete full process.
# The table already holds one row per bar, so seaborn has nothing to draw an
# error bar from; the deprecated `ci=None` argument is therefore not needed.
plt.figure(figsize=(8, 6))
sns.barplot(x='Period', y='mean', data=cost_comparison, palette='Set2', edgecolor='black')

# Annotate bar values. Bars sit at positions 0, 1, ... in row order, so use the
# running position rather than the DataFrame index label.
for position, mean_cost in enumerate(cost_comparison['mean']):
    plt.text(position, mean_cost + 5, f"{mean_cost:.1f}", ha='center', fontsize=11)

# Final formatting
plt.title("Average Cost to Complete Full Process\nBefore vs After Digitalization", fontsize=14, weight='bold')
plt.ylabel("Average Total Cost (BDT)")
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig("Cost_Complete_Full_Process.pdf", format='pdf', bbox_inches='tight')

# Step 6: Report the t-test so the computed statistic is not silently discarded.
print("T-Test Result (Cost to Complete Full Process):")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.4f}")

In [None]:
# Step 1: Total cost of completing the process (application + e-mutation cost).
df['cost'] = df['app_cost'].add(df['e_mutation_cost'])

# Step 2: Logistic regression of satisfaction on total cost.
logit_spec = smf.logit(formula='citizen_satisfaction ~ cost', data=df)
logit_model = logit_spec.fit()
print(logit_model.summary())

# Step 3: Predicted satisfaction probability on an evenly spaced grid of 100
# cost values spanning the observed minimum and maximum.
cost_range = np.linspace(df['cost'].min(), df['cost'].max(), 100)
pred_df = pd.DataFrame({'cost': cost_range})
pred_df['predicted_prob'] = logit_model.predict(pred_df)

# Step 4: Plot the predicted probability curve.
plt.figure(figsize=(8, 6))
sns.lineplot(
    data=pred_df,
    x='cost',
    y='predicted_prob',
    color='green',
    linewidth=2.5,
)
plt.title(
    "Predicted Probability of Citizen Satisfaction\nBased on Total Cost",
    fontsize=14,
    weight='bold',
)
plt.xlabel("Total Cost (BDT)")
plt.ylabel("Probability of Being Satisfied")
plt.grid(True)
plt.tight_layout()
plt.savefig("Citizen_Satisfaction_Total_Cost.pdf", format='pdf', bbox_inches='tight')

**Training the Model – Finding the Line of Best Fit**


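To find the best values for the intercept $\beta_0$ and the slope $\beta_1$ of the simple linear model $y = \beta_0 + \beta_1 x + \varepsilon$, we use the least squares method. This method chooses the coefficients that minimize the sum of squared differences (residuals) between the predicted and actual values, $\sum (y_i - \hat{y}_i)^2$. The resulting closed-form estimates are: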
\begin{equation}
\hat{\beta}_1 = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2}, \quad \hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x}
\end{equation}