In [None]:
# Install dependencies if not already installed
# !pip install pandas matplotlib seaborn plotly

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Locations of the two raw CSV exports
file1 = "/content/Unemployment_Rate_upto_11_2020.csv"
file2 = "/content/Unemployment in India.csv"

# Read the files in order: file1 -> df1, file2 -> df2
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Column headers may carry stray whitespace; normalise them so the
# name-based lookups below work on both frames
for frame in (df1, df2):
    frame.columns = frame.columns.str.strip()

# Show the normalised headers for a quick visual check
print("Cleaned columns in Dataset 1:\n", df1.columns)
print("\nCleaned columns in Dataset 2:\n", df2.columns)

# Parse 'Date' as day-first; values that cannot be parsed become NaT
# (errors='coerce') instead of raising
for frame in (df1, df2):
    frame['Date'] = pd.to_datetime(frame['Date'], dayfirst=True, errors='coerce')

# Keep only the rows that actually report an unemployment rate
rate_col = 'Estimated Unemployment Rate (%)'
df1_clean = df1.dropna(subset=[rate_col])
df2_clean = df2.dropna(subset=[rate_col])

# Columns kept from both sources when they are stacked together
common_columns = [
    'Region',
    'Date',
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]

# Stack the two cleaned frames (fresh 0..n-1 index), then order the rows
# chronologically
frames_to_stack = [df1_clean[common_columns], df2_clean[common_columns]]
df_combined = pd.concat(frames_to_stack, ignore_index=True).sort_values('Date')

# -------------------------------
# National-Level Unemployment Trend
# -------------------------------
# One row per date: the mean unemployment rate over every record that
# shares that date
rate_col = 'Estimated Unemployment Rate (%)'
national_trend = df_combined.groupby('Date', as_index=False)[rate_col].mean()

# Static line chart of the per-date average
trend_fig, trend_ax = plt.subplots(figsize=(14, 6))
trend_ax.plot(national_trend['Date'], national_trend[rate_col], marker='o')
trend_ax.set_title("India Monthly Average Unemployment Rate Over Time")
trend_ax.set_xlabel("Date")
trend_ax.set_ylabel("Unemployment Rate (%)")
trend_ax.grid(True)
trend_fig.tight_layout()
plt.show()

# Interactive plot of the same series
fig = px.line(
    national_trend,
    x="Date",
    y=rate_col,
    title="India Monthly Average Unemployment Rate Over Time (Interactive)",
)
fig.show()

# -------------------------------
# COVID-19 Impact Analysis
# -------------------------------
# Cut-off used to split the records into "before" and "from this date on"
covid_start = pd.Timestamp("2020-03-01")
rate_col = 'Estimated Unemployment Rate (%)'

# Two explicit masks rather than one mask and its negation: a row whose
# Date is NaT fails both comparisons and so lands in neither group
is_before_cutoff = df_combined['Date'] < covid_start
is_from_cutoff = df_combined['Date'] >= covid_start

pre_covid = df_combined[is_before_cutoff]
post_covid = df_combined[is_from_cutoff]

# Average rate over all records in each period
mean_pre = pre_covid[rate_col].mean()
mean_post = post_covid[rate_col].mean()

print(f"\nMean Unemployment Rate Before COVID-19: {mean_pre:.2f}%")
print(f"Mean Unemployment Rate After COVID-19: {mean_post:.2f}%")

# Bar comparison of the two period means
period_labels = ['Pre-COVID', 'Post-COVID']
period_means = [mean_pre, mean_post]

plt.figure(figsize=(6, 4))
plt.bar(period_labels, period_means, color=['green', 'red'])
plt.ylabel('Mean Unemployment Rate (%)')
plt.title('Impact of COVID-19 on Unemployment')
plt.show()

# -------------------------------
# State-Level Trend
# -------------------------------
# df_combined stacks two source files, so a state can have more than one
# row for the same date. Plotting the raw rows would connect those
# duplicates and draw vertical zig-zags, so each state is first averaged
# per date -- the same aggregation the national trend uses.
states_of_interest = ['Andhra Pradesh', 'Tamil Nadu', 'Delhi', 'Maharashtra']

plt.figure(figsize=(15, 8))
for state in states_of_interest:
    state_data = df_combined[df_combined['Region'] == state]

    # One value per date; groupby also returns the dates in ascending order.
    # When a state already has a single row per date this is a no-op.
    state_trend = (
        state_data
        .groupby('Date')['Estimated Unemployment Rate (%)']
        .mean()
    )
    plt.plot(state_trend.index, state_trend.values, label=state)

plt.title("State-wise Unemployment Rate Trends")
plt.xlabel("Date")
plt.ylabel("Unemployment Rate (%)")
plt.legend()
plt.grid(True)
plt.show()

# -------------------------------
# Monthly Seasonality Patterns
# -------------------------------
# Calendar order for the x-axis; groupby alone would sort month names
# alphabetically
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Tag every record with the name of its month (stored on df_combined)
df_combined['Month'] = df_combined['Date'].dt.month_name()

# Mean rate per month name, rearranged into calendar order; a month with
# no records shows up as NaN
rate_by_month = df_combined.groupby('Month')['Estimated Unemployment Rate (%)'].mean()
monthly_avg = rate_by_month.reindex(month_order)

plt.figure(figsize=(12, 6))
monthly_avg.plot.bar(color='skyblue')
plt.title('Average Unemployment Rate by Month (Seasonality)')
plt.ylabel('Average Unemployment Rate (%)')
plt.xticks(rotation=45)
plt.show()

# -------------------------------
# Correlation Analysis
# -------------------------------
# The three indicators whose pairwise correlation is shown
indicator_names = [
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
]
numeric_cols = df_combined[indicator_names]

# Pairwise correlation matrix (DataFrame.corr defaults), drawn as an
# annotated heatmap
indicator_corr = numeric_cols.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(indicator_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Between Employment Indicators')
plt.show()

# -------------------------------
# Save cleaned and combined dataset
# -------------------------------
# Written to the current working directory; the row index is left out
output_csv = "cleaned_combined_unemployment_data.csv"
df_combined.to_csv(output_csv, index=False)

print("\nAnalysis Complete. Cleaned combined dataset saved as 'cleaned_combined_unemployment_data.csv'.")


: 