In [1]:
# Import necessary libraries for data analysis and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

In [2]:
# Import Plotly libraries for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import HTML

In [3]:
# Mount Google Drive (this cell only works when running on Google Colab)
from google.colab import drive
drive.mount('/content/drive')  # Mount the entire Google Drive

# Navigate to the directory where the unemployment CSV files are located
import os
os.chdir('/content/drive/MyDrive/Project_O/Task_2')

# Load the dataset analysed in the rest of the notebook.
# The earlier read of 'Unemployment in India.csv' was assigned to the same
# name and overwritten on the very next line, so that dead load is removed.
data = pd.read_csv('Unemployment_Rate_upto_11_2020.csv')

Mounted at /content/drive


In [4]:
# Preview the top five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Region.1,longitude,latitude
0,Andhra Pradesh,31-01-2020,M,5.48,16635535,41.02,South,15.9129,79.74
1,Andhra Pradesh,29-02-2020,M,5.83,16545652,40.9,South,15.9129,79.74
2,Andhra Pradesh,31-03-2020,M,5.79,15881197,39.18,South,15.9129,79.74
3,Andhra Pradesh,30-04-2020,M,20.51,11336911,33.1,South,15.9129,79.74
4,Andhra Pradesh,31-05-2020,M,17.43,12988845,36.46,South,15.9129,79.74


In [5]:
# Preview the bottom five rows of the freshly loaded frame
data.tail(n=5)

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Region.1,longitude,latitude
262,West Bengal,30-06-2020,M,7.29,30726310,40.39,East,22.9868,87.855
263,West Bengal,31-07-2020,M,6.83,35372506,46.17,East,22.9868,87.855
264,West Bengal,31-08-2020,M,14.87,33298644,47.48,East,22.9868,87.855
265,West Bengal,30-09-2020,M,9.35,35707239,47.73,East,22.9868,87.855
266,West Bengal,31-10-2020,M,9.98,33962549,45.63,East,22.9868,87.855


In [6]:
# Display basic information about the dataset: index range, column names,
# non-null counts per column, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 9 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    267 non-null    object 
 1    Date                                     267 non-null    object 
 2    Frequency                                267 non-null    object 
 3    Estimated Unemployment Rate (%)          267 non-null    float64
 4    Estimated Employed                       267 non-null    int64  
 5    Estimated Labour Participation Rate (%)  267 non-null    float64
 6   Region.1                                  267 non-null    object 
 7   longitude                                 267 non-null    float64
 8   latitude                                  267 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 18.9+ KB


In [7]:
# Build a boolean mask of the frame: True wherever a value is missing
data.isna()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Region.1,longitude,latitude
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
262,False,False,False,False,False,False,False,False,False
263,False,False,False,False,False,False,False,False,False
264,False,False,False,False,False,False,False,False,False
265,False,False,False,False,False,False,False,False,False


In [8]:
# Count the missing values in each column
data.isna().sum()

Region                                      0
 Date                                       0
 Frequency                                  0
 Estimated Unemployment Rate (%)            0
 Estimated Employed                         0
 Estimated Labour Participation Rate (%)    0
Region.1                                    0
longitude                                   0
latitude                                    0
dtype: int64

In [9]:
# Replace the raw CSV headers (several carry a leading space) with clean names.
# The assignment is positional, so this list must follow the CSV column order.
renamed_columns = [
    'States',
    'Date',
    'Frequency',
    'Estimated Unemployment Rate',
    'Estimated Employed',
    'Estimated Labour Participation Rate',
    'Region',
    'longitude',
    'latitude',
]
data.columns = renamed_columns

In [10]:
# Parse the day-first date strings (e.g. 31-01-2020) into datetime values
parsed_dates = pd.to_datetime(data['Date'], dayfirst=True)
data['Date'] = parsed_dates

In [11]:
# Store the state names as a categorical column
state_names = data['States']
data['States'] = state_names.astype('category')

In [12]:
# Derive the month number straight from the parsed dates. No temporary
# 'Month' column (and no in-place drop) is needed; the explicit int64 cast
# keeps the dtype the previous row-wise int() conversion produced.
data['Month_int'] = data['Date'].dt.month.astype('int64')
# Map the month number to its abbreviated name (e.g. 1 -> 'Jan')
data['Month_name'] = data['Month_int'].map(lambda month: calendar.month_abbr[month])

In [13]:
# Store the sampling-frequency flag as a categorical column
frequency_flags = data['Frequency']
data['Frequency'] = frequency_flags.astype('category')

In [14]:
# Preview the first five rows after the renaming and type conversions
data.head(n=5)

Unnamed: 0,States,Date,Frequency,Estimated Unemployment Rate,Estimated Employed,Estimated Labour Participation Rate,Region,longitude,latitude,Month_int,Month_name
0,Andhra Pradesh,2020-01-31,M,5.48,16635535,41.02,South,15.9129,79.74,1,Jan
1,Andhra Pradesh,2020-02-29,M,5.83,16545652,40.9,South,15.9129,79.74,2,Feb
2,Andhra Pradesh,2020-03-31,M,5.79,15881197,39.18,South,15.9129,79.74,3,Mar
3,Andhra Pradesh,2020-04-30,M,20.51,11336911,33.1,South,15.9129,79.74,4,Apr
4,Andhra Pradesh,2020-05-31,M,17.43,12988845,36.46,South,15.9129,79.74,5,May


In [15]:
# Summary statistics for the three numeric indicators, one indicator per row,
# rounded to two decimals
stat_columns = [
    'Estimated Unemployment Rate',
    'Estimated Employed',
    'Estimated Labour Participation Rate',
]
data_stats = data[stat_columns]

data_stats.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Estimated Unemployment Rate,267.0,12.24,10.8,0.5,4.84,9.65,16.76,75.85
Estimated Employed,267.0,13962105.72,13366318.36,117542.0,2838930.5,9732417.0,21878686.0,59433759.0
Estimated Labour Participation Rate,267.0,41.68,7.85,16.77,37.26,40.39,44.06,69.69


In [16]:
# Mean of each unemployment-related indicator per region, rounded to 2 decimals
indicator_columns = [
    'Estimated Unemployment Rate',
    'Estimated Employed',
    'Estimated Labour Participation Rate',
]
region_stats = (
    data.groupby(['Region'])[indicator_columns]
    .mean()
    .reset_index()
    .round(2)
)
region_stats

Unnamed: 0,Region,Estimated Unemployment Rate,Estimated Employed,Estimated Labour Participation Rate
0,East,13.92,19602366.9,40.11
1,North,15.89,13072487.92,38.7
2,Northeast,10.95,3617105.53,52.06
3,South,10.45,14040589.33,40.44
4,West,8.24,18623512.72,41.26


In [17]:
# Box plot of the unemployment-rate distribution for every state,
# with the x-axis categories ordered by descending total
fig = px.box(
    data,
    x='States',
    y='Estimated Unemployment Rate',
    color='States',
    title='Unemployment rate',
    template='plotly',
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

In [18]:
# Create a bar plot to show the average unemployment rate in each state
plot_ump = data[['Estimated Unemployment Rate','States']]
# 'States' is categorical: pass observed=True explicitly so the result holds
# only states present in the data and does not rely on the deprecated
# default of `observed` for categorical groupers.
du_unemp = (
    plot_ump.groupby('States', observed=True)
    .mean()
    .reset_index()
    .sort_values('Estimated Unemployment Rate')  # ascending: lowest rate first
)
fig = px.bar(du_unemp, x='States',y='Estimated Unemployment Rate',color='States',
            title='Average Unemployment Rate in each state',template='plotly')
fig.show()

In [19]:
# Animated bar chart: one frame per month name, bars grouped by region and
# coloured by state
fig = px.bar(
    data,
    x='Region',
    y='Estimated Unemployment Rate',
    animation_frame='Month_name',
    color='States',
    title='Unemployment rate across region from Jan.2020 to Oct.2020',
    height=700,
    template='plotly',
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
# Set the frame duration on the first button of the first update menu
# (presumably the Play button that plotly.express adds - confirm)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.show()

In [20]:
# Prepare the data for the sunburst plot: mean unemployment rate per (region, state)
unemplo_du = data[['States','Region','Estimated Unemployment Rate','Estimated Employed','Estimated Labour Participation Rate']]
# 'States' is categorical. Without observed=True, pandas emits every
# Region x States combination, including pairs that never occur together,
# and fills their mean with NaN. Keep only the combinations present in the data.
unemplo = (
    unemplo_du.groupby(['Region','States'], observed=True)['Estimated Unemployment Rate']
    .mean()
    .reset_index()
)

In [21]:
# Sunburst chart: regions on the inner ring, states on the outer ring,
# sector size given by the mean unemployment rate.
# NOTE(review): no `color` argument is passed, so `color_continuous_scale`
# probably has no visible effect - confirm whether colouring by rate was intended.
fig = px.sunburst(
    unemplo,
    path=['Region', 'States'],
    values='Estimated Unemployment Rate',
    color_continuous_scale='Plasma',
    title='unemployment rate in each region and state',
    height=400,
    template='ggplot2',
)

fig.show()