In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats


In [None]:
# Load Districtwise.csv and discard incomplete rows in a single expression.
# dropna() with its default arguments removes every row that has a missing
# value in ANY column, not only in the columns analysed further down.
education_districtwise = pd.read_csv('Districtwise.csv').dropna()

In [None]:
# Keep only the three columns used in this analysis: 'DISTNAME', 'OVERALL_LI'
# and 'TOTPOPULAT', then preview the first 10 rows.
# The explicit .copy() makes the result an independent DataFrame, so adding a
# column to it later does not trigger pandas' SettingWithCopyWarning.
education_districtwise = education_districtwise[["DISTNAME", "OVERALL_LI", "TOTPOPULAT"]].copy()
print("Selected Columns (First 10 Rows):")
print(education_districtwise.head(10))

In [None]:
# Scatter plot with 'TOTPOPULAT' on the x-axis and 'OVERALL_LI' on the y-axis;
# the labels mapping replaces the raw column names with readable axis titles.
scatter_fig = px.scatter(
    education_districtwise,
    x='TOTPOPULAT',
    y='OVERALL_LI',
    labels={'TOTPOPULAT': 'Total Population', 'OVERALL_LI': 'Overall Literacy'},
    title='Scatter Plot: TOTPOPULAT vs OVERALL_LI',
)
scatter_fig.show()

In [87]:
# Histogram of the 'OVERALL_LI' column (nbins=40), built with Plotly Express.
# update_layout() returns the figure itself, so the bar-gap styling is chained
# directly onto the constructor call.
hist_fig = px.histogram(
    education_districtwise,
    x='OVERALL_LI',
    nbins=40,
    title='Histogram of OVERALL_LI',
    labels={'OVERALL_LI': 'OVERALL_LI'},
).update_layout(bargap=0.1, bargroupgap=0.05)
hist_fig.show()

In [None]:
# Empirical CDF of 'OVERALL_LI': the sorted values go on the x-axis and, for
# the k-th smallest value, the fraction k / n goes on the y-axis (n = row count).
cdf_fig = go.Figure(
    data=[
        go.Scatter(
            x=np.sort(education_districtwise['OVERALL_LI']),
            y=np.arange(1, len(education_districtwise) + 1) / len(education_districtwise),
            mode='lines',
            name='CDF',
        )
    ]
)
cdf_fig.update_layout(
    xaxis_title='OVERALL_LI',
    yaxis_title='Cumulative Probability',
    title='Cumulative Distribution Function (CDF) of OVERALL_LI',
)
cdf_fig.show()

In [None]:
# Summary statistics for 'OVERALL_LI'.
# Series.std() uses ddof=1 by default, i.e. the sample standard deviation.
mean_overall_li, std_overall_li = (
    education_districtwise['OVERALL_LI'].mean(),
    education_districtwise['OVERALL_LI'].std(),
)
print("\nMean of OVERALL_LI:", mean_overall_li)
print("Standard Deviation of OVERALL_LI:", std_overall_li)

<img src="empirical.png" alt="Illustration of the empirical rule" width="700"/>


In [None]:
# For 1, 2 and 3 standard deviations, report the share of 'OVERALL_LI' values
# lying in [mean - i*std, mean + i*std], both ends included.
for i in (1, 2, 3):
    lower_limit, upper_limit = (
        mean_overall_li - i * std_overall_li,
        mean_overall_li + i * std_overall_li,
    )
    # between() is inclusive on both bounds by default; the mean of the
    # resulting boolean mask is the fraction of rows inside the interval.
    percentage_within_limit = education_districtwise['OVERALL_LI'].between(lower_limit, upper_limit).mean()
    print(f"\nPercentage of data within {i} standard deviations from the mean: {percentage_within_limit:.2%}")

In [None]:
# Standardise 'OVERALL_LI' and store the result in a 'Z_SCORE' column.
# assign() builds a new DataFrame rather than writing into the existing one,
# which avoids pandas' SettingWithCopyWarning on a frame derived by column
# selection and keeps this cell safe to re-run (an existing 'Z_SCORE' column
# is simply replaced).
# ddof=0 is scipy's default (population standard deviation); it is spelled out
# because Series.std(), used for std_overall_li, defaults to ddof=1, so these
# Z-scores are scaled slightly differently from (x - mean) / std_overall_li.
education_districtwise = education_districtwise.assign(
    Z_SCORE=stats.zscore(education_districtwise['OVERALL_LI'], ddof=0)
)
print("\nDataFrame with Z-Scores:")
print(education_districtwise)



<img src="outliers.png" alt="Illustration of outliers" width="700"/>



In [86]:
# Flag potential outliers: rows whose 'Z_SCORE' is more than 3 away from zero
# in either direction (|z| > 3), then print them.
outliers = education_districtwise.loc[education_districtwise['Z_SCORE'].abs() > 3]
print("Rows with Z-Scores beyond 3 standard deviations (potential outliers):")
print(outliers)


Rows with Z-Scores beyond 3 standard deviations (potential outliers):
      DISTNAME  OVERALL_LI  TOTPOPULAT   Z_SCORE
434  DANTEWADA       42.67    532791.0 -3.030890
494  ALIRAJPUR       37.22    728677.0 -3.569821
