In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import requests 
import json 
import math
import random 
random.seed(42)

In [2]:
#Getting data from APIs
response = requests.get("https://api.covid19india.org/state_district_wise.json") #making a request 
data = response.json() #Decoding the JSON data 
zones = requests.get("https://api.covid19india.org/zones.json") 
zone_data = zones.json()

In [3]:
#Parsing the JSON data and putting it in a list 
lyst = []
for i in data:
    for j in data[i]['districtData']:
        lyst.append([j,data[i]['districtData'][j]['confirmed'],data[i]['districtData'][j]['active'], data[i]['districtData'][j]['deceased'],data[i]['districtData'][j]['recovered']])
Dist_stats = pd.DataFrame(lyst, columns = ['District', 'Confirmed','Active','Deceased','Recovered'])

In [4]:
zone_data

{'zones': [{'district': 'Nicobars',
   'districtcode': 'AN_Nicobars',
   'lastupdated': '01/05/2020',
   'source': 'https://www.facebook.com/airnewsalerts/photos/a.262571017217636/1710062729135117/?type=3&theater',
   'state': 'Andaman and Nicobar Islands',
   'statecode': 'AN',
   'zone': 'Green'},
  {'district': 'North and Middle Andaman',
   'districtcode': 'AN_North and Middle Andaman',
   'lastupdated': '01/05/2020',
   'source': 'https://www.facebook.com/airnewsalerts/photos/a.262571017217636/1710062729135117/?type=3&theater',
   'state': 'Andaman and Nicobar Islands',
   'statecode': 'AN',
   'zone': 'Green'},
  {'district': 'South Andaman',
   'districtcode': 'AN_South Andaman',
   'lastupdated': '01/05/2020',
   'source': 'https://www.facebook.com/airnewsalerts/photos/a.262571017217636/1710062729135117/?type=3&theater',
   'state': 'Andaman and Nicobar Islands',
   'statecode': 'AN',
   'zone': 'Red'},
  {'district': 'Anantapur',
   'districtcode': 'AP_Anantapur',
   'lastupda

In [5]:
#Parsing the JSON data and putting it in a dictionary 
zones = {}
for i in zone_data['zones']:
    zones[i['district']] = i['zone']
    

In [6]:
#Extracting the zone data
zone_div = []
for i in Dist_stats['District']:
    if i in zones.keys():
        zone_div.append(zones[i])
    else:
        zone_div.append("No data found")

In [7]:
#Making the zone data a column in the dataframe 
Dist_stats['Zone'] = zone_div

In [8]:
Dist_stats.head(20)

Unnamed: 0,District,Confirmed,Active,Deceased,Recovered,Zone
0,Unassigned,8571,8571,0,0,No data found
1,Nicobars,0,0,0,0,Green
2,North and Middle Andaman,1,0,0,1,Green
3,South Andaman,32,0,0,32,Red
4,Unknown,2,2,0,0,No data found
5,Foreign Evacuees,197,176,0,21,No data found
6,Anantapur,366,186,5,175,Orange
7,Chittoor,331,104,4,223,Red
8,East Godavari,301,134,4,163,Orange
9,Guntur,584,160,9,415,Red


In [9]:
response3 = requests.get("https://livingatlas.esri.in/server/rest/services/LivingAtlas/IND_Demography/MapServer/0/query?where=1%3D1&outFields=distcode,distname,livingatlas.sde.IND_DIST_Demography.area,no_hh,tot_p,tot_m,tot_f,p_06,m_06,f_06,p_sc,m_sc,f_sc,p_st,m_st,f_st,p_lit,m_lit,f_lit,p_ill,m_ill,f_ill,tot_work_p,tot_work_m,non_work_f,non_work_m,objectid&returnGeometry=false&outSR=4326&f=json")
population = response3.json()

In [10]:
population 

{'displayFieldName': 'statename',
 'fieldAliases': {'distcode': 'District Code',
  'distname': 'District Name',
  'livingatlas.sde.IND_DIST_Demography.area': 'Area',
  'no_hh': 'No of Households',
  'tot_p': 'Total Population Person',
  'tot_m': 'Total Population Male',
  'tot_f': 'Total Population Female',
  'p_06': 'Population in the age group 0-6 Person',
  'm_06': 'Population in the age group 0-6 Male',
  'f_06': 'Population in the age group 0-6 Female',
  'p_sc': 'Scheduled Castes population Person',
  'm_sc': 'Scheduled Castes population Male',
  'f_sc': 'Scheduled Castes populationFemale',
  'p_st': 'Scheduled Tribes population Person',
  'm_st': 'Scheduled Tribes population Male',
  'f_st': 'Scheduled Tribes population Female',
  'p_lit': 'Literates Population Person',
  'm_lit': 'Literates Population Male',
  'f_lit': 'Literates Population Female',
  'p_ill': 'Illiterate Persons',
  'm_ill': 'Illiterate Male',
  'f_ill': 'Illiterate Female',
  'tot_work_p': 'Total Worker Popul

In [11]:
#Parsing the JSON data and putting it in a dictionary 
demographic_data = {}
for i in population['features']:
    demographic_data[i['attributes']['distname']] = [i['attributes']['p_ill'],i['attributes']['p_lit'],i['attributes']['tot_p'],i['attributes']['tot_work_p'],i['attributes']['livingatlas.sde.IND_DIST_Demography.area']]
    

In [12]:
#Preparing the data so that it can be put in datamodels and be added as columns 
illiterate = []
literate = []
total_population = []
employed_population = [] 
area = [] 
for i in Dist_stats['District']:
    if i in demographic_data.keys():
        illiterate.append(demographic_data[i][0])
        literate.append(demographic_data[i][1])
        total_population.append(demographic_data[i][2])
        employed_population.append(demographic_data[i][3])
        area.append(demographic_data[i][4])
        
    else: #appending with 0, will later be changed with a random value between mean and std
        illiterate.append(0)
        literate.append(0)
        total_population.append(0)
        employed_population.append(0)
        area.append(0)
        
        
        
        

In [13]:
Dist_stats['literate_population'] = literate

In [14]:
Dist_stats['Illiterate_population'] = illiterate

In [15]:
Dist_stats['Total_Population'] = total_population

In [16]:
Dist_stats['Employed_population'] = employed_population

In [17]:
Dist_stats['Area'] = area

In [18]:
Dist_stats

Unnamed: 0,District,Confirmed,Active,Deceased,Recovered,Zone,literate_population,Illiterate_population,Total_Population,Employed_population,Area
0,Unassigned,8571,8571,0,0,No data found,0.0,0.0,0.0,0.0,0.0
1,Nicobars,0,0,0,0,Green,25332.0,11510.0,36842.0,17125.0,1841.0
2,North and Middle Andaman,1,0,0,1,Green,0.0,0.0,0.0,0.0,0.0
3,South Andaman,32,0,0,32,Red,190266.0,47876.0,238142.0,96831.0,2672.0
4,Unknown,2,2,0,0,No data found,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
764,Purba Bardhaman,127,23,0,104,Orange,0.0,0.0,0.0,0.0,0.0
765,Purba Medinipur,171,90,2,79,Red,3923194.0,1172681.0,5095875.0,1910320.0,4713.0
766,Purulia,83,41,0,42,Green,0.0,0.0,0.0,0.0,0.0
767,South 24 Parganas,331,204,9,118,Orange,0.0,0.0,0.0,0.0,0.0


In [19]:
for i in range(len(Dist_stats.literate_population)):
    if Dist_stats.literate_population[i] == 0:
        Dist_stats.literate_population[i] = np.random.randn()* 200000 + np.mean(Dist_stats.literate_population)

for i in range(len(Dist_stats.Illiterate_population)):
    if Dist_stats.Illiterate_population[i] == 0:
        Dist_stats.Illiterate_population[i] = np.random.randn()* 300000 + np.mean(Dist_stats.literate_population)
        
for i in range(len(Dist_stats.Total_Population)):
    if Dist_stats.Total_Population[i] == 0:
        Dist_stats.Total_Population[i] = np.random.randn() * 500000 + np.mean(Dist_stats.Total_Population)

for i in range(len(Dist_stats.Employed_population)):
    if Dist_stats.Employed_population[i] == 0:
        Dist_stats.Employed_population[i] = np.random.randn() * 200000 + np.mean(Dist_stats.Employed_population)

for i in range(len(Dist_stats.Area)):
    if Dist_stats.Area[i] == 0:
        Dist_stats.Area[i] = np.random.randn() * 1000 + np.mean(Dist_stats.Area)
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import ker

In [20]:
Dist_stats['Percentage_affected'] = Dist_stats['Confirmed']/Dist_stats['Total_Population']*100
Dist_stats['Unemployed_population'] = Dist_stats['Total_Population']-Dist_stats['Employed_population']
Dist_stats['Unemployment_rate'] = (Dist_stats['Unemployed_population']/Dist_stats['Total_Population']) * 100
Dist_stats['Percentage_recovered'] = Dist_stats['Recovered']/Dist_stats['Total_Population']*100
Dist_stats['Death/Total'] = Dist_stats['Deceased']/Dist_stats['Total_Population']
Dist_stats['Density'] = Dist_stats['Total_Population']/Dist_stats['Area']

In [21]:
Dist_stats

Unnamed: 0,District,Confirmed,Active,Deceased,Recovered,Zone,literate_population,Illiterate_population,Total_Population,Employed_population,Area,Percentage_affected,Unemployed_population,Unemployment_rate,Percentage_recovered,Death/Total,Density
0,Unassigned,8571,8571,0,0,No data found,7.003470e+05,9.908391e+05,1.008576e+06,7.053651e+05,3266.638136,0.849812,3.032113e+05,30.063294,0.000000,0.000000e+00,308.750568
1,Nicobars,0,0,0,0,Green,2.533200e+04,1.151000e+04,3.684200e+04,1.712500e+04,1841.000000,0.000000,1.971700e+04,53.517724,0.000000,0.000000e+00,20.011950
2,North and Middle Andaman,1,0,0,1,Green,9.853199e+05,1.268614e+06,1.865139e+06,4.266361e+05,1710.605603,0.000054,1.438503e+06,77.125768,0.000054,0.000000e+00,1090.338215
3,South Andaman,32,0,0,32,Red,1.902660e+05,4.787600e+04,2.381420e+05,9.683100e+04,2672.000000,0.013437,1.413110e+05,59.338966,0.013437,0.000000e+00,89.125000
4,Unknown,2,2,0,0,No data found,1.002318e+06,9.389306e+05,1.279594e+06,5.775418e+05,3113.488223,0.000156,7.020520e+05,54.865224,0.000000,0.000000e+00,410.984001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,Purba Bardhaman,127,23,0,104,Orange,9.338437e+05,8.096408e+05,1.908114e+06,4.856720e+05,3595.729111,0.006656,1.422442e+06,74.547013,0.005450,0.000000e+00,530.661263
765,Purba Medinipur,171,90,2,79,Red,3.923194e+06,1.172681e+06,5.095875e+06,1.910320e+06,4713.000000,0.003356,3.185555e+06,62.512424,0.001550,3.924743e-07,1081.238065
766,Purulia,83,41,0,42,Green,9.875560e+05,5.318192e+05,1.746780e+06,7.082233e+05,4805.017077,0.004752,1.038557e+06,59.455503,0.002404,0.000000e+00,363.532626
767,South 24 Parganas,331,204,9,118,Orange,1.507923e+06,1.165038e+06,1.742847e+06,7.085305e+05,4018.311332,0.018992,1.034316e+06,59.346361,0.006771,5.163966e-06,433.726112


In [22]:
Dist_stats.describe()

Unnamed: 0,Confirmed,Active,Deceased,Recovered,literate_population,Illiterate_population,Total_Population,Employed_population,Area,Percentage_affected,Unemployed_population,Unemployment_rate,Percentage_recovered,Death/Total,Density
count,769.0,769.0,769.0,769.0,743.0,721.0,743.0,721.0,721.0,743.0,721.0,721.0,743.0,743.0,721.0
mean,373.724317,179.620286,10.552666,183.517555,1072936.0,775725.9,1695742.0,693138.7,4520.981864,0.025476,1024880.0,56.342967,0.013555,1e-05,625.230992
std,2580.465307,1368.689941,89.528351,1177.91141,851543.7,519861.4,1260142.0,497599.1,3411.111757,0.257541,827749.7,23.846162,0.171169,0.000174,1765.80122
min,0.0,-943.0,0.0,0.0,9990.0,5346.0,21167.0,10501.0,9.0,0.0,-292073.0,-437.067362,0.0,0.0,3.419548
25%,22.0,6.0,0.0,5.0,581537.0,364730.0,920628.3,369083.0,2559.0,0.002054,449186.8,52.639972,0.000464,0.0,230.370445
50%,69.0,26.0,0.0,32.0,922801.9,694048.0,1440361.0,617392.2,3890.0,0.004969,834905.0,59.112805,0.002006,0.0,366.734003
75%,165.0,71.0,2.0,92.0,1276135.0,1129256.0,2093091.0,878898.0,5187.0,0.010554,1388412.0,65.294708,0.005917,1e-06,609.430943
max,52667.0,27116.0,1857.0,23694.0,8227161.0,3047973.0,11060150.0,4492767.0,38401.0,6.552188,6567381.0,94.80565,4.556778,0.004675,26552.754286


In [23]:
for i in range(len(Dist_stats.Unemployment_rate)):
    if Dist_stats.Unemployment_rate[i] < 0:
        Dist_stats.Unemployment_rate[i] = np.random.randn() * 20 + np.mean(Dist_stats.Unemployment_rate)
for i in range(len(Dist_stats.Unemployed_population)):
    if Dist_stats.Unemployed_population[i] < 0:
        Dist_stats.Unemployed_population[i] = np.random.randn() * 400000 + np.mean(Dist_stats.Unemployed_population)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
response3 = requests.get("https://livingatlas.esri.in/server/rest/services/LivingAtlas/IND_Demography/MapServer/4/query?where=1%3D1&outFields=district_name,total_hhs,num_of_hhs_pay_inc_tax_or_proff,num_of_hhs_mihhml_rs_5000,num_of_hhs_having_mihm_bet_rs_5,num_of_hhs_having_mihhm_rs_1000,lowincome,income_less_10k&returnGeometry=false&outSR=4326&f=json")
incomes = response3.json()

In [25]:
incomes

{'displayFieldName': 'state_name',
 'fieldAliases': {'district_name': 'District Name',
  'total_hhs': 'Total Households',
  'num_of_hhs_pay_inc_tax_or_proff': 'Number of Households Pay Income Tax or Professional Tax',
  'num_of_hhs_mihhml_rs_5000': 'Number of Households having Monthly income of highest earning household member Less than Rs. 5,000',
  'num_of_hhs_having_mihm_bet_rs_5': 'Number of Households having Monthly income of highest earning household member Between Rs. 5,000 and Rs 10,000',
  'num_of_hhs_having_mihhm_rs_1000': 'Number of Households having Monthly income of highest earning household member Rs. 10,000 or more',
  'lowincome': 'lowincome',
  'income_less_10k': 'income_less_10k'},
 'fields': [{'name': 'district_name',
   'type': 'esriFieldTypeString',
   'alias': 'District Name',
   'length': 255},
  {'name': 'total_hhs',
   'type': 'esriFieldTypeInteger',
   'alias': 'Total Households'},
  {'name': 'num_of_hhs_pay_inc_tax_or_proff',
   'type': 'esriFieldTypeInteger'

In [26]:
income_data = {}
for i in incomes['features']:
    income_data[i['attributes']['district_name']] = [i['attributes']['total_hhs'], i['attributes']['num_of_hhs_pay_inc_tax_or_proff'],i['attributes']['income_less_10k']]

In [27]:
income_data

{'Nainital': [117469, 10706, 97131],
 'Dehradun': [165649, 27766, 120763],
 'Almora': [125275, 7087, 112164],
 'Champawat': [44150, 2978, 38476],
 'Uttarkashi': [64836, 3359, 57806],
 'Garhwal': [140122, 12929, 116356],
 'Hardwar': [231792, 11110, 207582],
 'Rudraprayag': [48830, 4082, 41534],
 'Tehri Garhwal': [119766, 5598, 108286],
 'Bageshwar': [53420, 3211, 46667],
 'Pithoragarh': [91906, 5596, 75927],
 'Chamoli': [70323, 4508, 59265],
 'Udham Singh Nagar': [206204, 11393, 179937],
 'Una': [97451, 11460, 73986],
 'Solan': [90043, 10734, 71686],
 'Sirmaur': [82214, 4482, 67176],
 'Kinnaur': [18287, 1867, 13828],
 'Mandi': [198952, 17299, 149608],
 'West Godavari': [881322, 20782, 849508],
 'Kullu': [79798, 4984, 66298],
 'Chamba': [92274, 15653, 74642],
 'Shimla': [121586, 10126, 92473],
 'Bilaspur': [None, None, None],
 'Kangra': [309461, 35438, 228617],
 'Lahul & Spiti': [6177, 564, 4433],
 'Hamirpur': [None, None, None],
 'DATA NOT AVAILABLE': [None, None, None],
 'Anantnag': [1

In [28]:
total_households = []
households_paying_incometax = [] 
low_income_households = [] 
for i in Dist_stats['District']:
    if i in income_data.keys():
        total_households.append(income_data[i][0])
        households_paying_incometax.append(income_data[i][1])
        low_income_households.append(income_data[i][2])
    else:
        total_households.append(0)
        households_paying_incometax.append(0)
        low_income_households.append(0)

In [29]:
Dist_stats['Total_no_of_Households'] = total_households
Dist_stats['No_of_Households_payingTax'] = households_paying_incometax
Dist_stats['No_of_lowIncome_households'] = low_income_households

In [30]:
Dist_stats.describe()

Unnamed: 0,Confirmed,Active,Deceased,Recovered,literate_population,Illiterate_population,Total_Population,Employed_population,Area,Percentage_affected,Unemployed_population,Unemployment_rate,Percentage_recovered,Death/Total,Density,Total_no_of_Households,No_of_Households_payingTax,No_of_lowIncome_households
count,769.0,769.0,769.0,769.0,743.0,721.0,743.0,721.0,721.0,743.0,721.0,721.0,743.0,743.0,721.0,717.0,717.0,717.0
mean,373.724317,179.620286,10.552666,183.517555,1072936.0,775725.9,1695742.0,693138.7,4520.981864,0.025476,1038282.0,57.982327,0.013555,1e-05,625.230992,184943.6,7938.926081,169595.8
std,2580.465307,1368.689941,89.528351,1177.91141,851543.7,519861.4,1260142.0,497599.1,3411.111757,0.257541,820653.7,12.176739,0.171169,0.000174,1765.80122,211260.1,10603.714396,196544.6
min,0.0,-943.0,0.0,0.0,9990.0,5346.0,21167.0,10501.0,9.0,0.0,10666.0,1.12861,0.0,0.0,3.419548,0.0,0.0,0.0
25%,22.0,6.0,0.0,5.0,581537.0,364730.0,920628.3,369083.0,2559.0,0.002054,477343.0,52.933831,0.000464,0.0,230.370445,0.0,0.0,0.0
50%,69.0,26.0,0.0,32.0,922801.9,694048.0,1440361.0,617392.2,3890.0,0.004969,844377.0,59.234003,0.002006,0.0,366.734003,128904.0,4390.0,113326.0
75%,165.0,71.0,2.0,92.0,1276135.0,1129256.0,2093091.0,878898.0,5187.0,0.010554,1402105.0,65.392971,0.005917,1e-06,609.430943,292969.0,11866.0,271233.0
max,52667.0,27116.0,1857.0,23694.0,8227161.0,3047973.0,11060150.0,4492767.0,38401.0,6.552188,6567381.0,94.80565,4.556778,0.004675,26552.754286,1192518.0,76100.0,1142482.0


In [31]:
Dist_stats.Total_no_of_Households[Dist_stats.Total_no_of_Households == 0] = abs(np.random.randn() * 120000 +abs(np.mean(Dist_stats.Total_no_of_Households)))
Dist_stats.No_of_Households_payingTax[Dist_stats.No_of_Households_payingTax == 0] = abs(np.random.randn() * 6500 + abs(np.mean(Dist_stats.No_of_Households_payingTax)))
Dist_stats.No_of_lowIncome_households[Dist_stats.No_of_lowIncome_households == 0] = abs(np.random.randn() *120000 + abs(np.mean(Dist_stats.No_of_lowIncome_households)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [57]:
Dist_stats.describe()

Unnamed: 0,Confirmed,Active,Deceased,Recovered,literate_population,Illiterate_population,Total_Population,Employed_population,Area,Percentage_affected,Unemployed_population,Unemployment_rate,Percentage_recovered,Death/Total,Density,Total_no_of_Households,No_of_Households_payingTax,No_of_lowIncome_households
count,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0
mean,282.28187,132.065156,7.851275,142.328612,1068794.0,774965.8,1693357.0,683364.5,4541.582046,0.023262,1023678.0,57.915589,0.01265,9e-06,488.796312,240238.5,10155.436232,239154.2
std,1553.209069,875.327529,60.634884,708.21865,810476.7,519869.5,1228369.0,479839.5,3400.263487,0.25566,797877.2,12.274805,0.172992,0.000177,522.338189,171639.4,9220.675793,155433.6
min,0.0,-943.0,0.0,0.0,9990.0,10461.0,21167.0,10501.0,30.0,0.0,10666.0,1.12861,0.0,0.0,3.419548,4179.0,94.0,3377.0
25%,26.0,7.0,0.0,6.0,605590.1,364615.2,942778.8,367404.5,2603.065129,0.002191,475353.4,52.744239,0.000539,0.0,228.50748,172478.1,6201.5,169116.8
50%,74.0,27.0,0.0,34.0,926844.5,687740.2,1456658.0,612548.5,3899.0,0.005064,839732.1,59.216069,0.002145,0.0,366.533448,172478.1,6887.186869,216450.3
75%,171.75,73.0,2.0,97.5,1273325.0,1128131.0,2092289.0,874712.0,5186.280159,0.010721,1376012.0,65.390223,0.006073,1e-06,601.027443,292788.5,11847.25,270000.2
max,32022.0,18794.0,1092.0,12245.0,8227161.0,3047973.0,11060150.0,4492767.0,38401.0,6.552188,6567381.0,94.80565,4.556778,0.004675,9258.333333,1192518.0,76100.0,1142482.0


In [33]:
Dist_stats.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,Confirmed,Active,Deceased,Recovered,literate_population,Illiterate_population,Total_Population,Employed_population,Area,Percentage_affected,Unemployed_population,Unemployment_rate,Percentage_recovered,Death/Total,Density,Total_no_of_Households,No_of_Households_payingTax,No_of_lowIncome_households
Confirmed,1.0,0.982083,0.923241,0.979399,0.23067,0.0611152,0.186922,0.186803,-0.018333,0.524445,0.190131,0.0272394,0.409106,0.353333,0.514958,0.0756714,0.154051,0.0761497
Active,0.982083,1.0,0.861261,0.924038,0.205719,0.04723,0.166716,0.163411,-0.0301514,0.429826,0.163265,0.0224966,0.29191,0.23597,0.496353,0.0509146,0.121008,0.0548864
Deceased,0.923241,0.861261,1.0,0.945799,0.202056,0.0602974,0.158554,0.163552,-0.00905888,0.691225,0.180084,0.0276284,0.611741,0.573504,0.409448,0.0608799,0.127529,0.0613756
Recovered,0.979399,0.924038,0.945799,1.0,0.250943,0.0744351,0.203732,0.206931,-0.00441909,0.596917,0.213135,0.0314269,0.510536,0.456259,0.520248,0.0978347,0.177361,0.0939389
literate_population,0.23067,0.205719,0.202056,0.250943,1.0,0.643664,0.949135,0.9258,0.257908,0.0157682,0.90563,0.128354,0.0100517,0.000778602,0.346156,0.729421,0.651354,0.714534
Illiterate_population,0.0611152,0.04723,0.0602974,0.0744351,0.643664,1.0,0.745014,0.705382,0.237854,0.0201625,0.73652,0.104229,0.025277,0.0261301,0.0943245,0.62025,0.277521,0.688033
Total_Population,0.186922,0.166716,0.158554,0.203732,0.949135,0.745014,1.0,0.931562,0.291037,-0.0152603,0.969218,0.221871,-0.0194071,-0.0257028,0.293994,0.786513,0.587112,0.774458
Employed_population,0.186803,0.163411,0.163552,0.206931,0.9258,0.705382,0.931562,1.0,0.392739,-0.00171107,0.835468,-0.0765094,-0.00453699,-0.0107952,0.258723,0.784147,0.59498,0.78425
Area,-0.018333,-0.0301514,-0.00905888,-0.00441909,0.257908,0.237854,0.291037,0.392739,1.0,-0.0514586,0.207235,-0.14665,-0.0432392,-0.0403866,-0.184313,0.348142,0.178025,0.33975
Percentage_affected,0.524445,0.429826,0.691225,0.596917,0.0157682,0.0201625,-0.0152603,-0.00171107,-0.0514586,1.0,0.0523258,-0.00931794,0.982938,0.973252,0.135014,-0.0222036,-0.0101289,-0.0108856


In [55]:
count = 0 
for i in Dist_stats.Confirmed:
    count += i 
print(count)

199291


In [56]:
Dist_stats.dropna(inplace = True)
Y = Dist_stats['Percentage_affected']

In [36]:
X = Dist_stats[['Illiterate_population', 'Unemployment_rate', 'No_of_lowIncome_households','No_of_Households_payingTax','Density']]

In [37]:
Dist_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 706 entries, 0 to 768
Data columns (total 20 columns):
District                      706 non-null object
Confirmed                     706 non-null int64
Active                        706 non-null int64
Deceased                      706 non-null int64
Recovered                     706 non-null int64
Zone                          706 non-null object
literate_population           706 non-null float64
Illiterate_population         706 non-null float64
Total_Population              706 non-null float64
Employed_population           706 non-null float64
Area                          706 non-null float64
Percentage_affected           706 non-null float64
Unemployed_population         706 non-null float64
Unemployment_rate             706 non-null float64
Percentage_recovered          706 non-null float64
Death/Total                   706 non-null float64
Density                       706 non-null float64
Total_no_of_Households        706 non-

In [38]:
from scipy import stats 
from sklearn.preprocessing import MinMaxScaler

In [39]:
X_transformed = X.copy()
features = list(X.columns)

In [40]:
for i in features:
    X_transformed[i] = stats.boxcox(X[i])[0]

In [41]:
scaler = MinMaxScaler()
X_transformed[list(X_transformed.columns)] = scaler.fit_transform(X_transformed[list(X_transformed.columns)])

In [42]:
X_transformed

Unnamed: 0,Illiterate_population,Unemployment_rate,No_of_lowIncome_households,No_of_Households_payingTax,Density
0,0.536858,0.092836,0.398774,0.463435,0.347464
1,0.002896,0.306319,0.025225,0.271863,0.094976
2,0.617158,0.652458,0.398774,0.463435,0.531387
3,0.068101,0.379285,0.086932,0.499629,0.212617
4,0.520682,0.322494,0.398774,0.463435,0.384503
...,...,...,...,...,...
764,0.478383,0.608128,0.398774,0.463435,0.419777
765,0.590497,0.422459,0.928569,0.975298,0.529976
766,0.374480,0.380828,0.398774,0.463435,0.368310
767,0.588327,0.379383,0.398774,0.463435,0.391761


In [43]:
from sklearn.model_selection import train_test_split
XT_train, XT_test, YT_train, YT_test = train_test_split(X_transformed, Y, test_size=0.3, random_state=42)

In [44]:
from sklearn.linear_model import LinearRegression
rgr = LinearRegression(normalize=True)
rgr.fit(XT_train, YT_train)
print("model:", rgr)
print("coeffs:", rgr.coef_)
print("intercept:", rgr.intercept_)
YT_predictions = rgr.predict(XT_test)

model: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
coeffs: [ 0.15859382  0.00372073 -0.13994731  0.0510649  -0.14606315]
intercept: 0.043567168354550455


In [45]:
print(YT_predictions)

[ 0.0299107   0.00865513  0.02956544  0.01928176  0.01282003  0.01712347
  0.00887089  0.03711786  0.02332282  0.0333672   0.02299899  0.05116167
  0.02963868  0.02434509  0.02589209 -0.05698262  0.03268489  0.01889663
  0.03169923  0.03370576  0.02494927  0.03785631  0.00231864  0.00885704
  0.00799495  0.02908732  0.02353744  0.01379478 -0.00243829  0.01297356
  0.0180953   0.01784514  0.02249698  0.02367924  0.02551631  0.046732
  0.0180589   0.01465684  0.03431419  0.03945919  0.01840064  0.01140873
  0.03377212  0.03811802  0.02440681  0.01279066  0.03676859 -0.00819753
  0.05304668  0.0091566   0.0219219   0.06235091  0.03672583  0.03537445
  0.02626115  0.00664657  0.03588789  0.00348304  0.01909614  0.00961403
  0.032897    0.0171539   0.04808831  0.0333672  -0.02881103  0.02588953
  0.00782785  0.06084616  0.02801247  0.01468756  0.00483483  0.06753982
  0.03046444  0.08656186  0.04405766  0.03120081  0.05542116  0.01297439
  0.01103998  0.02058157  0.02867333  0.03287191  0.0

In [46]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
print(mean_squared_error(YT_test, YT_predictions))
print(r2_score(YT_test, YT_predictions))

0.0011510650514494675
-1.6917163171920695


In [47]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0,splitter='random',criterion='mse',max_depth=None)
regressor.fit(XT_train, YT_train)
YT_predictions2 = regressor.predict(XT_test)

In [48]:
YT_predictions2

array([2.99711444e-03, 6.38222563e-03, 8.38841727e-04, 1.42904735e-02,
       3.83689739e-03, 7.43474156e-04, 4.26042057e-03, 4.28439593e-02,
       3.53963506e-03, 2.64778440e-02, 7.96288551e-03, 4.99167279e-03,
       5.69715168e-02, 1.95261524e-03, 6.94636581e-03, 0.00000000e+00,
       4.06925878e-03, 3.09933286e-03, 3.44427132e-03, 3.48578500e-02,
       1.07628228e-03, 7.57378380e-03, 6.69389517e-03, 2.36872525e-03,
       1.67977894e-02, 1.25969635e-02, 3.41633509e-03, 2.98062593e-03,
       7.66088300e-03, 7.83231024e-03, 7.61295146e-05, 1.14851821e-02,
       1.14412306e-03, 7.61295146e-05, 6.99383843e-03, 1.12182162e-01,
       2.99711444e-03, 3.49882936e-02, 1.58174277e-02, 1.05014600e-02,
       6.94636581e-03, 5.00699589e-03, 1.99534420e-02, 5.22211099e-03,
       7.43474156e-04, 7.51972260e-03, 5.75872103e-05, 9.98891231e-04,
       8.66194379e-04, 1.13331712e-02, 2.54209067e-03, 3.73816516e-04,
       3.83517280e-02, 1.72743859e-02, 1.37565074e-03, 9.51357017e-03,
      

In [49]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
print(mean_squared_error(YT_test, YT_predictions2))
print(r2_score(YT_test, YT_predictions2))

0.4167234854046546
-973.4900203584223


In [50]:
from sklearn.neural_network import MLPRegressor
model = MLPRegressor(hidden_layer_sizes=(30, 30,30), random_state=42, alpha = 0.0005,solver='sgd',verbose=True, learning_rate='adaptive')
model.fit(XT_train, YT_train)
YT_predictions3 = model.predict(XT_test)

Iteration 1, loss = 0.08304635
Iteration 2, loss = 0.07906074
Iteration 3, loss = 0.07326817
Iteration 4, loss = 0.06733277
Iteration 5, loss = 0.06186348
Iteration 6, loss = 0.05715939
Iteration 7, loss = 0.05360438
Iteration 8, loss = 0.05106513
Iteration 9, loss = 0.04928593
Iteration 10, loss = 0.04824362
Iteration 11, loss = 0.04773600
Iteration 12, loss = 0.04743674
Iteration 13, loss = 0.04735944
Iteration 14, loss = 0.04731484
Iteration 15, loss = 0.04737446
Iteration 16, loss = 0.04741826
Iteration 17, loss = 0.04743443
Iteration 18, loss = 0.04744975
Iteration 19, loss = 0.04743716
Iteration 20, loss = 0.04741247
Iteration 21, loss = 0.04740291
Iteration 22, loss = 0.04736129
Iteration 23, loss = 0.04734807
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Setting learning rate to 0.000200
Iteration 24, loss = 0.04732096
Iteration 25, loss = 0.04731314
Iteration 26, loss = 0.04731768
Iteration 27, loss = 0.04730945
Iteration 28, loss = 0.04730645

In [51]:
YT_predictions3

array([ 0.05519917, -0.02587141,  0.0557545 , -0.01142484,  0.02235074,
        0.04138854, -0.00262676,  0.11952043,  0.00568972, -0.02306218,
        0.00461498,  0.06624984,  0.03272092,  0.09278982,  0.02536142,
       -0.1154343 ,  0.05094822,  0.0477542 ,  0.04369865,  0.0612414 ,
        0.05518144,  0.08220433, -0.038085  ,  0.02855787, -0.05596709,
        0.03290848,  0.00483011,  0.01318567, -0.03223114, -0.01706574,
        0.05222521,  0.02520233,  0.05936062,  0.02798943, -0.00736576,
        0.03269827,  0.07797933,  0.04003785,  0.03105027,  0.00503888,
       -0.00865551,  0.06014145,  0.01711671,  0.0857384 ,  0.10155808,
        0.0501656 , -0.04404813, -0.05356226,  0.04369824,  0.05715479,
        0.0039144 , -0.01143502,  0.05406705,  0.01034521,  0.08625439,
       -0.0510517 ,  0.07925152,  0.02340494, -0.04297293, -0.02404508,
       -0.01473785,  0.06196695, -0.0072979 , -0.02306218, -0.0390173 ,
        0.01203916, -0.05482264,  0.0905656 ,  0.01608409, -0.02

In [53]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
print(mean_squared_error(YT_test, YT_predictions3))
print(r2_score(YT_test, YT_predictions3))

0.003070568350491753
-6.180392560493817
