# For every state, find the date on which the entire population will have received at least one dose of vaccination

In [1]:
import numpy as np
import pandas as pd
import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# from datetime import datetime
# dateparse = lambda x: datetime.strptime(x, '%d-%m-%Y')

In [3]:
# Load the district-level dose counts; the 'Date' column is parsed to datetime on read.
df = pd.read_csv(
    r'firstandseconddoselatestnew1.csv',
    parse_dates=['Date'],
)

In [4]:
# Show the last five rows to sanity-check the load and see the most recent date in the data.
df.tail()

Unnamed: 0,State,District,District_Key,dose1,dose2,Date
153814,West Bengal,Uttar Dinajpur,WB_Uttar Dinajpur,510603.0,166508.0,2021-08-10
153815,West Bengal,Uttar Dinajpur,WB_Uttar Dinajpur,513480.0,167422.0,2021-08-11
153816,West Bengal,Uttar Dinajpur,WB_Uttar Dinajpur,518137.0,168836.0,2021-08-12
153817,West Bengal,Uttar Dinajpur,WB_Uttar Dinajpur,529876.0,171224.0,2021-08-13
153818,West Bengal,Uttar Dinajpur,WB_Uttar Dinajpur,540958.0,173055.0,2021-08-14


In [5]:
# Confirm the column dtypes; 'Date' must be datetime64 for the date comparisons below.
df.dtypes

State                   object
District                object
District_Key            object
dose1                  float64
dose2                  float64
Date            datetime64[ns]
dtype: object

## FINDING RATE OF VACCINATION FOR EVERY STATE

In [6]:
# Snapshot of every district on 2021-08-07 (start of the one-week window).
# `.index` yields index *labels*, so the rows are selected with label-based `.loc`.
# Passing labels to the positional `.iloc` is only correct while df happens to have a
# default 0..n-1 RangeIndex and would pick the wrong rows after any filter or sort.
ind1 = df.index[df['Date'] == '2021-08-07']
without_lastweek = df.loc[ind1]

In [7]:
# Snapshot of every district on 2021-08-14 (end of the one-week window).
# Label-based `.loc` is used because `.index` returns labels, not positions; `.iloc`
# with labels is only correct on a default RangeIndex.
ind2 = df.index[df['Date'] == '2021-08-14']
lastweek = df.loc[ind2]

In [8]:
# without_lastweek
# lastweek

In [9]:
# Per-state dose totals at both ends of the window (districts summed within each state).
# The string alias 'sum' is used instead of the Python builtin `sum`: passing the builtin
# to groupby aggregation is deprecated in recent pandas (FutureWarning) and will stop
# being mapped to the groupby sum implementation.
df1 = without_lastweek.groupby('State').aggregate({'dose1': 'sum', 'dose2': 'sum'})  # totals on the 7th
df2 = lastweek.groupby('State').aggregate({'dose1': 'sum', 'dose2': 'sum'})          # totals on the 14th

In [10]:
# Suffix the columns so both snapshots can sit side by side in one frame:
#   *_x -> totals on 2021-08-07, *_y -> totals on 2021-08-14
# (dose1_y is therefore the number of people who have received dose 1 as of the 14th).
df1 = df1.rename(columns={'dose1': 'dose1_x', 'dose2': 'dose2_x'})
df2 = df2.rename(columns={'dose1': 'dose1_y', 'dose2': 'dose2_y'})

In [11]:
# Preview the per-state totals for the 7th.
df1.head()

Unnamed: 0_level_0,dose1_x,dose2_x
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Andaman and Nicobar Islands,215021.0,94249.0
Andhra Pradesh,17477120.0,6140096.0
Arunachal Pradesh,9328395000.0,259157484.0
Assam,10189220.0,2150197.0
Bihar,22945520.0,4399665.0


In [12]:
# Preview the per-state totals for the 14th.
df2.head()

Unnamed: 0_level_0,dose1_y,dose2_y
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Andaman and Nicobar Islands,232709.0,97287.0
Andhra Pradesh,18516140.0,6485212.0
Arunachal Pradesh,9501806000.0,286121142.0
Assam,11430050.0,2442900.0
Bihar,25167380.0,4859267.0


In [13]:
# Place the two snapshots side by side, aligned on the shared 'State' index.
newdf = pd.concat((df1, df2), axis='columns')

In [14]:
# Column layout after the concat: the *_x pair (7th) followed by the *_y pair (14th).
newdf.columns    #columns (on 7th, on 14th)

Index(['dose1_x', 'dose2_x', 'dose1_y', 'dose2_y'], dtype='object')

In [15]:
# Average doses per day over the 7 days between the two snapshots (the vaccination rate).
newdf['dose1'] = newdf['dose1_y'].sub(newdf['dose1_x']).div(7)
newdf['dose2'] = newdf['dose2_y'].sub(newdf['dose2_x']).div(7)

In [16]:
# Turn the 'State' index back into an ordinary first column.
newdf = newdf.reset_index()

In [17]:
# Preview the snapshots together with the computed daily rates.
newdf.head()

Unnamed: 0,State,dose1_x,dose2_x,dose1_y,dose2_y,dose1,dose2
0,Andaman and Nicobar Islands,215021.0,94249.0,232709.0,97287.0,2526.857,434.0
1,Andhra Pradesh,17477120.0,6140096.0,18516140.0,6485212.0,148431.1,49302.29
2,Arunachal Pradesh,9328395000.0,259157484.0,9501806000.0,286121142.0,24773030.0,3851951.0
3,Assam,10189220.0,2150197.0,11430050.0,2442900.0,177262.3,41814.71
4,Bihar,22945520.0,4399665.0,25167380.0,4859267.0,317408.4,65657.43


In [18]:
# Round both daily rates up to whole doses per day.
for rate_col in ('dose1', 'dose2'):
    newdf[rate_col] = np.ceil(newdf[rate_col])

In [19]:
# Fold the newer administrative units into their parent so the rows can be matched
# against the census names: Telangana -> Andhra Pradesh, Ladakh -> Jammu and Kashmir.
# Rows are located by state name instead of hard-coded positions (1/31 and 13/17), which
# would silently add the wrong rows if the set or order of states ever changed.
# NOTE: this cell is not idempotent; re-running it adds the child rows a second time.
value_cols = newdf.columns.drop('State')
for parent, child in (('Andhra Pradesh', 'Telangana'), ('Jammu and Kashmir', 'Ladakh')):
    parent_rows = newdf.index[newdf['State'] == parent]
    child_rows = newdf.index[newdf['State'] == child]
    assert len(parent_rows) == 1 and len(child_rows) == 1, (
        f'expected exactly one row each for {parent!r} and {child!r}'
    )
    newdf.loc[parent_rows[0], value_cols] = (
        newdf.loc[parent_rows[0], value_cols] + newdf.loc[child_rows[0], value_cols]
    )
# The Telangana and Ladakh rows are left in place.

In [20]:
#need not drop TELANGANA and Ladakh as they will not be calculated during merge with census

In [21]:
# Scratch copy of newdf for experimentation; do not use it for results.
# It is not referenced by the merge or export cells shown later in this notebook.
newdf2 = newdf.copy()

In [22]:
# Scratch lookups on the copy. Despite the variable names (ts / ap), these select the
# Ladakh and Jammu and Kashmir rows, not Telangana and Andhra Pradesh.
# Neither variable is used by the later cells shown in this notebook.
ts = newdf2[newdf2['State']=='Ladakh'].values
ap = newdf2[newdf2['State']=='Jammu and Kashmir'].values

In [23]:
# Scratch only: adds row 17 into row 13 of the copy. These are the same positions used
# above for Ladakh and Jammu and Kashmir, not Andhra Pradesh and Telangana.
# newdf2 was copied after newdf had already been combined, so on the copy this adds the
# Ladakh row a second time. The real pipeline continues from newdf, not newdf2.
newdf2.iloc[13,1:] = newdf2.iloc[13,1:]+newdf2.iloc[17,1:]

In [24]:
# Check the combined result: the Andhra Pradesh row now includes the added row's figures.
newdf.head()

Unnamed: 0,State,dose1_x,dose2_x,dose1_y,dose2_y,dose1,dose2
0,Andaman and Nicobar Islands,215021.0,94249.0,232709.0,97287.0,2527.0,434.0
1,Andhra Pradesh,29066610.0,10033874.0,30589830.0,10637326.0,217604.0,86209.0
2,Arunachal Pradesh,9328395000.0,259157484.0,9501806000.0,286121142.0,24773032.0,3851952.0
3,Assam,10189220.0,2150197.0,11430050.0,2442900.0,177263.0,41815.0
4,Bihar,22945520.0,4399665.0,25167380.0,4859267.0,317409.0,65658.0


## CENSUSDATA PREPROCESSING


In [25]:
# Read only the four columns needed from the census workbook: administrative level,
# area name, the Total/Rural/Urban flag and the population count.
censuscols = ['Level', 'Name', 'TRU', 'TOT_P']
census_data = pd.read_excel(
    r'DDW_PCA0000_2011_Indiastatedist.xlsx',
    usecols=censuscols,
)

In [26]:
# Preview the raw census rows (each area appears as Total, Rural and Urban).
census_data.head()

Unnamed: 0,Level,Name,TRU,TOT_P
0,India,India,Total,1210854977
1,India,India,Rural,833748852
2,India,India,Urban,377106125
3,STATE,JAMMU & KASHMIR,Total,12541302
4,STATE,JAMMU & KASHMIR,Rural,9108060


In [27]:
# Remove the Rural and Urban breakdown rows, leaving the remaining (Total) rows.
a = census_data.index[census_data['TRU'].isin(['Rural', 'Urban'])]
census_data = census_data.drop(index=a)

In [28]:
# Keep state-level rows only (drops the all-India row and any other levels).
census_data = census_data.loc[census_data['Level'].eq('STATE')]

In [29]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
census_data = census_data.reset_index(drop=True)

In [30]:
# Preview the state-level totals after filtering.
census_data.head()

Unnamed: 0,Level,Name,TRU,TOT_P
0,STATE,JAMMU & KASHMIR,Total,12541302
1,STATE,HIMACHAL PRADESH,Total,6864602
2,STATE,PUNJAB,Total,27743338
3,STATE,CHANDIGARH,Total,1055450
4,STATE,UTTARAKHAND,Total,10086292


In [31]:
# Look up the two union-territory rows that are combined in the next step.
print(census_data[census_data['Name']=='DADRA & NAGAR HAVELI'].values)
census_data[census_data['Name']=='DAMAN & DIU'].values

[['STATE' 'DADRA & NAGAR HAVELI' 'Total' 343709]]


array([['STATE', 'DAMAN & DIU', 'Total', 243247]], dtype=object)

In [32]:
# Merge the census figures of 'DADRA & NAGAR HAVELI' and 'DAMAN & DIU' into one row
# named 'Dadra and Nagar Haveli and Daman and Diu'.
# Rows are found by name rather than by hard-coded positions 24/25, and the new name is
# assigned as a plain string (the original assigned a one-element list to a single cell).
dnh_mask = census_data['Name'] == 'DADRA & NAGAR HAVELI'
dd_mask = census_data['Name'] == 'DAMAN & DIU'
assert dnh_mask.sum() == 1 and dd_mask.sum() == 1, (
    'expected exactly one row per territory (has this cell already been run?)'
)
census_data.loc[dnh_mask, 'TOT_P'] = (
    census_data.loc[dnh_mask, 'TOT_P'].iloc[0] + census_data.loc[dd_mask, 'TOT_P'].iloc[0]
)
census_data.loc[dnh_mask, 'Name'] = 'Dadra and Nagar Haveli and Daman and Diu'  # changing name
# The standalone 'DAMAN & DIU' row is deliberately left in place.
# NOTE(review): presumably it drops out at the merge because the vaccination data has no
# state of that name; verify.

In [33]:
# Verify the renamed cell (row position 25, 'Name' column).
census_data.iloc[25,1]

'Dadra and Nagar Haveli and Daman and Diu'

MERGING VACCINE DATA AND CENSUS DATA

In [34]:
# Lower-case the state names in both tables so the join is case-insensitive.
for frame, name_col in ((newdf, 'State'), (census_data, 'Name')):
    frame[name_col] = frame[name_col].str.lower()

In [35]:
# Align census spellings with the vaccination data: 'nct of delhi' -> 'delhi', '&' -> 'and'.
# The result is assigned back to the column. The original called
# `replace(..., inplace=True)` on a column selection, which is chained assignment: it
# warns in pandas 2.x and is a silent no-op once copy-on-write is enabled.
# `regex=False` makes the '&' replacement an explicit literal substitution.
census_data['Name'] = (
    census_data['Name']
    .replace('nct of delhi', 'delhi')
    .str.replace('&', 'and', regex=False)
)

In [36]:
# Alphabetically sorted copy of the census rows, renumbered from 0.
# It is not referenced by the later cells shown in this notebook.
t = census_data.sort_values(by='Name').reset_index(drop=True)

In [37]:
# Preview the normalised census names (lower-case, '&' spelled out as 'and').
census_data.head()

Unnamed: 0,Level,Name,TRU,TOT_P
0,STATE,jammu and kashmir,Total,12541302
1,STATE,himachal pradesh,Total,6864602
2,STATE,punjab,Total,27743338
3,STATE,chandigarh,Total,1055450
4,STATE,uttarakhand,Total,10086292


In [38]:
# temp = pd.concat([newdf['State'],census_data['Name']],axis=1)

In [39]:
# Give the census name column the same label as the vaccination frame's join key.
census_data = census_data.rename(columns={'Name': 'State'})

In [40]:
# Inner join on the state name: only states present in both tables are kept.
# The key is listed once here; naming it twice adds nothing.
finaldf = newdf.merge(census_data, how='inner', on='State')

In [41]:
# Columns available after the merge: vaccination figures followed by the census fields.
finaldf.columns

Index(['State', 'dose1_x', 'dose2_x', 'dose1_y', 'dose2_y', 'dose1', 'dose2',
       'Level', 'TRU', 'TOT_P'],
      dtype='object')

In [42]:
# Population still without a first dose = census total - first doses given as of the 14th.
# NOTE(review): the census workbook's file name suggests 2011 figures, so this can come
# out negative where recorded doses exceed that population; confirm how to treat those.
finaldf['populationleft'] = finaldf['TOT_P'].sub(finaldf['dose1_y'])
# nod = number of days needed to cover that population at the current daily first-dose rate.
finaldf['nod'] = finaldf['populationleft'].div(finaldf['dose1'])

In [43]:
# Round the day count up to whole days.
finaldf['nod'] = np.ceil(finaldf['nod'])

In [44]:
# Working table for the prediction: state, people left, daily first-dose rate, days needed.
# `.copy()` makes this an independent frame. Later cells add and overwrite columns on
# it, and doing that on a bare slice of finaldf raises SettingWithCopyWarning (hidden
# here by the global warnings filter) with no guarantee the write lands where intended.
finaldf2 = finaldf[['State', 'populationleft', 'dose1', 'nod']].copy()

PREDICTING THE DATE OF COMPLETE FIRST-DOSE VACCINATION FOR ALL STATES

In [45]:
# End date of the observed data; predicted dates are offsets from this day.
lastdate = pd.Timestamp('2021-08-14')

In [46]:
# Predicted completion date = last observed date + number of days still needed.
days_remaining = pd.to_timedelta(finaldf2['nod'], unit='d')
finaldf2['date'] = lastdate + days_remaining

In [47]:
# Preview the predictions; a negative 'nod' yields a date before the last observed day.
finaldf2.head()

Unnamed: 0,State,populationleft,dose1,nod,date
0,andaman and nicobar islands,147872.0,2527.0,59.0,2021-10-12
1,andhra pradesh,53990950.0,217604.0,249.0,2022-04-20
2,arunachal pradesh,-9500422000.0,24773032.0,-383.0,2020-07-27
3,assam,19775520.0,177263.0,112.0,2021-12-04
4,bihar,78932070.0,317409.0,249.0,2022-04-20


In [48]:
# Switch to the output column names: the daily first-dose rate and the state identifier.
finaldf2 = finaldf2.rename(
    columns={'dose1': 'rateof vaccination', 'State': 'stateid'},
)

In [49]:
# A negative day count means the recorded first doses already exceed the census
# population, so the projected date would fall in the past. For those states the last
# date of the analysis is reported instead. In this run they are Arunachal Pradesh,
# Dadra and Nagar Haveli and Daman and Diu, and Gujarat.
# Rows are selected by condition rather than by hard-coded positions 2, 7 and 10, so the
# fix still reaches the right states if the row order or the affected set changes.
finaldf2.loc[finaldf2['nod'] < 0, 'date'] = lastdate




# census_data.iloc[25,-1] = census_data.iloc[24,-1] + census_data.iloc[25,-1]

In [50]:
# Confirm the column names after the rename.
finaldf2.columns

Index(['stateid', 'populationleft', 'rateof vaccination', 'nod', 'date'], dtype='object')

In [51]:
# Keep only the columns written to the output file. `.copy()` makes this an independent
# frame, so the later stateid re-mapping does not write into a slice of the previous
# frame (a SettingWithCopyWarning that the global warnings filter was hiding).
# NOTE(review): 'nod' and 'date' are dropped here, so the predicted dates are not
# exported. Confirm that this is intended.
finaldf2 = finaldf2[['stateid', 'populationleft', 'rateof vaccination']].copy()

In [52]:
# Lookup table: lower-case state / union-territory name -> two-letter state id.
name_to_id = {
    'state unassigned': 'UN',
    'andaman and nicobar islands': 'AN',
    'andhra pradesh': 'AP',
    'arunachal pradesh': 'AR',
    'assam': 'AS',
    'bihar': 'BR',
    'chandigarh': 'CH',
    'chhattisgarh': 'CT',
    'delhi': 'DL',
    'dadra and nagar haveli and daman and diu': 'DN',
    'goa': 'GA',
    'gujarat': 'GJ',
    'himachal pradesh': 'HP',
    'haryana': 'HR',
    'jharkhand': 'JH',
    'jammu and kashmir': 'JK',
    'karnataka': 'KA',
    'kerala': 'KL',
    'ladakh': 'LA',
    'lakshadweep': 'LD',
    'maharashtra': 'MH',
    'meghalaya': 'ML',
    'manipur': 'MN',
    'madhya pradesh': 'MP',
    'mizoram': 'MZ',
    'nagaland': 'NL',
    'odisha': 'OR',
    'punjab': 'PB',
    'puducherry': 'PY',
    'rajasthan': 'RJ',
    'sikkim': 'SK',
    'telangana': 'TG',
    'tamil nadu': 'TN',
    'tripura': 'TR',
    'uttar pradesh': 'UP',
    'uttarakhand': 'UT',
    'west bengal': 'WB',
}

In [53]:
# comment -->I was informed very late so i have done this type of mapping of statename to state.

In [54]:
# Replace each lower-case state name with its two-letter id via the lookup table;
# names missing from the table become NaN.
finaldf2 = finaldf2.assign(stateid=finaldf2['stateid'].map(name_to_id))

In [55]:
# Write the final table to disk without the row index.
finaldf2.to_csv(
    'complete-vaccination.csv',
    index=False,
)