<a href="https://colab.research.google.com/github/geande/covid-19-predictor/blob/main/dataCollection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas
import jax.numpy as np
import matplotlib.pyplot as plt

In [None]:
# File names of the per-state history CSVs, in alphabetical order by state.
# Despite its name, csvDictionary is a plain list; it is iterated by position,
# so the order here matters.
csvDictionary = [
    slug + '-history.csv'
    for slug in (
        'alabama', 'alaska', 'arizona', 'arkansas', 'california',
        'colorado', 'connecticut', 'delaware', 'florida', 'georgia',
        'hawaii', 'idaho', 'illinois', 'indiana', 'iowa',
        'kansas', 'kentucky', 'louisiana', 'maine', 'maryland',
        'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri',
        'montana', 'nebraska', 'nevada', 'new-hampshire', 'new-jersey',
        'new-mexico', 'new-york', 'north-carolina', 'north-dakota', 'ohio',
        'oklahoma', 'oregon', 'pennsylvania', 'rhode-island', 'south-carolina',
        'south-dakota', 'tennessee', 'texas', 'utah', 'vermont',
        'virginia', 'washington', 'west-virginia', 'wisconsin', 'wyoming',
    )
]

In [None]:
# Load the 2018 population of each state, keyed by state name.
pops = pandas.read_csv('/State Populations.csv', index_col=['State'], usecols=['State', '2018 Population'])

# Order rows alphabetically by state name. The result is deliberately NOT called
# `sorted` (nor is its iterator called `iter`): rebinding those names at notebook
# scope would break every later call to the builtins sorted() and iter().
sorted_pops = pops.sort_index()

# Parallel lists: populations[k] is the population of state_names[k].
populations = []
state_names = []

for state_name, row in sorted_pops.iterrows():
  # The District of Columbia is skipped; csvDictionary has no entry for it.
  # NOTE(review): any other non-state row in the CSV (e.g. a territory) would
  # shift these lists relative to csvDictionary -- confirm the file's contents.
  if state_name != 'District of Columbia':
    population = row['2018 Population']
    populations.append(population)
    state_names.append(state_name)

In [None]:
def _reversed_nonnegative(frame, column):
  """Return `frame[column]` as a list in reversed row order with invalid entries zeroed.

  Any entry that is not >= 0 (negative values as well as NaN / missing values)
  is replaced by 0.
  """
  values = frame[column]
  cleaned = values.where(values >= 0, 0).tolist()
  # Presumably the download lists the newest day first, so reversing yields
  # oldest-first order -- confirm against the CSV.
  cleaned.reverse()
  return cleaned


def state_basis(state_history, population, state_name):
  """Download one state's history CSV, build its epidemic time series, and plot them.

  Args:
    state_history: CSV file name appended to the covidtracking.com download
      URL, e.g. 'alabama-history.csv'.
    population: Total population N of the state; used to derive the
      susceptible count.
    state_name: Display name used in the plot titles.

  Returns:
    An array of five stacked rows, in this order: susceptible, currently
    infected, recovered, deceased, cumulative caseload. Every row holds
    absolute head counts, one entry per CSV row.

  Side effects:
    Prints the download URL and creates a new matplotlib figure with two
    subplots (the figure is neither shown nor closed here).
  """
  path = 'https://covidtracking.com/data/download/'
  dat = path + state_history
  print(dat)
  # Only the columns that are actually used below are read.
  txt = pandas.read_csv(dat, index_col=['date'], usecols=['date', 'positive', 'death', 'recovered'])
  N = population

  # Number of leading days to drop as a basic filter for data accuracy (a lot of
  # states have really poor data in the early onset of Covid-19). 0 keeps everything.
  lockdown_approx = 0

  # Cleaned per-day series for each tracked field.
  positive = _reversed_nonnegative(txt, 'positive')
  death = _reversed_nonnegative(txt, 'death')
  recovered = _reversed_nonnegative(txt, 'recovered')

  Infected_data = np.array(positive[lockdown_approx:])  # cumulative positive cases

  Deceased_data = np.array(death[lockdown_approx:])  # cumulative deaths

  Recovered_data = np.array(recovered[lockdown_approx:])  # cumulative recoveries

  Removed_data = np.add(Recovered_data, Deceased_data)  # recovered + deceased

  # Cases that are neither recovered nor deceased.
  CurrentlyInfected_data = np.subtract(Infected_data, Removed_data)

  CumulativeCaseload_data = Infected_data  # cumulative caseload (a count, not a fraction)

  # Everyone who has not yet tested positive (a count, not a fraction).
  Susceptible_data = N - CumulativeCaseload_data

  data_Collected = np.vstack((Susceptible_data, CurrentlyInfected_data, Recovered_data, Deceased_data, CumulativeCaseload_data))

  plt.figure(figsize=(25,10), facecolor="w")

  # Left panel: every series except the susceptible count.
  plt.subplot(1,2,1)
  plt.plot(CurrentlyInfected_data)
  plt.plot(Recovered_data)
  plt.plot(Deceased_data)
  plt.plot(Removed_data)
  plt.plot(CumulativeCaseload_data)
  plt.title("Covid-19 Dynamics in " + state_name)
  plt.xlabel("Time (days)")
  plt.xlim(left=lockdown_approx)
  plt.ylabel("Number of Individuals")
  plt.legend(("Currently Infected", "Recovered", "Deceased", "Removed", "Cumulative Caseload"))

  # Right panel: the susceptible count on its own axes.
  plt.subplot(1,2,2)
  plt.plot(Susceptible_data)
  plt.title("Population Susceptible to Covid-19 in " + state_name)
  plt.xlabel("Time (days)")
  plt.ylabel("Number of Individuals")

  return data_Collected

We need a way to import every state's data from its CSV file efficiently.

We have done so by building a list that holds the data collected for each state. Each index in this list matches the index of the corresponding state name in the alphabetically sorted list of state names (`state_names`).

In [None]:
# Build the per-state data arrays. Entry k of all_states_data corresponds to
# csvDictionary[k], populations[k] and state_names[k].
all_states_data = []

for idx, csv_name in enumerate(csvDictionary):
  state_series = state_basis(csv_name, populations[idx], state_names[idx])
  all_states_data.append(state_series)