In [None]:
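# Data source (Our World in Data COVID-19 dataset). Download the CSV from the URL
# below and save it as 'covid.csv' in the working directory before running this cell:
# https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv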

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset from the local CSV file.
df = pd.read_csv('covid.csv')

# Show which columns are available.
print(df.columns)

# Number of trailing rows to compare; 720 rows is about 24 months,
# assuming one row per day per location (TODO confirm against the CSV).
WINDOW_DAYS = 720

# One frame per country, re-indexed from 0. reset_index returns a new frame,
# so the filtered selections of `df` are never mutated in place.
df1 = df.loc[df['location'] == 'Indonesia'].reset_index(drop=True)
df2 = df.loc[df['location'] == 'United Kingdom'].reset_index(drop=True)

# Trim both countries to the same number of trailing rows.
# tail(n) stays correct whichever frame is shorter (and for n == 0), whereas
# iloc[len(df2) - len(df1):] wraps around when the start becomes negative.
common_len = min(len(df1), len(df2))
df1 = df1.tail(common_len).reset_index(drop=True)
df2 = df2.tail(common_len).reset_index(drop=True)

# Keep only the last WINDOW_DAYS rows. The previous slice
# iloc[len(df) - 720:800] had a hard-coded upper bound of 800, so it stopped
# selecting the most recent rows as soon as a country had more than 800 rows.
df1 = df1.tail(WINDOW_DAYS)
df2 = df2.tail(WINDOW_DAYS)

# Plot each country against its own dates rather than against row position.
dates1 = pd.to_datetime(df1['date'])
dates2 = pd.to_datetime(df2['date'])

# One figure per metric. The original cell drew the hospital-admissions figure
# twice; add another column name to this tuple to compare a further metric.
for column in ('new_cases_smoothed_per_million',
               'weekly_hosp_admissions_per_million'):
    fig, ax = plt.subplots()
    ax.plot(dates1, df1[column], label='Indonesia')
    ax.plot(dates2, df2[column], 'r', label='United Kingdom')
    ax.set(title=column, xlabel='date', ylabel=column)
    ax.legend()
    ax.grid()
    fig.autofmt_xdate()
    plt.show()

In [None]:
# Cross-section of all locations on a single reference date.
df3 = df.loc[df['date'] == '2022-05-01']

# Keep the location label plus the four parameters of interest.
df3 = df3[['location', 'reproduction_rate', 'new_cases_smoothed_per_million',
           'people_fully_vaccinated_per_hundred', 'weekly_hosp_admissions_per_million']]
print(df3.shape)
print(df3.head(10))

# Indonesia's row(s), highlighted in the scatter plot below.
# The mask is built from df3 itself; a mask taken from the full frame `df`
# only works when pandas can align its index onto df3's index.
df4 = df3.loc[df3['location'] == 'Indonesia']
print(df4)

# Rank locations by weekly hospital admissions, highest first.
df3 = df3.sort_values(by=['weekly_hosp_admissions_per_million'], ascending=False)
print(df3.head(30))

# 3D scatter: every location with default styling, Indonesia as a large red square.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df3['new_cases_smoothed_per_million'],
           df3['weekly_hosp_admissions_per_million'],
           df3['people_fully_vaccinated_per_hundred'],
           s=50, edgecolor="k")
ax.scatter(df4['new_cases_smoothed_per_million'],
           df4['weekly_hosp_admissions_per_million'],
           df4['people_fully_vaccinated_per_hundred'],
           c='r', marker="s", s=200, edgecolor="k")
# Set title and axis labels on the axes object directly instead of going
# through plt.gca().update(...).
ax.set(title='COVID19',
       xlabel='new_cases_smoothed_per_million',
       ylabel='weekly_hosp_admissions_per_million',
       zlabel='people_fully_vaccinated_per_hundred')
plt.show()

In [None]:
# Two-column frame for the first regression: smoothed new cases per million
# and weekly hospital admissions per million, taken from df3.
df4 = df3.loc[:, ['new_cases_smoothed_per_million', 'weekly_hosp_admissions_per_million']]

# Size and preview before cleaning.
print(df4.shape)
print(df4.head())

# Drop rows with a missing value in either column, then renumber the rows from 0.
df4 = df4.dropna().reset_index(drop=True)

# Size and preview after cleaning.
print(df4.shape)
print(df4.head())

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import seaborn as sns

# Feature (new cases) and target (hospital admissions); the double brackets
# keep both as 2-D arrays with a single column.
x = df4[['new_cases_smoothed_per_million']].to_numpy()
y = df4[['weekly_hosp_admissions_per_million']].to_numpy()

# Shuffled 67/33 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, shuffle=True
)

# Fit a linear regression on the training split and report the score on that
# same split, followed by the fitted coefficient and intercept.
reg = LinearRegression()
reg.fit(x_train, y_train)
print(reg.score(x_train, y_train))
print(reg.coef_)
print(reg.intercept_)

# Predictions for the held-out split.
y_pred = reg.predict(x_test)

# Training points (hospital admissions vs new cases) with seaborn's regression line.
sns.regplot(x=x_train, y=y_train)
plt.grid()
plt.show()

# Held-out targets (default colour) and predictions (red), in test-split order.
plt.plot(y_test)
plt.plot(y_pred, 'r')
plt.grid()
print(len(y_pred))

# R^2 on the held-out split; as the last expression it becomes the cell output.
r2_score(y_test, y_pred)

In [None]:
# Two-column frame for the second regression: fully-vaccinated share (per hundred)
# and weekly hospital admissions per million, taken from df3.
df5 = df3.loc[:, ['people_fully_vaccinated_per_hundred', 'weekly_hosp_admissions_per_million']]

# Size and preview before cleaning.
print(df5.shape)
print(df5.head())

# Drop rows with a missing value in either column, then renumber the rows from 0.
df5 = df5.dropna().reset_index(drop=True)

# Size and preview after cleaning.
print(df5.shape)
print(df5.head())

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import seaborn as sns

# Feature (vaccination share) and target (hospital admissions); the double
# brackets keep both as 2-D arrays with a single column.
x = df5[['people_fully_vaccinated_per_hundred']].to_numpy()
y = df5[['weekly_hosp_admissions_per_million']].to_numpy()

# Shuffled 67/33 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, shuffle=True
)

# Fit a linear regression on the training split and report the score on that
# same split, followed by the fitted coefficient and intercept.
reg = LinearRegression()
reg.fit(x_train, y_train)
print(reg.score(x_train, y_train))
print(reg.coef_)
print(reg.intercept_)

# Predictions for the held-out split.
y_pred = reg.predict(x_test)

# Training points (hospital admissions vs vaccination share) with seaborn's regression line.
sns.regplot(x=x_train, y=y_train)
plt.grid()
plt.show()

# Held-out targets (default colour) and predictions (red), in test-split order.
plt.plot(y_test)
plt.plot(y_pred, 'r')
plt.grid()
print(len(y_pred))

# R^2 on the held-out split; as the last expression it becomes the cell output.
r2_score(y_test, y_pred)

In [None]:
# Narrow df3 to the three predictors plus the target for the multi-feature regression.
# Note: df3 is rebound here, so the 'location' column is no longer present afterwards.
df3 = df3.loc[:, ['reproduction_rate', 'people_fully_vaccinated_per_hundred',
                  'new_cases_smoothed_per_million', 'weekly_hosp_admissions_per_million']]

# Size and preview before cleaning.
print(df3.shape)
print(df3.head())

# Drop rows with a missing value in any of the four columns, then renumber the rows from 0.
df3 = df3.dropna().reset_index(drop=True)

# Size and preview after cleaning.
print(df3.shape)
print(df3.head())

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Three predictors and the hospital-admissions target, both as 2-D arrays.
x = df3[['reproduction_rate', 'people_fully_vaccinated_per_hundred',
         'new_cases_smoothed_per_million']].to_numpy()
y = df3[['weekly_hosp_admissions_per_million']].to_numpy()

# Shuffled 67/33 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, shuffle=True
)

# Fit a linear regression on the training split and report the score on that
# same split, followed by the fitted coefficients and intercept.
reg = LinearRegression()
reg.fit(x_train, y_train)
print(reg.score(x_train, y_train))
print(reg.coef_)
print(reg.intercept_)

# Predictions for the held-out split.
y_pred = reg.predict(x_test)

# Held-out targets (default colour) and predictions (red), in test-split order.
plt.plot(y_test)
plt.plot(y_pred, 'r')
plt.grid()
print(len(y_pred))

# R^2 on the held-out split; as the last expression it becomes the cell output.
r2_score(y_test, y_pred)