In [1]:
import pandas as pd
from matplotlib.pyplot import plot, bar

# config

In [2]:
# Input CSV with the daily figures, resolved relative to the working directory.
read_from = 'corona_lb_data.csv'
# Output CSV, one directory up. A forward slash is used because Python's file
# APIs accept it on Windows as well as POSIX, whereas a literal backslash is
# only a path separator on Windows.
save_to = '../corona_lb_ratios.csv'

# Let pandas render up to 255 rows before truncating displayed frames.
pd.set_option('display.max_rows', 255)

# `IPython.display` is the public home of `display` / `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Widen the notebook page container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

# read corona lb data

In [3]:
# Load the daily data set; header=0 takes the column names from the first CSV row.
corona = pd.read_csv(read_from, header=0)
# Preview the first rows of the loaded frame.
corona.head()

Unnamed: 0,date,year,month,day,new_cases,new_deaths,local,travelers,prev_cases,prev_deaths,total_cases,total_deaths
0,2/1/2020,2020,2,1,0,0,1,0,0,0,0,0
1,2/2/2020,2020,2,2,0,0,1,0,0,0,0,0
2,2/3/2020,2020,2,3,0,0,1,0,0,0,0,0
3,2/4/2020,2020,2,4,0,0,1,0,0,0,0,0
4,2/5/2020,2020,2,5,0,0,1,0,0,0,0,0


# per-month aggregates: daily averages, monthly totals and cumulative maxima

In [4]:
# Partition the daily rows by their month number.
month_groups = corona.groupby('month')


def _summarise_month(frame):
    """Collapse one month's daily rows into whole-number summary figures.

    Every figure is passed through ``int()``, so fractional means are
    truncated toward zero.
    NOTE(review): a monthly mean below 1 therefore becomes 0 - confirm that
    this is intended, since a zero average makes downstream ratios fall back
    to 0.
    """
    return {
        'average_cases': int(frame['new_cases'].mean()),
        'average_deaths': int(frame['new_deaths'].mean()),
        'month_cases': int(frame['new_cases'].sum()),
        'month_deaths': int(frame['new_deaths'].sum()),
        'comulative_cases': int(frame['total_cases'].max()),
        'comulative_deaths': int(frame['total_deaths'].max()),
    }


# Month number -> summary dict (the key spelling is kept as the lookup contract).
months_data = {
    month: _summarise_month(month_rows) for month, month_rows in month_groups
}

# sentiment ratios: four alternative methods, computed for every day

In [5]:
# Running state shared across successive `calculate` calls.
cases_day_count = 0    # days seen so far with a non-zero new_cases
deaths_day_count = 0   # days seen so far with a non-zero new_deaths
cases_diff_sum = 0     # sum of |prev_cases - new_cases| over days with cases


def calculate(row):
    """Compute the four sentiment ratios for one daily row.

    The function is stateful: it reads and updates the module-level counters
    ``cases_day_count``, ``deaths_day_count`` and ``cases_diff_sum``, so rows
    have to be fed in order and the counters reset to 0 before a fresh pass.
    It also looks up ``months_data[row.month]`` for the monthly averages.

    Parameters
    ----------
    row : object exposing ``month``, ``new_cases``, ``new_deaths``,
        ``prev_cases``, ``prev_deaths``, ``total_cases`` and ``total_deaths``.

    Returns
    -------
    pd.Series
        ``[month-average ratio, cumulative-average ratio,
        diff-average ratio, simple direct ratio]``.
    """
    global cases_day_count, deaths_day_count, cases_diff_sum

    def ratio(numerator, denominator):
        # A falsy (zero) denominator gives 0 instead of dividing.
        return numerator / denominator if denominator else 0

    def direct_ratio(previous, current):
        # A rise gives -previous/current; otherwise current/previous.
        if previous < current:
            larger, smaller = current, -previous
        else:
            larger, smaller = previous, current
        return ratio(smaller, larger)

    new_cases, new_deaths = row.new_cases, row.new_deaths
    prev_cases, prev_deaths = row.prev_cases, row.prev_deaths

    # Count today first: the running averages below include the current day.
    if new_cases:
        cases_day_count += 1
    if new_deaths:
        deaths_day_count += 1

    # Method 1 - today's figures relative to the whole month's daily average.
    month_stats = months_data[row.month]
    monthly_ratio = (
        ratio(new_cases, month_stats['average_cases'])
        + ratio(new_deaths, month_stats['average_deaths'])
    ) / 2

    # Method 2 - today's figures relative to the cumulative total divided by
    # the number of active days so far.
    running_avg_cases = ratio(row.total_cases, cases_day_count)
    running_avg_deaths = ratio(row.total_deaths, deaths_day_count)
    cumulative_ratio = (
        ratio(new_cases, running_avg_cases)
        + ratio(new_deaths, running_avg_deaths)
    ) / 2

    # Method 3 - day-over-day change relative to the running mean absolute
    # change (cases only). The mean is taken BEFORE today's change is added
    # to the sum, although the day count already includes today.
    # NOTE(review): confirm that this asymmetry is intended.
    change = prev_cases - new_cases
    mean_abs_change = ratio(cases_diff_sum, cases_day_count)
    if new_cases:
        cases_diff_sum += abs(change)
    diff_ratio = ratio(change, mean_abs_change)

    # Method 4 - direct previous-vs-current ratio, averaged over cases and deaths.
    simple_ratio = (
        direct_ratio(prev_cases, new_cases)
        + direct_ratio(prev_deaths, new_deaths)
    ) / 2

    return pd.Series([monthly_ratio, cumulative_ratio, diff_ratio, simple_ratio])

# Run `calculate` on every row (axis=1) and store its four results as new columns.
# `calculate` updates module-level running counters, so they must be back at 0
# before this line is executed again.
corona[['month_avg_ratio', 'cumulative_avg_ratio', 'diff_avg_ratio', 'simple_ratio']] = corona.apply(calculate, axis=1)

In [6]:
# Summary statistics, now including the four newly added ratio columns.
corona.describe()

Unnamed: 0,year,month,day,new_cases,new_deaths,local,travelers,prev_cases,prev_deaths,total_cases,total_deaths,month_avg_ratio,cumulative_avg_ratio,diff_avg_ratio,simple_ratio
count,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0
mean,2020.0,5.518519,15.695473,163.135802,1.555556,0.967078,0.045267,157.962963,1.530864,5333.716049,70.176955,0.590286,1.780971,-0.366302,0.023823
std,0.0,2.284213,8.795534,276.882039,2.746899,0.1788,0.208319,267.959682,2.733719,9056.468515,86.996566,0.498411,1.727108,3.224766,0.386566
min,2020.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.273006,-0.895448
25%,2020.0,4.0,8.0,6.0,0.0,1.0,0.0,6.0,0.0,499.5,24.0,0.195506,0.357352,-1.690238,-0.25809
50%,2020.0,6.0,16.0,18.0,0.0,1.0,0.0,18.0,0.0,1241.0,38.0,0.510949,0.992204,0.0,0.0
75%,2020.0,7.5,23.0,175.0,2.0,1.0,0.0,171.5,2.0,4654.5,71.0,0.882241,3.37413,0.760818,0.313393
max,2020.0,9.0,31.0,1280.0,18.0,1.0,1.0,1280.0,18.0,39642.0,378.0,2.647202,6.333355,14.109122,1.0


In [7]:
# Display the frame; `display.max_rows` was raised to 255 in the config cell.
corona

Unnamed: 0,date,year,month,day,new_cases,new_deaths,local,travelers,prev_cases,prev_deaths,total_cases,total_deaths,month_avg_ratio,cumulative_avg_ratio,diff_avg_ratio,simple_ratio
0,2/1/2020,2020,2,1,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
1,2/2/2020,2020,2,2,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
2,2/3/2020,2020,2,3,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
3,2/4/2020,2020,2,4,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
4,2/5/2020,2020,2,5,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
5,2/6/2020,2020,2,6,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
6,2/7/2020,2020,2,7,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
7,2/8/2020,2020,2,8,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
8,2/9/2020,2020,2,9,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0
9,2/10/2020,2020,2,10,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0


# scale ratios into range [-1, 1] using MinMaxScaler()

In [8]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def min_max_scale(df, column_name, scale=(-1, 1)):
    """Rescale one column of ``df`` in place to the range ``scale``.

    Fits a ``MinMaxScaler`` on the column, prints the column's original
    maximum and minimum as reported by the scaler, and overwrites the column
    with the scaled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Name of the column to rescale.
    scale : tuple, default (-1, 1)
        ``(low, high)`` target range handed to the scaler.

    Returns
    -------
    None
    """
    # Read from the `df` argument (not a module-level frame) so the function
    # works on whichever frame the caller passes in.
    column_2d = df[column_name].to_numpy().reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=scale)
    scaled = scaler.fit_transform(column_2d)
    print('Max:', scaler.data_max_)
    print('Min:', scaler.data_min_)
    # Assign the flat array positionally. Wrapping it in a new Series would
    # align on a fresh 0..n-1 index and produce NaN for frames indexed
    # differently.
    df[column_name] = scaled.ravel()

In [9]:
# Rescale month_avg_ratio to the default (-1, 1) range; prints the pre-scaling max/min.
min_max_scale(corona, 'month_avg_ratio')

Max: [2.64720195]
Min: [0.]


In [10]:
# Rescale cumulative_avg_ratio to the default (-1, 1) range; prints the pre-scaling max/min.
min_max_scale(corona, 'cumulative_avg_ratio')

Max: [6.33335505]
Min: [0.]


In [11]:
# Rescale diff_avg_ratio to the default (-1, 1) range; prints the pre-scaling max/min.
min_max_scale(corona, 'diff_avg_ratio')

Max: [14.10912155]
Min: [-11.27300613]


In [12]:
# Rescale simple_ratio to the default (-1, 1) range; prints the pre-scaling max/min.
min_max_scale(corona, 'simple_ratio')

Max: [1.]
Min: [-0.8954484]


In [13]:
# Summary statistics again, after the four ratio columns were rescaled.
corona.describe()

Unnamed: 0,year,month,day,new_cases,new_deaths,local,travelers,prev_cases,prev_deaths,total_cases,total_deaths,month_avg_ratio,cumulative_avg_ratio,diff_avg_ratio,simple_ratio
count,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0
mean,2020.0,5.518519,15.695473,163.135802,1.555556,0.967078,0.045267,157.962963,1.530864,5333.716049,70.176955,-0.55403,-0.43759,-0.1406,-0.030023
std,0.0,2.284213,8.795534,276.882039,2.746899,0.1788,0.208319,267.959682,2.733719,9056.468515,86.996566,0.376557,0.545401,0.254097,0.407889
min,2020.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0
25%,2020.0,4.0,8.0,6.0,0.0,1.0,0.0,6.0,0.0,499.5,24.0,-0.852293,-0.887152,-0.24492,-0.327485
50%,2020.0,6.0,16.0,18.0,0.0,1.0,0.0,18.0,0.0,1241.0,38.0,-0.613971,-0.686674,-0.111737,-0.055159
75%,2020.0,7.5,23.0,175.0,2.0,1.0,0.0,171.5,2.0,4654.5,71.0,-0.333454,0.065511,-0.051788,0.27552
max,2020.0,9.0,31.0,1280.0,18.0,1.0,1.0,1280.0,18.0,39642.0,378.0,1.0,1.0,1.0,1.0


# save

In [14]:
# Write the frame (with the rescaled ratio columns) to `save_to`;
# index=False leaves the DataFrame index out of the file.
corona.to_csv(save_to, index=False)