In [72]:
import pandas as pd
from textblob import TextBlob

# config

In [114]:
cases_data = 'corona_lb_data.csv'
tweets_data = '..\\tweets.csv'
save_to = 'tweets_sentiment.csv'
pd.set_option('display.max_rows', 255)

# read corona lb data

In [79]:
corona = pd.read_csv(cases_data, header=0)
corona.head()

Unnamed: 0,date,year,month,day,total_cases,new_cases,total_deaths,new_deaths
0,2020-02-01,2020,2,1,0.0,0.0,0.0,0.0
1,2020-02-02,2020,2,2,0.0,0.0,0.0,0.0
2,2020-02-03,2020,2,3,0.0,0.0,0.0,0.0
3,2020-02-04,2020,2,4,0.0,0.0,0.0,0.0
4,2020-02-05,2020,2,5,0.0,0.0,0.0,0.0


# months data

In [80]:
month_groups = corona.groupby('month')

months_data = {}

for month, month_data in month_groups:
    average_cases = int(month_data.new_cases.mean())
    average_death = int(month_data.new_deaths.mean())
    
    month_cases = int(month_data.new_cases.sum())
    month_deaths = int(month_data.new_deaths.sum())
    
    comulative_cases = int(month_data.total_cases.max())
    comulative_deaths = int(month_data.total_deaths.max())
    
    months_data[month] = {
        'average_cases':average_cases,
        'average_deaths':average_death,
        'month_cases': month_cases,
        'month_deaths': month_deaths,
        'comulative_cases': comulative_cases,
        'comulative_dearhs': comulative_deaths,
    }

In [81]:
months_data

{2: {'average_cases': 0,
  'average_deaths': 0,
  'month_cases': 3,
  'month_deaths': 0,
  'comulative_cases': 3,
  'comulative_dearhs': 0},
 3: {'average_cases': 16,
  'average_deaths': 0,
  'month_cases': 443,
  'month_deaths': 11,
  'comulative_cases': 446,
  'comulative_dearhs': 11},
 4: {'average_cases': 9,
  'average_deaths': 0,
  'month_cases': 275,
  'month_deaths': 13,
  'comulative_cases': 721,
  'comulative_dearhs': 24},
 5: {'average_cases': 15,
  'average_deaths': 0,
  'month_cases': 470,
  'month_deaths': 2,
  'comulative_cases': 1191,
  'comulative_dearhs': 26},
 6: {'average_cases': 18,
  'average_deaths': 0,
  'month_cases': 554,
  'month_deaths': 8,
  'comulative_cases': 1745,
  'comulative_dearhs': 34},
 7: {'average_cases': 83,
  'average_deaths': 0,
  'month_cases': 2589,
  'month_deaths': 23,
  'comulative_cases': 4334,
  'comulative_dearhs': 57},
 8: {'average_cases': 404,
  'average_deaths': 3,
  'month_cases': 12536,
  'month_deaths': 103,
  'comulative_cases':

# add previous cases & deaths

In [82]:
prev_cases, prev_deaths = 0, 0
def get_data(row):
    global prev_cases, prev_deaths
    
    res = pd.Series([prev_cases, prev_deaths])
    
    if row.new_cases > 0:
        prev_cases = row.new_cases
    
    if row.new_deaths > 0:
        prev_deaths = row.new_deaths
    
    return res

corona[['prev_cases', 'prev_deaths']] = corona.apply(get_data, axis=1)
corona = corona.fillna(0)

In [83]:
corona

Unnamed: 0,date,year,month,day,total_cases,new_cases,total_deaths,new_deaths,prev_cases,prev_deaths
0,2020-02-01,2020,2,1,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-02-02,2020,2,2,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-02-03,2020,2,3,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-02-04,2020,2,4,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-02-05,2020,2,5,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-02-06,2020,2,6,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-02-07,2020,2,7,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-02-08,2020,2,8,0.0,0.0,0.0,0.0,0.0,0.0
8,2020-02-09,2020,2,9,0.0,0.0,0.0,0.0,0.0,0.0
9,2020-02-10,2020,2,10,0.0,0.0,0.0,0.0,0.0,0.0


# sentiment ratio

In [84]:
def calculate(row):
    avg_cases = months_data[row.month]['average_cases']
    avg_deaths = months_data[row.month]['average_deaths']
    
    cases_ratio = (row.prev_cases - row.new_cases) / avg_cases if avg_cases else 0
    deaths_ratio = (row.prev_deaths - row.new_deaths) / avg_deaths if avg_deaths else 0
    sentiment_ratio = cases_ratio + deaths_ratio
    
    return pd.Series([cases_ratio, deaths_ratio, sentiment_ratio])

corona[['cases_ratio', 'deaths_ratio', 'sentiment_ratio']] = corona.apply(calculate, axis=1)

In [85]:
corona

Unnamed: 0,date,year,month,day,total_cases,new_cases,total_deaths,new_deaths,prev_cases,prev_deaths,cases_ratio,deaths_ratio,sentiment_ratio
0,2020-02-01,2020,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-02-02,2020,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-02-03,2020,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-02-04,2020,2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-02-05,2020,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-02-06,2020,2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-02-07,2020,2,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-02-08,2020,2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2020-02-09,2020,2,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2020-02-10,2020,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
corona[['cases_ratio', 'deaths_ratio', 'sentiment_ratio']].describe()

Unnamed: 0,cases_ratio,deaths_ratio,sentiment_ratio
count,223.0,223.0,223.0
mean,0.023987,0.028401,0.052388
std,0.810287,0.493492,0.961198
min,-3.733333,-2.333333,-3.733333
25%,-0.213521,0.0,-0.337349
50%,0.0,0.0,0.0
75%,0.2,0.0,0.354167
max,3.4,3.666667,3.594884


# scale ratio into range [-1, 1] using MinMaxScaler()

In [87]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

_2d_array = np.array(corona.sentiment_ratio).reshape(-1, 1)

In [88]:
scaler = MinMaxScaler((-1, 1))
data = scaler.fit_transform(_2d_array)

In [89]:
scaler.data_max_

array([3.59488449])

In [90]:
scaler.data_min_

array([-3.73333333])

In [91]:
sentiment_ratio = list(data.reshape(1, -1)[0])

In [92]:
corona['scaled_sentiment_ratio'] = pd.Series(sentiment_ratio)

In [93]:
corona.scaled_sentiment_ratio.describe()

count    223.000000
mean       0.033190
std        0.262328
min       -1.000000
25%       -0.073176
50%        0.018893
75%        0.115551
max        1.000000
Name: scaled_sentiment_ratio, dtype: float64

In [100]:
data = corona[['month', 'day', 'prev_cases', 'new_cases', 'total_cases', 'prev_deaths', 'new_deaths', 'total_deaths', 'scaled_sentiment_ratio']]
data

Unnamed: 0,month,day,prev_cases,new_cases,total_cases,prev_deaths,new_deaths,total_deaths,scaled_sentiment_ratio
0,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
1,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
2,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
3,2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
4,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
5,2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
6,2,7,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
7,2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
8,2,9,0.0,0.0,0.0,0.0,0.0,0.0,0.018893
9,2,10,0.0,0.0,0.0,0.0,0.0,0.0,0.018893


# convert data to dict for easier access

In [111]:
sentiment_ratio_dict = {}
for row in data.iterrows():
    row = row[1]
    month = f'0{int(row.month)}'
    day = str(int(row.day)) if row.day > 10 else f'0{int(row.day)}'
    
    key = (month, day)
    sentiment_ratio_dict[key] = {
        'prev_cases': int(row.prev_cases),
        'new_cases': int(row.new_cases),
        'total_cases': int(row.total_cases),
        'prev_deaths': int(row.prev_deaths),
        'new_deaths': int(row.new_deaths),
        'total_deaths': int(row.total_deaths),
        'sentiment_ratio': row.scaled_sentiment_ratio
    }
    
sentiment_ratio_dict

{('02', '01'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths': 0,
  'sentiment_ratio': 0.018892566822040546},
 ('02', '02'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths': 0,
  'sentiment_ratio': 0.018892566822040546},
 ('02', '03'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths': 0,
  'sentiment_ratio': 0.018892566822040546},
 ('02', '04'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths': 0,
  'sentiment_ratio': 0.018892566822040546},
 ('02', '05'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths': 0,
  'sentiment_ratio': 0.018892566822040546},
 ('02', '06'): {'prev_cases': 0,
  'new_cases': 0,
  'total_cases': 0,
  'prev_deaths': 0,
  'new_deaths': 0,
  'total_deaths

# clean memory

In [112]:
corona = None
month_groups = None
months_data = None
_2d_array = None
scaler = None
data = None
sentiment_ratio = None

# load tweets

In [123]:
tweets = pd.read_csv(tweets_data, header=0)
tweets.head()

Unnamed: 0,username,date,text,tags,translated_text,sentiment,location,lat,long,sentiment_label,KADAA_ID,KADAA_AR,KADAA_EN,MOHAFAZA_ID,MOHAFAZA_AR,MOHAFAZA_EN
0,Lebanon 24,2020-07-12,"#كورونا يتلف رئة ""أبو عزرائيل"" (صورة) #العراق#...",#كورونا;#العراق;#lebanon24,"Corona damages the lung of ""Abu Azrael"" (photo...",0.0,صور,33.2733,35.1939,Neutral,83000,صور,Sour,8,الجنوب,South
1,Sana,2020-07-12,حضرة اساتذة القانون بالجامعة اللبنانية المحترم...,#كورونا;#امتحانات_الموت,Honorable professors of law at the Lebanese Un...,0.325,العربانية,33.8772,35.6786,Positive,51000,بعبدا,Baabda,5,جبل لبنان,Mount Lebanon
2,Jaras Scoop FM,2020-07-12,"كشفت ""رابطة طلاب الجامعة اللبنانية"" عبر تويتر ...",#كورونا,"The ""Lebanese University Students Association""...",-0.3,طلصا,33.4833,35.3471,Negative,81000,صيدا,Saida,8,الجنوب,South
3,Lebanon Debate,2020-07-12,"حالة ""#كورونا"" جديدة في الجامعة اللبنانية - ال...",#كورونا,"New ""# Corona"" case at the Lebanese University...",0.136364,حاقل,34.1705,35.7501,Positive,56000,جبيل,Jbeil,5,جبل لبنان,Mount Lebanon
4,عاجل - قناة فلسطين اليوم,2020-07-12,#عاجلمدير مكتب إعلام الأسرى ناهد الفاخوري: حذر...,#كورونا;#عاجل,#UrgentThe director of the Prisoners' Informat...,-0.283333,مكاتبة,33.5628,35.3789,Negative,81000,صيدا,Saida,8,الجنوب,South


# split by date

In [124]:
tweets[['year', 'month', 'day']] = tweets.date.str.split('-', expand=True)

# get sentiment

In [125]:
def get_sentiment_label(sentiment):
    if sentiment == 0:
        return 'Neutral'
    return 'Positive' if sentiment > 0 else 'Negative'

def get_sentiment(row):
    text_sentiment = TextBlob(row.translated_text).sentiment.polarity
    
    key = (row.month, row.day)
    ratio_sentiment = sentiment_ratio_dict.get(key)
    
    if ratio_sentiment:
        final_sentiment = text_sentiment + ratio_sentiment['sentiment_ratio']
        if final_sentiment <= -1 or final_sentiment >= 1:
            final_sentiment = float(int(final_sentiment))
    else:
        final_sentiment = text_sentiment
    
    return pd.Series([final_sentiment, get_sentiment_label(final_sentiment)])

In [126]:
tweets[['sentiment', 'sentiment_label']] = tweets.apply(get_sentiment, axis=1)

In [127]:
tweets.sentiment_label.value_counts()

Positive    28619
Negative    15618
Neutral       834
Name: sentiment_label, dtype: int64

In [132]:
tweets = tweets.drop(columns=['year', 'month', 'day'])
tweets.columns

Index(['username', 'date', 'text', 'tags', 'translated_text', 'sentiment',
       'location', 'lat', 'long', 'sentiment_label', 'KADAA_ID', 'KADAA_AR',
       'KADAA_EN', 'MOHAFAZA_ID', 'MOHAFAZA_AR', 'MOHAFAZA_EN'],
      dtype='object')

# save data

In [133]:
tweets.to_csv(save_to, index=False)