In [104]:
import pandas as pd
from textblob import TextBlob
import re
import json

# config

In [105]:
read_from = '..\\tweets.csv'
save_to = 'tweets_sentiment.csv'
do_sentiment = 0
cases_data = 'corona_average_per_month.json'

# load data

In [106]:
df = pd.read_csv(read_from, header=0)

# get sentiment

In [107]:
def get_sentiment_label(sentiment):
    if sentiment == 0:
        return 'Neutral'
    return 'Positive' if sentiment > 0 else 'Negative'

def get_sentiment(text):
    sentiment = TextBlob(text).sentiment.polarity
    return pd.Series([sentiment, get_sentiment_label(sentiment)])

In [108]:
if do_sentiment:
    df[['sentiment', 'sentiment_label']] = df.translated_text.apply(get_sentiment)

In [109]:
df.sentiment_label.value_counts()

Positive    18717
Neutral     18680
Negative     7674
Name: sentiment_label, dtype: int64

# split by date

In [110]:
df[['year', 'month', 'day']] = df.date.str.split('-', expand=True)

# month data info

In [111]:
sorted(df.month.unique())

['02', '03', '04', '05', '06', '07', '08']

In [112]:
month_groups = df.groupby('month')
print('Cases Data over months & days')
print('******************************')
for month, month_data in month_groups:
    print('[Month]:', month)
    print('[Days]:', "-".join(sorted(month_data.day.unique())))
    print('------------------------------------------')

month_groups = None

Cases Data over months & days
******************************
[Month]: 02
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29
------------------------------------------
[Month]: 03
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31
------------------------------------------
[Month]: 04
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30
------------------------------------------
[Month]: 05
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31
------------------------------------------
[Month]: 06
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30
------------------------------------------
[Month]: 07
[Days]: 01-02-03-04-05-06-07-08-09-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31
------------------------------------------
[Month]: 08
[Da

# reports about cases

In [113]:
keywords = ['cases', 'daily', 'report', 'اصابات', 'اصابة', 'حالات', 'حالة', 'اليومي', 'التقرير']

In [114]:
def contains_keyword(text):
    words = text.split()
    for w in words:
        if w in keywords:
            return True
    return False

reports = df[df.text.apply(contains_keyword)]

In [115]:
reports.sentiment_label.value_counts()

Positive    1736
Neutral      860
Negative     400
Name: sentiment_label, dtype: int64

In [116]:
len(reports)  # total reports

2996

# months data

In [117]:
with open(cases_data) as f:
    months_data = json.loads(f.read())   

# month + cases info

In [118]:
reports_data = {}

def get_nearest(numbers, m_avg, m_total):
    nearest = (0, None) # (number, difference)
    for number in numbers:
        if number < m_total:
            diff = abs(number - m_avg)
            if diff < 2 * m_avg:
                if not nearest[0] or diff < nearest[1]:
                    nearest = (number, diff)
    return nearest[0]

def info(row):
    global reports_data
    
    keys = "|".join(keywords)
    pttrn = re.compile(fr'(?:{keys})\W(\d+)|(\d+)\W(?:{keys})') # only digits
    groups = re.findall(pttrn, row.text)
    
    
    m_avg = months_data[row.month]['average_cases']
    m_total = months_data[row.month]['month_cases']
    key = (
        row.month, 
        m_avg, 
        m_total,
        months_data[row.month]['comulative_cases'],
        row.day
    )
    
    numbers = [int(item) for sublist in groups for item in sublist if item]
    if key in reports_data:
        numbers = [*numbers, reports_data[key]]
        
    reports_data[key] = get_nearest(numbers, m_avg, m_total)


reports.apply(info, axis=1)

data = []
for key, value in reports_data.items():
    data.append([*key, value])
    
reports_data = (
    pd.DataFrame(data, columns=['month','avg_month_cases','total_month_cases','comulative_overall_cases','day','day_cases']) \
    .sort_values(by=['month', 'day']) \
    .reset_index(drop=True)
)

In [119]:
reports_data

Unnamed: 0,month,avg_month_cases,total_month_cases,comulative_overall_cases,day,day_cases
0,02,0,3,3,02,0
1,02,0,3,3,03,0
2,02,0,3,3,07,0
3,02,0,3,3,08,0
4,02,0,3,3,10,0
...,...,...,...,...,...,...
169,07,83,2589,4334,30,129
170,07,83,2589,4334,31,2
171,08,404,12536,16870,01,224
172,08,404,12536,16870,02,155


In [120]:
reports_data.groupby(['month', 'avg_month_cases', 'total_month_cases', 'comulative_overall_cases']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,day_cases
month,avg_month_cases,total_month_cases,comulative_overall_cases,Unnamed: 4_level_1
2,0,3,3,0
3,16,443,446,413
4,9,275,721,221
5,15,470,1191,497
6,18,554,1745,542
7,83,2589,4334,1215
8,404,12536,16870,556


In [124]:
groups = reports_data.groupby(['month', 'avg_month_cases', 'total_month_cases', 'comulative_overall_cases'])
groups.get_group(('07', 83, 2589, 4334))

Unnamed: 0,month,avg_month_cases,total_month_cases,comulative_overall_cases,day,day_cases
140,7,83,2589,4334,1,19
141,7,83,2589,4334,2,19
142,7,83,2589,4334,3,34
143,7,83,2589,4334,4,25
144,7,83,2589,4334,5,19
145,7,83,2589,4334,6,19
146,7,83,2589,4334,7,22
147,7,83,2589,4334,8,39
148,7,83,2589,4334,9,66
149,7,83,2589,4334,10,71


# new sentiment by reports

In [122]:
new_sentiment = 

SyntaxError: invalid syntax (<ipython-input-122-99ae704d84c0>, line 1)