# Imports

In [1]:
import pandas as pd
from difflib import SequenceMatcher

# 1. Hotspot Words
- filter gathering related data
- get nouns 
- check frequencies

## 1.1 Get Tweets Related To Gatherings

In [None]:
with open('words.txt', 'r', encoding='utf-8') as f:
    hotspot_words = f.read().split('\n')

In [None]:
df = pd.read_csv('all.csv', header=0)

In [None]:
def hotspot_found(text):
    words = text.split()
    for hotspot_word in hotspot_words:
        for word in words:
            if SequenceMatcher(None, word.lower(), hotspot_word.lower()).ratio() >= 0.8:
                return True
    return False

In [None]:
hotspots_tweets_mask = df.translated_text.apply(hotspot_found)

In [None]:
hotspots_tweets_mask.value_counts()

In [None]:
for x in df[hotspots_tweets_mask].translated_text[:1000]:
    print(x)
    print('------------------------------------')

In [None]:
df[hotspots_tweets_mask].to_csv('hotspots_tweets_v1.csv', index=False)

## 1.2 Get sentences Nouns to filter non location related words

In [None]:
df = pd.read_csv('hotspots_tweets_v1.csv', header=0)

In [None]:
from textblob import TextBlob
df['nouns'] = df.translated_text.apply(lambda text: TextBlob(text).noun_phrases)

In [None]:
df.head()

In [None]:
df.to_csv('hotspots_tweets_v2.csv')

## 1.3 Filter non related nouns

In [None]:
df = pd.read_csv('hotspots_tweets_v2.csv', header=0)

In [None]:
nouns = df.nouns.str.replace("[", "").str.replace("]", "").str.split(',').explode().reset_index(drop=True)
nouns

In [None]:
filter_keywords = ['corona', 'twitter', 'http', 'pic', 'covid_19', 'god', 'covid19', 'd8', 'd9', 'lebanon', 'lebanese', 'covid', 'iran', 'house', 'stayhome', 'medical', 'healthcare', 'ministry', 'syrian', 'home', 'china', 
                  'your', 'syria', 'jordan', 'cases']

first = 1
for keyword in filter_keywords:
    if first:
        filter_condition = nouns.str.contains(keyword)
        first = 0
    else:
        filter_condition |= nouns.str.contains(keyword) 

filter1 = filter_condition
filter1.value_counts()

In [None]:
filtered_nouns = nouns[~filter1].reset_index(drop=True) # `~` is negation

In [None]:
filtered_nouns

## 1.4 Frequency

In [None]:
frequencies = filtered_nouns.value_counts()
frequencies

In [None]:
nouns_df = pd.DataFrame()
nouns_df['noun'] = filtered_nouns
nouns_df['frequency'] = filtered_nouns.replace(frequencies)
nouns_df

In [15]:
nouns_df[nouns_df.frequency > 100]

Unnamed: 0,noun,frequency
39,'hamad_hassan20',241
52,'hamad_hassan20',241
76,'saudi arabia',190
94,'hamad_hassan20',241
100,'hezbollah',217
...,...,...
121704,'thank',200
121761,'pcr',136
121766,'beirut',714
121815,'aubmc_official',155


## 1.5 Conclusion
- freauencies are not helpful

# 2. Check keywords (I don't see a future for this ;-;)

In [None]:
df = pd.read_csv('all.csv', header=0)

In [None]:
def get_stats(keyword):
    data = df[df.translated_text.str.contains(keyword)].translated_text
    print('`', keyword, '` was found in ', len(data), ' rows')
    print('------------------------------------')
    for d in data:
        print(d)
        print('------------------------------------')

In [None]:
get_stats('university')

In [None]:
get_stats('exam')

In [None]:
get_stats('prison')

In [None]:
get_stats('shop')

In [None]:
get_stats('store')

In [None]:
get_stats('gathering')

In [None]:
get_stats('eid')

In [None]:
get_stats('festival')

In [None]:
get_stats('party')

In [None]:
get_stats('marriage')

# 3. Check locations

In [None]:
with open('location_related_words.txt', 'r', encoding='utf-8') as f:
    location_words = f.read().split('\n')
location_words

In [None]:
def accepted(phrase):
    word = None
    for w in location_words:
        if w in phrase:
            word = w
            break
    return pd.Series([phrase, word])

filtered_nouns = pd.DataFrame()
filtered_nouns[['phrase', 'location_word']] = nouns.apply(accepted)
filtered_nouns

In [None]:
a = filtered_nouns[filtered_nouns['location_word'] != 'None']
a

# 4. Hotspot Words 2

In [2]:
df = pd.read_csv('all.csv', header=0)

In [46]:
column = 'text'
column = 'translated_text'

def contains_mask(keyword):
    return df[column].str.lower().str.contains(keyword)

def get_stats(keyword):
    res = contains_mask(keyword).value_counts()
    return keyword + f' ==> {res[1]} rows'

def multi_get_stats(keywords):
    for k in keywords:
        print('- ' + get_stats(k))
        
def get_rows(keyword):
    rows = df[contains_mask(keyword)][column]
    for row in rows:
        print(row)
        print('--------------------')

In [59]:
multi_get_stats(['university', 'prison', 'marriage', 'shop', 'store', 'تجمعات', 'exam'])

- university ==> 625 rows
- prison ==> 203 rows
- marriage ==> 24 rows
- shop ==> 367 rows
- store ==> 265 rows
- تجمعات ==> 3 rows
- exam ==> 731 rows


In [63]:
tags = df.tags.str.split(';').explode().reset_index(drop=True)

In [101]:
freq = tags.value_counts().rename_axis('tag').reset_index(name='counts')
freq

Unnamed: 0,tag,counts
0,#كورونا_لبنان,14991
1,#كورونا,12723
2,#لبنان,7550
3,none,7398
4,#Lebanon,4597
...,...,...
16856,#twitchstreamer,1
16857,#reconnaissance,1
16858,#BourjHammoud,1
16859,#بوكاشيو,1


In [95]:
filter_keywords = ['corona', 'كورونا', 'covid', 'lebanon', 'لبنان', 'none', 'خليك_بالبيت', 'خليك_في_البيت', 'healthcare', 'job', 'صباح_الخير', 'medical', 'كوفيد', 'jordan', 'stayhome', 'beirut', 'الصحة', 'بيروت', 'لا_داعي_للهلع', 'الحجر_المنزلي', 'كلن_يعني_كلن', 'dubai', 'health', 'uae', 'حسان_دياب', 'cannabis', 'بلا_مخ']

first = 1
for keyword in filter_keywords:
    if first:
        filter_condition = freq.tag.str.lower().str.contains(keyword.lower())
        first = 0
    else:
        filter_condition |= freq.tag.str.lower().str.contains(keyword) 
    
filtered_freq = freq[~filter_condition].reset_index(drop=True)
filtered_freq

Unnamed: 0,tag,count
0,#Syria,285
1,#Doctor,267
2,#صار_الوقت,251
3,#Doctors,242
4,#center,238
...,...,...
14848,#twitchstreamer,1
14849,#reconnaissance,1
14850,#BourjHammoud,1
14851,#بوكاشيو,1


In [108]:
filtered_freq[(filtered_freq['count'] >= 100) & (filtered_freq['count'] <= 200)]

Unnamed: 0,tag,count
13,#refugees,195
14,#ما_تستهتر,188
15,#UK,185
16,#حزب_الله,183
17,#medtwitter,182
18,#StayAtHome,171
19,#marijuana,170
20,#staysafe,170
21,#hiring,167
22,#وطنك_بيتك,150


In [111]:
filtered_freq[filtered_freq.tag.str.contains('امتحان')]

Unnamed: 0,tag,count
187,#امتحانات_الموت,41
2087,#مقاطعة_امتحانات_الموت,4
3028,#لا_لاجراء_امتحانات_رسمية,3
5219,#الامتحانات_الرسمية,2
10781,#الغوا_الامتحانات_او_استقبلونا_زوار,1
10841,#الغاء_الامتحانات_الرسمية,1
11452,#امتحانات,1
11969,#امتحانات_مش_عاملين,1


# 5. Hotspots Words 3

In [18]:
df = pd.read_csv('all.csv', header=0)

In [25]:
word = 'سجن'
mask = df.text.str.lower().str.contains(word)

In [26]:
mask.value_counts()

False    51612
True        71
Name: text, dtype: int64

In [27]:
df[mask]

Unnamed: 0,username,date,text,tags,translated_text,sentiment,location,lat,long,sentiment_label,KADAA_ID,KADAA_AR,KADAA_EN,MOHAFAZA_ID,MOHAFAZA_AR,MOHAFAZA_EN
271,HANZALAH حنظلة,2020-06-16,#كورونا ليس وباء ولا مرض.. إنه تمرين كاذب لترو...,#كورونا,# Corona is neither a pandemic nor a disease ....,-0.120076,ولا,34.5350,36.1814,Negative,11000,عكّار,Akkar,1,عكار,Akkar
1016,Toni Bayeh طوني البايع,2020-04-01,اثار الحلاقة + اثار الحجر الصحي = نتيجة ما ط...,#حجر_صحي;#تابعونا;#حلاقة_ذقن;#ذقن;#تيك;#سجون;#...,Triggering shaving + Effects of quarantine M...,0.100000,الرمول,34.4331,35.8175,Positive,71000,طرابلس,Tripoli,7,الشمال,North
1069,Dany Diab • داني دياب,2020-03-30,لا إصابات ب #كورونا في سجن رومية، حتّى الوثيقة...,#لبنان;#كورونا;#Covid_19,Corona has no injuries in Roumieh prison even ...,0.000000,الغابات,34.0822,35.8733,Neutral,56000,جبيل,Jbeil,5,جبل لبنان,Mount Lebanon
1401,Fady Khatib,2020-03-21,@Hassan_B_Diab الاردن منع التجول كلياً تحت رقا...,#كورونا,@Hassan_B_Diab Jordan has completely prohibit...,0.100000,العلالي,34.2049,35.7722,Positive,74000,البترون,El Batroun,7,الشمال,North
1551,Tammam Nakkash تمام نقاش,2020-03-17,ايران تطلب قرضا من #IMF بقيمة 5 بليون $ لمواجه...,#بومبيو;#عيب;#يران;#ايران;#العميل_الفاخوري;#كو...,Iran requests a $ 5 billion #IMF loan to meet ...,-0.033333,النشبة,33.4153,35.7472,Negative,64000,حاصبيّا,Hasbaya,6,نبطية,Nabatiyeh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47863,salehaboushaker,2020-03-07,مش بس مات الحلم مات كل شي منحبو بهيدا البلد اص...,#لبنان_يثور;#لبنان;#تعليق_الدراسة;#كورونا_لبنان,"Not, but the dream died, everything died. We l...",0.150000,بيروت,33.8719,35.5097,Positive,31000,بيروت,Beirut,3,بيروت,Beirut
48004,aboufir53160561,2020-03-05,"اربع اصابات بفيروس كورونا بسجن رومية""مبنى د""!و...",,"Four cases of Corona virus in Roumieh prison ""...",0.000000,بيروت,33.8719,35.5097,Neutral,31000,بيروت,Beirut,3,بيروت,Beirut
48591,tareklebanon1,2020-02-26,"المعارضة الإيرانية: انتشار "" #كورونا في سجني إ...",#كورونا,"The Iranian opposition: The spread of ""# Coron...",-0.433333,الشربينة,34.1178,35.8597,Negative,56000,جبيل,Jbeil,5,جبل لبنان,Mount Lebanon
48624,yorgoelbittar,2020-02-25,النائب الإصلاحي عن مدينة طهران محمود صادقي يعل...,,"The reformist deputy for the city of Tehran, M...",-0.100000,ايا,34.2403,35.7822,Negative,74000,البترون,El Batroun,7,الشمال,North
