# Imports

In [71]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import json

# config

In [72]:
needed_months_data = ['08', '09']

# Data

In [73]:
df = pd.read_csv('../tweets_data.csv', header=0)
df.head()

Unnamed: 0,username,date,text,tags,translated_text,location,latitude,longitude,KADAA_ID,KADAA_AR,KADAA_EN,MOHAFAZA_ID,MOHAFAZA_AR,MOHAFAZA_EN
0,Fouad Badaro,2020-09-05,#Corona #COVIDー19 Official Numbers in Lebanon ...,"['#Corona', '#كورونا_لبنان', '#كورونا', '#Stay...",#Corona #COVIDー19 Official Numbers in Lebanon ...,بيروت,33.8719,35.5097,31000,بيروت,Beirut,3,بيروت,Beirut
1,الكابتين,2020-09-05,#كورونا_لبنان \n#coronavirus,"['#كورونا_لبنان', '#coronavirus']",# Corona_Lebanon\n#coronavirus,بيروت,33.8719,35.5097,31000,بيروت,Beirut,3,بيروت,Beirut
2,سفير الشمال,2020-09-05,إدارة منتجع “البالما” توضح.. ماذا قالت عن إصاب...,"['#لبنان', '#كورونا']","The management of the ""Palma"" resort explains ...",ارة,33.5403,35.38,81000,صيدا,Saida,8,الجنوب,South
3,Jihad,2020-09-05,عاجل: وزارة الصحة اللبنانية: 415 إصابة كورونا...,['#كورونا_لبنان'],Urgent: The Lebanese Ministry of Health: 415 C...,حالات,34.0828,35.6541,56000,جبيل,Jbeil,5,جبل لبنان,Mount Lebanon
4,Ali H. Merii,2020-09-05,أخيرا بكرا ٧ ايلول بتخلص التعبئة العامة وبترجع...,['#كورونا_لبنان'],"Finally, the 7 of September gets rid of the ge...",المشحاة,34.05,35.725,55000,كسروان,Kesserouane,5,جبل لبنان,Mount Lebanon


In [74]:
df[['year', 'month', 'day']] = df.date.str.split('-', expand=True)

# Functions

In [75]:
def tf_idf(data, stopwords=None, max_features=1000, ngram=(2, 3)):
    tf_Idf = TfidfVectorizer(stop_words=stopwords, max_features=max_features, ngram_range=ngram)
    tf_Idf_fit = tf_Idf.fit_transform(data)
    feature_names = tf_Idf.get_feature_names()
    
    dense = tf_Idf_fit.todense()
    denselist = dense.tolist()

    tf_idf_table = pd.DataFrame(denselist, columns=feature_names)

    tf_idf_result = pd.DataFrame()
    tf_idf_result[['topic', 'score']] = tf_idf_table.mean().reset_index()
    tf_idf_result = tf_idf_result.sort_values('score', ascending=False).reset_index(drop=True)
    return tf_idf_result
        
not_accepted = re.compile(r'\w*\d+\w*|http|twitter|%|com')  # numeric & non alphanumeric
def accepted_word(w):
    return len(w) > 2 and re.search(not_accepted, w) is None

# Filter Data

In [76]:
data = df[df.month.isin(needed_months_data)].text

In [77]:
data_removed = data.apply(lambda s: " ".join([w for w in s.split() if not accepted_word(w)]))
stopwords = " ".join(data_removed).split()

stopwords += ['d8', 'd9', '08', '83', '86', 'a7', '84', 'a8', 'b1', '88', 'http', 'https', 'twitter', 'com', '']

with open('..\\..\\stop-words\\english.txt', encoding='utf-8') as f:
    stopwords += f.read().split('\n')

with open('..\\..\\stop-words\\arabic.txt', encoding='utf-8') as f:
    stopwords += f.read().split('\n')

In [78]:
len(stopwords)

26289

# get mo7afazat topics

In [79]:
mo7afazat = df.groupby(['MOHAFAZA_ID', 'MOHAFAZA_AR', 'MOHAFAZA_EN', 'month'])

mo7afazat_topics = {}

for g_key, g_data in mo7afazat:
    id_, ar, en, m = g_key
    if m in needed_months_data:
        topics_bigram = tf_idf(g_data.text, stopwords=stopwords, ngram=(2, 2)).topic.loc[:25]
        topics_trigram = tf_idf(g_data.text, stopwords=stopwords, ngram=(3, 3)).topic.loc[:25]
        
        mo7afazat_topics["_".join([str(id_), ar, en, m])] = list(topics_bigram) + list(topics_trigram)
        print(ar, 'Topics Done!!')

  'stop_words.' % sorted(inconsistent))


عكار Topics Done!!
عكار Topics Done!!
بعلبك - هرمل Topics Done!!
بعلبك - هرمل Topics Done!!
بيروت Topics Done!!
بيروت Topics Done!!
البقاع Topics Done!!
البقاع Topics Done!!
جبل لبنان Topics Done!!
جبل لبنان Topics Done!!
نبطية Topics Done!!
نبطية Topics Done!!
الشمال Topics Done!!
الشمال Topics Done!!
الجنوب Topics Done!!
الجنوب Topics Done!!


In [80]:
with open('mo7afazat_topics.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(mo7afazat_topics, indent=2, ensure_ascii=False))

# get kadaas topics

In [81]:
kadaas = df.groupby(['KADAA_ID', 'KADAA_AR', 'KADAA_EN', 'month'])

kadaas_topics = {}

for g_key, g_data in kadaas:
    id_, ar, en, m = g_key
    if m in needed_months_data:
        topics_bigram = tf_idf(g_data.text, stopwords=stopwords, ngram=(2, 2)).topic.loc[:25]
        topics_trigram = tf_idf(g_data.text, stopwords=stopwords, ngram=(3, 3)).topic.loc[:25]

        kadaas_topics["_".join([str(id_), ar, en, m])] = list(topics_bigram) + list(topics_trigram)
        print(ar, 'Topics Done!!')

عكّار Topics Done!!
عكّار Topics Done!!
الهرمل Topics Done!!
الهرمل Topics Done!!
بعلبك Topics Done!!
بعلبك Topics Done!!
بيروت Topics Done!!
بيروت Topics Done!!
زحلة Topics Done!!
زحلة Topics Done!!
البقاع الغربي Topics Done!!
راشيّا Topics Done!!
راشيّا Topics Done!!
بعبدا Topics Done!!
بعبدا Topics Done!!
المتن Topics Done!!
المتن Topics Done!!
الشّوف Topics Done!!
الشّوف Topics Done!!
عاليه Topics Done!!
عاليه Topics Done!!
كسروان Topics Done!!
كسروان Topics Done!!
جبيل Topics Done!!
جبيل Topics Done!!
النبطيّة Topics Done!!
النبطيّة Topics Done!!
بنت جبيل Topics Done!!
بنت جبيل Topics Done!!
مرجعيون Topics Done!!
مرجعيون Topics Done!!
حاصبيّا Topics Done!!
حاصبيّا Topics Done!!
طرابلس Topics Done!!
طرابلس Topics Done!!
الكورة Topics Done!!
الكورة Topics Done!!
زغرتا Topics Done!!
زغرتا Topics Done!!
البترون Topics Done!!
البترون Topics Done!!
بشرّي Topics Done!!
بشرّي Topics Done!!
المنية-الضنّية Topics Done!!
المنية-الضنّية Topics Done!!
صيدا Topics Done!!
صيدا Topics Done!!
جزّي

In [82]:
with open('kadaas_topics.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(kadaas_topics, indent=2, ensure_ascii=False))