# Imports

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# 1. Load Data

In [2]:
# Read the dataset from all.csv; header=0 takes the column names from the first row.
df = pd.read_csv('all.csv', header=0)

# 2. Pre-Processing
Remove tokens that only add noise to the analysis (listed below).

## 2.1 Remove:
- Numbers
- Words of Length <= 2

In [3]:
# Matches any token that contains at least one digit.
numeric = re.compile(r'\w*\d+\w*')


def accepted_word(w):
    """Return True when `w` is longer than two characters and contains no digit."""
    if len(w) <= 2:
        return False
    return numeric.search(w) is None

In [4]:
def _keep_accepted_words(text):
    """Rebuild `text` from only the whitespace-separated words that pass accepted_word."""
    return " ".join(filter(accepted_word, text.split()))


data = df.text.apply(_keep_accepted_words)

In [5]:
def accepted(t):
    """Label `t` 'accepted' when every whitespace-separated word passes accepted_word, else 'not accepted'."""
    if all(accepted_word(w) for w in t.split()):
        return 'accepted'
    return 'not accepted'


# Sanity check: tally the labels over the filtered texts.
data.apply(accepted).value_counts()

accepted    51683
Name: text, dtype: int64

# 3. TF-IDF
A preliminary TF-IDF pass to extract the initial vocabulary, which is then mined for stopwords so they can be filtered out in the later pass.

In [6]:
# TF-IDF Transformer (default settings)
tf_Idf = TfidfVectorizer()

# Fit the vocabulary on the filtered texts and transform them in one step
tf_Idf_fit = tf_Idf.fit_transform(data)

# Get Features (Columns).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and convert its ndarray to a plain list so
# feature_names keeps the type the later cells were written against; fall back
# to the old method only on scikit-learn versions that predate the new one.
if hasattr(tf_Idf, 'get_feature_names_out'):
    feature_names = tf_Idf.get_feature_names_out().tolist()
else:
    feature_names = tf_Idf.get_feature_names()

In [7]:
# Size of the vocabulary extracted by the vectorizer.
len(feature_names)

107234

In [8]:
# Wrap the vocabulary in a Series so it can be filtered with the .str accessor.
words = pd.Series(feature_names)

# 4. Mine Stopwords

In [9]:
# Stopword accumulator; later cells append to this list.
stopwords = []

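## 4.1 Words of length = 2 are normally unnecessary
Two-character tokens still show up here even though step 2.1 dropped short words: that filter works on whitespace-separated words, whereas `TfidfVectorizer` additionally splits on punctuation (e.g. `co-op` becomes `co` and `op`).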

In [10]:
# Keep only the vocabulary entries that are exactly two characters long.
is_two_chars = words.str.len().eq(2)
_2_char_words = words[is_two_chars]
_2_char_words.count()

810

In [11]:
# Print every two-character token, one per line, for manual inspection.
for w in _2_char_words:
    print(w)

__
_ن
ab
ac
ad
ae
af
ag
ai
al
am
an
ap
ar
as
at
au
av
aw
ba
bc
be
bg
bi
br
bs
bt
by
bz
ca
cc
cd
ce
cf
ch
cj
ck
cl
co
ct
cu
cv
cw
da
de
dh
di
dm
do
dr
du
dw
dé
eb
ed
eh
el
em
en
ep
er
es
et
eu
ex
ey
fb
fd
fe
ff
fh
fi
fl
fm
fo
fp
fr
ft
fu
ga
gb
gd
gf
gg
gi
gl
gm
go
gp
gs
ha
hd
he
hh
hi
hk
hm
hr
ht
ic
id
ie
if
ig
ii
il
im
in
io
ip
iq
ir
is
it
iç
ja
jo
jr
ke
ki
km
ks
kw
ky
la
lb
lc
ld
le
li
ll
ln
lr
lt
lu
ly
ma
mc
md
me
mf
mi
mj
ml
mn
mo
mp
mr
ms
mt
my
na
nc
nd
ne
ng
nh
ni
nj
no
ns
nt
nu
nw
ny
nz
ob
oc
of
oh
ok
om
on
op
or
ot
ou
ow
oz
où
pa
pd
pe
ph
pk
pl
pm
pr
ps
pt
qc
qs
qt
qu
rb
rd
re
rh
ri
rm
rn
rp
rs
rt
sa
sc
se
sf
sg
sh
si
so
sp
sq
sr
st
su
sw
sx
tb
tc
te
tf
th
ti
tk
tl
tn
to
tr
tu
tv
tw
tz
ua
ui
uk
ul
un
up
us
va
vb
vc
ve
vf
vi
vm
vp
vs
vt
wa
wb
we
wk
wo
ws
ya
yi
yo
za
ze
ım
آب
آت
آخ
آس
آع
آل
آم
آن
آه
آي
أب
أث
أج
أح
أد
أر
أز
أس
أك
أل
أم
أن
أو
أي
ؤك
ؤم
إت
إد
إذ
إس
إل
إم
إن
إي
ئب
ئس
اء
اأ
اؤ
ائ
اب
اة
ات
اج
اح
اخ
اد
اذ
ار
از
اس
اص
اط
اع
اغ
اف
اق
اك
ال
ام
ان
اه
او
اي
بأ
بئ
با
بة
بت
بث
ب

In [12]:
# Treat every two-character token as a stopword.
# NOTE: re-running this cell appends the same tokens again.
stopwords.extend(_2_char_words)
len(stopwords)

810

## 4.2 Check words of length = 3  

In [13]:
# Keep only the vocabulary entries that are exactly three characters long.
is_three_chars = words.str.len().eq(3)
_3_char_words = words[is_three_chars]
_3_char_words.count()

4970

In [14]:
# Print every three-character token, one per line, for manual inspection.
for w in _3_char_words:
    print(w)

_rn
_عم
_مع
aac
aal
aam
aan
abc
abd
abe
abi
abl
abp
abs
abt
abu
aca
acc
ace
ach
aci
acl
acm
aco
acp
acs
act
add
ade
ads
adt
adı
aec
aed
aei
aeo
afo
afp
aft
age
ago
aha
aid
aim
ain
air
ait
aja
aje
aka
akh
akl
ako
ale
alf
ali
all
alo
alp
als
alt
aly
ama
amb
amd
ami
amo
amp
amr
ana
and
ang
ann
ano
ans
anu
anw
any
aon
aot
apa
app
apr
apt
ara
arc
are
ark
arm
art
arw
arz
asa
asc
ash
ask
asp
ass
así
atc
atm
att
atv
atx
aub
auc
aue
auf
aug
auh
auj
aus
aux
avd
ave
awe
axe
aya
aye
azm
baa
bab
bac
bad
bae
bag
ban
bar
bas
bat
bau
bay
bbc
bbq
bcg
bcl
bcp
bcs
bcz
bda
bde
bdi
bdl
bds
beb
bec
bed
beg
bei
bek
bel
ben
bet
bey
bge
bhe
bho
bhw
bid
big
bil
bin
bio
bir
bit
biz
bkl
bla
blc
blg
blm
bln
bls
blu
bmc
bmd
bme
bmi
bmj
bmt
bob
boo
bot
bou
bow
box
boy
bpa
bpi
bra
brb
bre
bro
brs
bsa
bsc
bsn
bso
bss
btc
bts
btw
bug
bus
but
buy
bye
bzv
c_h
caf
cam
can
cap
car
cas
cat
cbd
cbm
cbt
cca
ccc
ccm
ccp
ccs
cct
ccu
cda
cdc
cdl
cdn
cds
cee
cen
ceo
ces
cet
cfo
cfr
cgi
chi
chp
chs
chu
cio
cjd
cks
cls
cma
cmc
cme


حقي
حكا
حكة
حكت
حكم
حكن
حكى
حكي
حلا
حلب
حلة
حلت
حلف
حلل
حلم
حلو
حلي
حما
حمد
حمر
حمش
حمص
حمل
حمو
حمى
حنا
حنة
حنت
حنك
حهم
حوا
حوت
حوش
حول
حوى
حيا
حيب
حية
حيث
حيذ
حيز
حيص
حيط
حيف
حيل
حين
حيي
خاذ
خاص
خاض
خاف
خال
خام
خبأ
خبر
خبز
خبص
خبط
خبي
ختل
ختم
خجل
خدع
خدم
خدو
خدي
خذو
خذي
خرا
خرب
خرج
خرق
خرى
خسب
خسر
خشب
خصب
خصص
خصك
خصم
خصن
خصو
خضر
خضع
خضم
خطأ
خطئ
خطا
خطب
خطة
خطر
خطط
خطف
خطه
خطى
خطي
خظر
خفة
خفت
خفض
خفف
خفو
خفي
خلا
خلت
خلص
خلط
خلع
خلف
خلق
خلل
خلو
خلى
خلي
خمر
خمس
خود
خوش
خوض
خوف
خول
خير
خيط
خيف
خيك
خيم
خيه
خيو
خيي
دأب
داء
دات
دار
داع
داف
داق
دام
دان
داي
دبة
دبت
دبر
دبس
دبي
دجل
دخل
دخن
درب
درس
درك
دعا
دعت
دعس
دعم
دعي
دفة
دفع
دفن
دفي
دقة
دقت
دقق
دقن
دقه
دقو
دلع
دلو
دلي
دمت
دمج
دمر
دمع
دمه
دمي
دنا
دنب
دنم
دني
دها
دهب
دهم
دهن
دوا
دوب
دوة
دود
دور
دوز
دول
دوم
دون
ديب
ديد
دير
ديل
دين
ديو
ذئب
ذاب
ذات
ذاق
ذاك
ذبح
ذبي
ذرة
ذعر
ذقن
ذكر
ذكى
ذكي
ذلا
ذلة
ذلك
ذمة
ذمت
ذنب
ذها
ذهب
ذوا
ذوق
ذوي
ذيل
ذين
رأس
رأى
رأي
رؤس
رئة
رئس
راء
رات
راح
راد
راس
رام
ران
راي
ربا
ربة
ربح
ربط
ربع
ربك
ربن
ربه
ربو
ربى
ربي
رتب
رتش


## *Observation*:
### `Some 3-character words are useful while most are not, so they are not added to the stopwords in bulk`

# 5. Add external stopwords

In [15]:
def _read_stopword_file(path):
    """Return the non-blank, whitespace-stripped lines of the UTF-8 text file at `path`."""
    with open(path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]


# The previous read().split('\n') added an empty-string entry for the trailing
# newline and for blank lines, and kept stray surrounding whitespace, so those
# entries could never match a token. Both are dropped here.
for stopword_path in ('../stop-words/arabic.txt', '../stop-words/english.txt'):
    stopwords += _read_stopword_file(stopword_path)

# 6. Group Data & Apply TF-IDF with our new stopwords

In [25]:
# Swap the raw text column for its filtered version, then group the rows by MOHAFAZA_AR.
df['text'] = data
mo7afazat = df.groupby('MOHAFAZA_AR')

In [29]:
for name_ar, mo7afaza_df in mo7afazat:
    # TF-IDF Transformer: unigrams only, ignoring the collected stopwords
    tf_Idf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 1))

    # Transform & Fit Data
    tf_Idf_fit = tf_Idf.fit_transform(mo7afaza_df.text)

    # Get Features (Columns).
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer get_feature_names_out() and convert its ndarray to a plain list so
    # feature_names keeps its previous type; fall back to the old method only on
    # scikit-learn versions that predate the new one.
    if hasattr(tf_Idf, 'get_feature_names_out'):
        feature_names = tf_Idf.get_feature_names_out().tolist()
    else:
        feature_names = tf_Idf.get_feature_names()

    # Convert the sparse matrix into a dense list of rows
    dense = tf_Idf_fit.todense()
    denselist = dense.tolist()

    # Stop after the first group: only one group is processed by this cell.
    break

In [30]:
# Vocabulary size for the single group processed by the loop above.
len(feature_names)

10239

In [32]:
# One row per document of the processed group, one column per vocabulary term.
tf_idf_table = pd.DataFrame(denselist, columns=feature_names)
tf_idf_table

Unnamed: 0,_الأمر,_سكرونا,aal,aalam,aam,aan,aana,aaronrichterman,abbot,abbott,...,يوقع,يول,يولول,يوما,يومها,يومهم,يوميا,يومين,ڤايروس,ڤيروس
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Average each term's TF-IDF weight over all documents, then list the terms
# from the highest to the lowest mean weight.
mean_weights = tf_idf_table.mean()
result = (
    mean_weights
    .rename_axis('word')
    .reset_index(name='rank')
    .sort_values('rank', ascending=False)
    .reset_index(drop=True)
)
result

Unnamed: 0,word,rank
0,كورونا,0.038529
1,كورونا_لبنان,0.033408
2,لبنان,0.028936
3,corona,0.019971
4,lebanon,0.017365
...,...,...
10234,batal,0.000078
10235,laeno,0.000078
10236,eshab,0.000078
10237,broh,0.000078
