In [19]:
%reset

In [21]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import umap # dimensionality reduction
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import hdbscan
from deep_translator import GoogleTranslator

# Preparing Dataset

In [70]:
dataset=pd.read_csv("C:/Users/rbw19/Downloads/dataset.csv",on_bad_lines='skip')
ref=dataset
dataset=dataset[["title",'category_id','contents',"agency_id",'tweet_text']]
dataset.head(5)

Unnamed: 0,title,category_id,contents,agency_id,tweet_text
0,سمو ولي العهد يغادر مصر بعد المشاركة في مؤتمري...,أخبار محلية,\n بحفظ الل...,الوطن,
1,"\r\n ""التأم...",أخبار محلية,\n\n\n\n\n\n \n\n\n,المؤسسة العامة للتأمينات الاجتماعية,
2,بسم اللّه الرحمن الرحيم,أخبار محلية,,جهاز متابعة الأداء الحكومي,
3,مجلس الوزراء يعتمد مراسيم تعيين قياديين,أخبار محلية,\n (كونا) -...,الوطن,
4,البيدان لـ «الوطن»: المنشآت وتطوير القوانين وم...,أخبار محلية,\n اعرب مدي...,الوطن,


In [27]:
dataset.shape

(1421, 5)

In [28]:
dataset.fillna("",inplace=True)

In [29]:
exp='[\u0627-\u064aA-Za-z]+' #re to get arabic and english , i had eliminated numbers and special characters because they dont have much influence 
dataset["title"]=dataset["title"].apply(lambda data : " ".join(re.findall(exp,data)))
dataset['category_id']=dataset["category_id"].apply(lambda data: " ".join(re.findall(exp,data)) )
dataset["contents"]=dataset['contents'].apply(lambda data : " ".join(re.findall(exp,data)))
dataset["agency_id"]=dataset["agency_id"].apply(lambda data : " ".join(re.findall(exp,data)))
dataset["tweet_text"]=dataset["tweet_text"].apply(lambda data: " ".join(re.findall(exp,data)))


In [30]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    # text = text.lower()
    # text = text.replace('x', '')
    return text

dataset["title"]=dataset["title"].apply(lambda data:cleanText(data))
dataset["contents"]=dataset["contents"].apply(lambda data:cleanText(data))
dataset["category_id"]=dataset["category_id"].apply(lambda data:cleanText(data))
dataset["agency_id"]=dataset["agency_id"].apply(lambda data:cleanText(data))
dataset["tweet_text"]=dataset["tweet_text"].apply(lambda data:cleanText(data))

In [58]:
dataset

Unnamed: 0,title,category_id,contents,agency_id,tweet_text
0,سمو ولي العهد يغادر مصر بعد المشاركة في م تمري...,خبار محلية,بحفظ الله ورعايته غادر ممثل حضرة صاحب السمو مي...,الوطن,
1,الت مينات تشارك في الاجتماع الحادي والعشرين لر...,خبار محلية,,الم سسة العامة للت مينات الاجتماعية,
2,بسم الل ه الرحمن الرحيم,خبار محلية,,جهاز متابعة ال دا الحكومي,
3,مجلس الوزرا يعتمد مراسيم تعيين قياديين,خبار محلية,كونا علن نا ب ر يس مجلس الوزرا ووزير الدولة لش...,الوطن,
4,البيدان لـ الوطن المنش ت وتطوير القوانين ومعال...,خبار محلية,اعرب مدير عام هي ة الشباب والرياضة يوسف البيدا...,الوطن,
...,...,...,...,...,...
1416,التقدم العلمي تعزز التعاون ال كاديمي والبحثي ب...,خبار العلوم,زار دولة الكويت وفد من جامعة كاليفورنيا بيركلي...,م سسة الكويت للتقدم العلمي,
1417,استكشاف الفضا ل لهام ال جيال القادمة,خبار العلوم,خاطب الحضور من الناش ة والشباب الذين مل وا قاع...,م سسة الكويت للتقدم العلمي,
1418,م سسة الكويت للتقدم العلمي في معرض الشارقة الد...,خبار العلوم,تشارك م سسة الكويت للتقدم العلمي في معرض الشار...,م سسة الكويت للتقدم العلمي,
1419,م سسة التقدم العلمي تعلن سما الفا زين بجوا زها...,خبار العلوم,علنت م سسة الكويت للتقدم العلمي اليوم سما الفا...,م سسة الكويت للتقدم العلمي,


In [31]:
data=dataset["title"]+dataset["contents"]+dataset["category_id"]+dataset["agency_id"]+dataset["tweet_text"]
data=data.to_list()

In [52]:
len(data)

1421

# Vectorizing

In [32]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" # this is already a pretrained model ,so need of saving this model with our data
model = hub.load(module_url)

In [33]:
embedding_gen = model(data)  #

In [34]:
embedding_gen.shape

TensorShape([1421, 512])

# Dimensionality Reduction

In [37]:
umap_embeddings = (umap.UMAP(n_neighbors=10, n_components=20, metric='cosine', 
                                random_state=24).fit_transform(embedding_gen))

In [38]:
print(type(umap_embeddings),'\n',umap_embeddings.shape,'\n',umap_embeddings)
print()

<class 'numpy.ndarray'> 
 (1421, 20) 
 [[10.390603   5.775942   3.3013356 ...  7.584819   7.4700317  6.118715 ]
 [10.9187     4.925012   3.203601  ...  7.6478996  6.8168926  4.6543584]
 [11.351933   4.14504    4.2030516 ...  7.614952   6.926056   4.985949 ]
 ...
 [10.376332   5.4243307  2.6391237 ...  7.6943326  6.542361   4.3580656]
 [10.747433   5.014059   2.963721  ...  7.6186256  6.471488   4.4896865]
 [10.740151   4.914826   2.954909  ...  7.9106426  6.637452   4.191994 ]]



# HDBScan

In [39]:
clusters = hdbscan.HDBSCAN(min_cluster_size = 3,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)
# hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
#                 gen_min_span_tree=False, leaf_size=40,metric='euclidean',
#                 min_cluster_size=5, min_samples=None, p=None)
print(clusters.labels_,len(clusters.labels_))
print(np.unique(clusters.labels_),'\n',len(np.unique(clusters.labels_)))

[ 2 -1 49 ... 22 21 -1] 1421
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98] 
 100


In [43]:
list(clusters.labels_)

[2,
 -1,
 49,
 -1,
 88,
 -1,
 -1,
 -1,
 57,
 4,
 4,
 -1,
 2,
 58,
 48,
 -1,
 73,
 30,
 -1,
 2,
 4,
 -1,
 -1,
 42,
 -1,
 86,
 4,
 -1,
 -1,
 83,
 -1,
 -1,
 48,
 4,
 -1,
 57,
 2,
 71,
 -1,
 -1,
 -1,
 -1,
 -1,
 60,
 2,
 78,
 7,
 -1,
 57,
 -1,
 -1,
 60,
 48,
 50,
 -1,
 70,
 -1,
 48,
 20,
 -1,
 77,
 -1,
 81,
 -1,
 6,
 30,
 26,
 75,
 -1,
 -1,
 64,
 48,
 83,
 79,
 -1,
 59,
 -1,
 -1,
 77,
 26,
 -1,
 54,
 58,
 -1,
 -1,
 2,
 -1,
 -1,
 55,
 20,
 -1,
 -1,
 -1,
 88,
 -1,
 -1,
 26,
 73,
 -1,
 -1,
 65,
 67,
 -1,
 -1,
 -1,
 -1,
 58,
 65,
 28,
 6,
 51,
 65,
 6,
 85,
 -1,
 -1,
 -1,
 -1,
 12,
 6,
 74,
 65,
 66,
 36,
 6,
 65,
 67,
 36,
 -1,
 -1,
 16,
 75,
 -1,
 36,
 -1,
 12,
 73,
 16,
 36,
 -1,
 63,
 36,
 -1,
 67,
 26,
 85,
 20,
 27,
 36,
 -1,
 67,
 -1,
 -1,
 -1,
 12,
 -1,
 -1,
 67,
 51,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 36,
 36,
 16,
 -1,
 52,
 26,
 65,
 29,
 -1,
 -1,
 88,
 19,
 54,
 36,
 6,
 20,
 -1,
 36,
 6,
 69,
 2,
 16,
 52,
 30,
 36,
 -1,
 6,
 -1,
 88,
 16,
 -1,
 41,
 65,
 77,
 -1,
 77,
 51,
 

In [44]:
cluster=[]
for idx in range(len(clusters.labels_)):
    if clusters.labels_[idx]==30: #taking dataponits of 10th cluster
        cluster.append(idx)

In [45]:
news_cluster=[]
for dp in cluster:
    news_cluster.append(GoogleTranslator(source='auto', target='en').translate(data[dp][:200])) #limiting to 200 --> for easy translation
    
news_cluster

['The agency discusses the latest developments of the railway network project His Excellency Sheikh Hamad Mishaal Al Hamad Al-Sabah, the head of the government’s follow-up agency, held a meeting with specialists in the General Authority for Roads and Land Transport to follow up on the latest developments of the network project',
 'Sheikh Hamad Al-Mishal, the head of the apparatus, presents recommendations to the Cabinet of Ministers to raise the level of the government',
 'The Legislative Council is discussing lifting the immunity of MP Muhannad Al-Sayer and a set of proposals for laws.',
 'Two people were injured in a six-way quarrel in Al-Jahra Hospital. Two people were moderately injured as a result of a quarrel that took place at dawn today between six people in front of Al-Jahra Hospital, and I was moved inside. A source said that two quarrels are being caught who escaped with a vehicle. A number was captured.',
 'Two expatriates came to Al-Baad for roaming with two air rifles. Th

In [60]:
# news_cluster=[]
ids=[]
for dp in cluster:
    # news_cluster.append(data[dp]) #limiting to 200 --> for easy translation
    # print(data.index(data[dp]))
    ids.append(data.index(data[dp]))
ids 

[17,
 65,
 190,
 254,
 283,
 298,
 378,
 391,
 514,
 538,
 596,
 730,
 850,
 862,
 890,
 1015,
 1104,
 1113,
 1123]

In [64]:
ref

[['post_id', 'title', 'category_id', 'contents', 'agency_id', 'tweet_text']]

In [117]:
cluster_30=pd.DataFrame(ref.iloc[ids])

In [118]:
def translator(data):
    return GoogleTranslator(source='auto', target='en').translate(data)

cluster_30["category_id"]=cluster_30["category_id"].apply(lambda data : translator(data))

In [120]:
cluster_30.to_csv("cluster_30.csv")

In [77]:
ref.iloc[ids]["category_id"].to_list()

for dp in ref.iloc[ids]["category_id"].to_list():
    print(GoogleTranslator(source='auto', target='en').translate(dp))

local news
local news
National Assembly News
Legal and security news
Legal and security news
Legal and security news
Economic news
Economic news
Art and culture news
Art and culture news
Art and culture news
News, meetings, programs and reports
Local and international news
Local and international news
Local and international news
Education news
community and activities
community and activities
Technology News


In [112]:
pd.Dataframe(ref.iloc[ids][["post_id","post_url","agency_id","contents"]]

Unnamed: 0,post_id,post_url,agency_id,contents
17,41,https://www.gpf.gov.kw/Ar/newsdetails.aspx?id=...,جهاز متابعة الأداء الحكومي,\r\n \n\n\r\n ...
65,206,https://www.gpf.gov.kw/Ar/newsdetails.aspx?id=...,جهاز متابعة الأداء الحكومي,\r\n \n\n\r\n ...
190,625,https://istqlalnews.com/?p=14367,جريدة الإستقلال الإلكترونية,تعقد لجنتان برلمانيتان اجتماعين اليوم الإثنين ...
254,649,https://www.cckwt.cc/%d8%a3%d9%85%d9%86/%d8%a5...,جرائم ومحاكم,أصيب شخصان بجروح متوسطة جراء مشاجرة وقعت فجر ا...
283,1058,https://www.alraqeeb.news/810,‏اخبار الرقيب نيوز,وأكدت الإدارة العامة للعلاقات والإعلام الأمني ...
298,1173,http://www.alhakea.com/word/?p=420614,جريدة الحقيقة,قالت الإدارة العامة للعلاقات والإعلام الأمني ب...
378,637,https://q8times.com/284094.html,كويت تايمز,ارتفع سعر برميل النفط الكويتي 46ر2 دولار ليبلغ...
391,725,https://pressnewskw.com/28655/,برس نيوز,انخفض سعر برميل النفط الكويتي 1.24 دولار ليبلغ...
514,239,http://fn1q8.com/?p=53370,فن ون,سعيدة جدًا بإختياري سفيرة لإكوال سبوتيفاي لشهر...
538,509,http://fn1q8.com/?p=53681,فن ون,كشف نقيب المهن التمثيلية في مصر الفنان د. أشرف...


# Cosine Similarity measeure to verify hdbscan

In [102]:
len(cluster)

19

In [103]:
cluster_30_embed=[]
for point in cluster:
    # print("\n umap_embedding",umap_embeddings[point],"index",data.index(data[point]))
    cluster_30_embed.append(umap_embeddings[point])
# cluster_30_embed
umap_embed_30=pd.DataFrame(cluster_30_embed)

In [105]:
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

In [106]:
similarity_table=cosine_similarity(umap_embed_30)

In [109]:
pd.DataFrame(similarity_table)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,0.999622,0.999954,0.999963,0.999774,0.999925,0.999911,0.999877,0.999789,0.999816,0.999787,0.999687,0.999714,0.999906,0.999756,0.999843,0.999684,0.999762,0.999816
1,0.999622,1.0,0.99943,0.999611,0.999869,0.999755,0.999716,0.999663,0.999862,0.999789,0.999953,0.999932,0.999991,0.999756,0.999877,0.99991,0.999928,0.999725,0.99995
2,0.999954,0.99943,1.0,0.999866,0.999607,0.999851,0.999845,0.999751,0.999654,0.999795,0.999625,0.999555,0.999533,0.999819,0.999577,0.999763,0.999603,0.999786,0.999705
3,0.999963,0.999611,0.999866,1.0,0.999829,0.999926,0.999885,0.999906,0.999839,0.999769,0.999769,0.9997,0.999712,0.999908,0.999833,0.9998,0.999617,0.999668,0.999777
4,0.999774,0.999869,0.999607,0.999829,1.0,0.999936,0.999731,0.999714,0.999912,0.999808,0.999854,0.999934,0.999916,0.999797,0.999938,0.999889,0.999817,0.999738,0.999892
5,0.999925,0.999755,0.999851,0.999926,0.999936,1.0,0.999832,0.99978,0.999879,0.999879,0.999818,0.999861,0.999829,0.999866,0.999865,0.999911,0.999799,0.999841,0.999884
6,0.999911,0.999716,0.999845,0.999885,0.999731,0.999832,1.0,0.999966,0.999889,0.999894,0.999879,0.999752,0.999776,0.999992,0.999842,0.999893,0.999781,0.999794,0.999878
7,0.999877,0.999663,0.999751,0.999906,0.999714,0.99978,0.999966,1.0,0.99987,0.999775,0.99985,0.999675,0.999735,0.999971,0.999852,0.9998,0.99965,0.999622,0.999801
8,0.999789,0.999862,0.999654,0.999839,0.999912,0.999879,0.999889,0.99987,1.0,0.999914,0.99991,0.999932,0.9999,0.999936,0.999987,0.999929,0.999865,0.999802,0.999923
9,0.999816,0.999789,0.999795,0.999769,0.999808,0.999879,0.999894,0.999775,0.999914,1.0,0.999846,0.999899,0.999824,0.999907,0.999846,0.999965,0.99993,0.99997,0.999929
