In [7]:
import os
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import igraph as ig
import fasttext
from huggingface_hub import hf_hub_download
import re
import json
import csv

In [9]:
# Working directory: the scraped `complete_<channel>...` spreadsheets are read
# from here and the result files are written here.
# Set the TELESCRAP_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
respath = os.environ.get('TELESCRAP_DIR', 'C:\\Users\\barto\\telescrap\\') #homedir
# Other cells build paths with plain `respath + name`, so a trailing
# separator is required.
if not respath.endswith(('\\', '/')):
    respath += os.sep
# Not referenced by the cells visible here; kept for code elsewhere in the notebook.
file_name = 'Test' # @param {type:"string"}


In [11]:
# Fetch the fastText language-identification weights from the Hugging Face Hub
# and load them; `model` is used further down to label message languages.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model = fasttext.load_model(model_path)

In [13]:
# Telegram channels to analyse, mapped to a category id (see `categories`).
# Each name appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry. Insertion order is the processing order.
channels = {
    '@tiesiogiaiisukrainos': 4,
    '@varlinas': 5,
    '@volna_lt': 2,
    '@novayaklaipeda': 7,
    '@euromore': 1,
    '@visaginasnews': 7,
    '@karas_ukrainoje': 4,
    '@n_aujenoschat': 5,
    '@sputniknews_lt': 1,
    '@lithuanian': 3,
    '@lithuanianlegio': 3,
    '@lithuanianews24': 2,
    '@vilnius_lithuania': 7,
    '@aktyvusklubas2': 3,
    '@rudelfi': 2,
    '@slava_ukraini_ltu': 4,
    '@infalt': 5,
    '@radiorlt': 2,
    '@NorthernFront_NATO': 1,
    '@matricalietuvoje': 6,
    '@vardantosLietuvos': 3,
    '@litovecrubitpravdu': 1,
    '@politikai': 5,
    '@sapereaudelt': 5,
    '@atsibudimas': 5,
    '@Kritinismastytojas': 5,
    '@n_aujienos': 5,
    '@neadeqatus': 5,
    '@hmelisozreli': 1,
    '@klaipedaonline': 7,
    '@Jaunieji_Partizanai': 3,
    '@naujoji_pasaulio_tvarka': 6,
    '@karas_Z': 1,
    '@seimusajudistesiasi': 5,
    '@seimusajudis2021': 5,
    '@komentarastv': 5,
    '@ArunasGl': 6,
    '@mkzmedia': 4,
    '@ValomDangu': 6,
    '@fak_tai': 5,
    '@mariusjonaitis': 3,
    '@Karamzin_branch': 1,
    '@Karas_ukraina_chronologijos': 4,
    '@karas_ukrainoje_chronologija': 4,
    '@RFULietuviskai': 4,
    '@Izraeliokaras': 2,
    '@zingeris': 2,
    '@ZoroKanalas': 5,
    '@ekspertaiTelegram': 5,
    '@lrtlt': 2,
    '@baltnews': 2,
    '@delfi_lietuva': 2,
    '@zhiznvlitve': 2,
    '@nexta_live': 2,
    #'@SolovievLive': 1,
    '@rubaltic': 1,
    '@lietuvasu': 1,
}

# Category id -> label. Id 0 is the default 'channel type' of a peer that was
# never identified as one of the channels above.
categories = {
    0: "user",
    1: "prorussian",
    2: "informational",
    3: "nationalistic",
    4: "proukrainian",
    5: "other",
    6: "conspiracy",
    7: "local",
}
                  

In [15]:
def remove_emojis(text):
    """Return `text` with every run of characters from four emoji ranges deleted.

    Only the code-point ranges listed below are stripped; any other symbol is
    left untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    emoji_run = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)

In [17]:
def create_peer(peer_type):
    """Build a fresh statistics record for one peer.

    Args:
        peer_type: value stored under 'type' (type of the peer: channel or user).

    Returns:
        A new dict with descriptive fields followed by counters that all start
        at 0. Every call returns independent 'active in' / 'languages' containers.
    """
    peer = {
        'type': peer_type,
        'channel name': "None",   # name of the channel
        'channel type': 0,        # type of the channel: 0 if user
        'active in': set(),       # channels and groups where the peer is active
        'languages': {},          # languages used by the channel or peer
    }
    zeroed_counters = (
        'no of messages',    # number of published messages
        'no of comments',    # number of replies the peer's posts got
        'no of replies',     # number of replies the peer gave
        'no of forwards',    # how many times the peer forwarded posts
        'no of forwarded',   # how many times the peer's posts were forwarded
        'no of views',       # how many times posts were viewed
        'no of shares',      # how many times posts were shared
        'no of reactions',   # number of reactions to the posts
    )
    for counter in zeroed_counters:
        peer[counter] = 0
    return peer

In [19]:
def create_edges_metadata(source_target):
    """Build the zeroed metadata record for one directed edge.

    Args:
        source_target: edge key of the form '<source> <target>' (two
            whitespace-separated tokens).

    Returns:
        Dict holding both endpoints plus per-edge counters starting at 0.

    Raises:
        ValueError: if the key does not split into exactly two tokens.
    """
    source, target = source_target.split()
    metadata = dict(source=source, target=target)
    metadata.update({
        "no of reactions": 0,   # number of reactions on this edge
        "no of replies": 0,     # number of replies on this edge
        "no of forwards": 0,    # number of forwards on this edge
    })
    return metadata

In [21]:
# Accumulators filled by the aggregation loop below. That loop only ever adds
# to them, so re-run this cell first whenever the loop is re-run.
peers_data = {}       # peer id -> record built by create_peer()
edges_metadata = {}   # '<source> <target>' -> record built by create_edges_metadata()
edgelist = list()     # one '<source> <target>' entry per observed interaction

In [23]:
def _peer_key(raw_id):
    """Return the canonical string key for a peer ID read from a spreadsheet cell.

    IDs may arrive as ints, as floats (an integer column that contains empty
    cells is loaded as floats) or as text such as '123.0'. All of these map to
    the same key ('123'), so a peer is never stored under two different keys
    depending on which column it was read from.

    Raises:
        ValueError: if `raw_id` is not numeric.
    """
    try:
        return str(int(raw_id))
    except ValueError:
        # Text like '123.0' is not accepted by int() directly.
        return str(int(float(raw_id)))


def _as_count(value):
    """Return a numeric counter cell as int, treating an empty cell as 0."""
    return 0 if pd.isnull(value) else int(value)


def _owning_channel(filename):
    """Return the configured channel a result file belongs to, or None.

    A file matches a channel when its name contains 'complete_<channel>'.
    Some channel names are prefixes of others (e.g. '@lithuanian' and
    '@lithuanianlegio'), so the longest matching name wins; otherwise such a
    file would be processed once per matching channel and counted repeatedly.
    """
    matches = [name for name in channels if f'complete_{name}' in filename]
    return max(matches, key=len) if matches else None


def _get_peer(peer_key, peer_type):
    """Return the record of `peer_key`, creating it with `peer_type` if unseen."""
    if peer_key not in peers_data:
        peers_data[peer_key] = create_peer(peer_type)
    return peers_data[peer_key]


def _record_edge(source, target, counter):
    """Append the edge source -> target to `edgelist` and bump its `counter`."""
    edge = source + ' ' + target
    edgelist.append(edge)
    if edge not in edges_metadata:
        edges_metadata[edge] = create_edges_metadata(edge)
    edges_metadata[edge][counter] += 1


def _ingest_row(row, channel):
    """Fold one spreadsheet row (one message) into the global accumulators.

    Rows without an author ID are skipped. `channel` is the configured channel
    whose file the row was read from.
    """
    if pd.isnull(row['Author ID']):
        return
    post_author = _peer_key(row['Author ID'])
    author = _get_peer(post_author, row['Author type'])
    if row['Author type'] == "PeerChannel":
        author['channel type'] = channels[channel]
        author['channel name'] = channel
    author['active in'].add(row['Channel'])

    if not pd.isnull(row['Content']):
        # str(): a cell holding only a number is not loaded as text.
        # Lines are joined with a space so that words on neighbouring lines are
        # not fused together before language detection (the model is given a
        # single line).
        text_cleaned = remove_emojis(" ".join(str(row['Content']).splitlines()))
        if text_cleaned.strip():  # nothing to classify in emoji-only / blank text
            labels = model.predict(text_cleaned, k=1)[0]
            if len(labels):
                predlang = labels[0]
                author['languages'][predlang] = author['languages'].get(predlang, 0) + 1

    author['no of messages'] += 1
    author['no of views'] += _as_count(row['Views'])
    author['no of shares'] += _as_count(row['Shares'])
    # Only the purely numeric tokens of the 'Reactions' cell are summed.
    author['no of reactions'] += sum(int(i) for i in str(row['Reactions']).split() if i.isdecimal())

    if not pd.isnull(row['Reply to author ID']):
        reply_peer = _peer_key(row['Reply to author ID'])
        _record_edge(post_author, reply_peer, "no of replies")
        author['no of replies'] += 1
        replied_to = _get_peer(reply_peer, row['Reply to author type'])
        replied_to['no of comments'] += 1
        replied_to['active in'].add(row['Channel'])

    if not pd.isnull(row['Forwarded from author ID']):
        fwd_peer = _peer_key(row['Forwarded from author ID'])
        _record_edge(post_author, fwd_peer, "no of forwards")
        author['no of forwards'] += 1
        forwarded_from = _get_peer(fwd_peer, row['Forwarded from author type'])
        forwarded_from['no of forwarded'] += 1
        forwarded_from['active in'].add(row['Channel'])

    if not pd.isnull(row['Reactions IDs']):
        for react_peer in str(row['Reactions IDs']).split():
            react_peer = _peer_key(react_peer)
            _record_edge(react_peer, post_author, "no of reactions")
            # NOTE(review): the peer *ID* is stored as the peer 'type' here
            # (behaviour kept from the original); the sheet has no type column
            # for reacting peers - confirm which value is intended.
            reactor = _get_peer(react_peer, react_peer)
            reactor['active in'].add(row['Channel'])


# Assign every result file to exactly one channel, keeping directory order.
files_by_channel = {}
for filename in os.listdir(respath):
    owner = _owning_channel(filename)
    if owner is not None:
        files_by_channel.setdefault(owner, []).append(filename)

# This loop only adds to peers_data / edges_metadata / edgelist; reset them
# before running it again, otherwise every count is doubled.
for channel in channels:
    for filename in files_by_channel.get(channel, []):
        df = pd.read_excel(os.path.join(respath, filename))
        for index, row in df.iterrows():
            _ingest_row(row, channel)

In [24]:
# Save the per-peer statistics as JSON. `default=list` converts the
# 'active in' sets, which JSON cannot represent directly, into lists.
peers_json = respath + 'telegraph_peers_data.json'
with open(peers_json, mode='w', encoding='utf-8') as peers_file:
    json.dump(
        peers_data,
        peers_file,
        ensure_ascii=False,
        indent=4,
        default=list,
    )

In [25]:
# Save the per-edge counters as JSON, formatted like the peers file.
edges_json = respath + 'telegraph_edges_metadata.json'
with open(edges_json, mode='w', encoding='utf-8') as edges_file:
    json.dump(
        edges_metadata,
        edges_file,
        ensure_ascii=False,
        indent=4,
        default=list,
    )

In [26]:
# Save the raw edge list: one '<source> <target>' entry per line, repeated
# once for every observed interaction.
edgelist_path = respath + 'telegraph_edgelist.data'
with open(edgelist_path, mode='w', encoding='utf-8') as edge_file:
    edge_file.writelines(f"{edge}\n" for edge in edgelist)