In [22]:
import re

import pandas as pd 
import numpy as np
import swifter
import sklearn

# Record the library versions used for this run so the printed results can be reproduced.
print(f"pandas version is: {pd.__version__}\nnumpy version is: {np.__version__}\nsklearn version is {sklearn.__version__}")

pandas version is: 2.3.1
numpy version is: 2.3.1
sklearn version is 1.7.1


In [77]:
def load_and_filter_groups(filepath: str, min_members: int = 200, encoding: str = "utf-8-sig", errors: str = "replace", low_memory: bool = False):
    """
    Load the group dataset and drop unnamed, small, and duplicated groups.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read. It must provide the columns
        ``peer_name``, ``participants`` and ``peerid``.
    min_members : int
        Only groups with strictly more than this many participants are kept.
    encoding : str
        Text encoding handed to ``pandas.read_csv``.
    errors : str
        Decoding error policy, handed to ``pandas.read_csv`` as
        ``encoding_errors`` (e.g. ``"replace"`` swaps undecodable bytes for
        U+FFFD instead of raising).
    low_memory : bool
        Handed to ``pandas.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-null ``peer_name`` and ``participants > min_members``,
        keeping the first row for each ``peerid``.
    """
    # The reader options were previously accepted but never forwarded, so the
    # defaults declared in the signature had no effect on how the file was read.
    df = pd.read_csv(
        filepath,
        encoding=encoding,
        encoding_errors=errors,
        low_memory=low_memory,
    )
    df = df[df["peer_name"].notna()]
    df = df[df["participants"] > min_members]
    df = df.drop_duplicates(subset="peerid")
    return df


# Patterns are compiled once because clean_text is applied row by row to large frames.
_LINK_RE = re.compile(r'http\S+|www\S+|t\.me/\S+|telegram\.me/\S+')
_PHONE_RE = re.compile(r'(\+98|0098|0)?9\d{9}')
_MENTION_RE = re.compile(r'@\w+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    # Very broad range: it also removes every code point between U+24C2 and
    # U+1F251, not only pictographs.
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
# A "word" is a run of ASCII letters or characters from the U+0600-U+06FF block.
_WORD_RE = re.compile(r'[a-zA-Z\u0600-\u06FF]+')
# Words shorter than the minimum length that are nevertheless kept.
_MEANINGFUL_SHORT_WORDS = frozenset({'آب', 'فن', 'مه', 'لب'})
_MIN_WORD_LENGTH = 3


def clean_text(text):
    """
    Clean a given text by removing links, phone numbers, mentions, emojis,
    symbols, and everything that is not a Persian/English word.

    Parameters
    ----------
    text : Any
        Raw text. Null values (``None``, ``NaN``) yield ``""``; other
        non-string values are converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The surviving words joined by single spaces. Words shorter than three
        characters are dropped unless they are listed as meaningful short
        words. Arabic yeh/kaf are rewritten to their Persian forms.
    """
    if not isinstance(text, str):
        if pd.isnull(text):
            return ""
        # e.g. a numeric cell: coerce instead of raising TypeError in re.sub
        text = str(text)

    # Remove links, phone numbers and @mentions
    text = _LINK_RE.sub('', text)
    text = _PHONE_RE.sub('', text)
    text = _MENTION_RE.sub('', text)

    # Remove emojis
    text = _EMOJI_RE.sub('', text)

    # Normalize Arabic characters to Persian
    text = text.replace('ي', 'ی').replace('ك', 'ک')

    # Split into letter-only words first. Every other character (digits,
    # underscores, punctuation, symbols) acts as a separator, so no separate
    # symbol-stripping pass is needed.
    words = _WORD_RE.findall(text)

    # Filter short words only after the final split. Filtering beforehand let
    # fragments through: "ab12cd" passed as a 6-character token and was then
    # split into the two-letter words "ab" and "cd".
    kept = [
        word for word in words
        if len(word) >= _MIN_WORD_LENGTH or word in _MEANINGFUL_SHORT_WORDS
    ]

    return ' '.join(kept)


def is_persian_dominant(text, threshold: float = 0.3):
    """
    Tell whether characters from the U+0600-U+06FF block make up at least
    `threshold` of the word characters in `text`.

    Non-string input, blank strings, and strings without any word character
    all yield False.
    """
    if not isinstance(text, str):
        return False
    if not text.strip():
        return False

    # Denominator: every character matched by \w (letters, digits, underscore).
    word_char_count = sum(1 for _ in re.finditer(r'\w', text))
    if word_char_count == 0:
        return False

    # Numerator: characters in the Arabic/Persian Unicode block.
    persian_char_count = sum(1 for _ in re.finditer(r'[\u0600-\u06FF]', text))
    ratio = persian_char_count / word_char_count
    return ratio >= threshold


def is_really_persian(text):
    """
    Decide whether `text` looks Persian rather than Arabic.

    True only when the text contains at least one of the Persian marker
    letters and none of the Arabic-only marker characters. Non-string or
    blank input yields False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    # Letters taken as evidence of Persian
    persian_letters = "پچژگکیی"

    # Characters taken as evidence of Arabic; none may appear
    arabic_only_letters = "ةىيﻻـ"

    present = set(text)

    # No Persian marker at all -> not Persian
    if present.isdisjoint(persian_letters):
        return False

    # Persian marker found; accept only if no Arabic-only marker is present
    return present.isdisjoint(arabic_only_letters)

def clean_group_data(df):
    """
    Add cleaned name/description columns and keep only Persian groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Group table with the text columns ``peer_name`` and ``about``.
        The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns ``name_clean`` and ``about_clean``
        (both produced by ``clean_text``), restricted to rows where at least
        one of them passes ``is_really_persian`` and where both are non-empty.
    """
    # Build a new frame rather than assigning columns onto the caller's object,
    # so re-running the cell or reusing the input frame has no hidden side effects.
    cleaned = df.assign(
        name_clean=df['peer_name'].apply(clean_text),
        about_clean=df['about'].apply(clean_text),
    )

    # Keep only rows with real Persian (not Arabic) content.
    # NOTE(review): clean_text rewrites Arabic yeh/kaf to their Persian forms
    # before this check runs, so text that originally used the Arabic forms
    # appears to count as containing Persian letters here. Confirm whether the
    # check should run on the un-normalised text instead.
    looks_persian = (
        cleaned['name_clean'].apply(is_really_persian)
        | cleaned['about_clean'].apply(is_really_persian)
    )
    cleaned = cleaned[looks_persian]

    # Drop empty or NaN
    cleaned = cleaned.dropna(subset=['name_clean', 'about_clean'])
    non_empty = (cleaned['name_clean'].str.strip() != "") & (cleaned['about_clean'].str.strip() != "")
    cleaned = cleaned[non_empty]

    return cleaned


# Raw group table.
# NOTE(review): absolute, machine-specific path; other cells in this notebook
# use paths relative to ../data, so this cell will not run on another machine as-is.
filepath = "/mnt/d/Uni/thesis/data/idekav_subscription.csv"
# Named groups above the default member threshold, one row per peerid.
df = load_and_filter_groups(filepath)
# Same groups with cleaned text columns, restricted to Persian content.
df_clean = clean_group_data(df)


In [15]:


# File of the groups whose details we have (cleaned)
groups_df  = pd.read_csv("../data/processed/groups_clean.csv")

# Membership file (e.g. containing user_id and group_id)
membership_df  = pd.read_csv("/mnt/d/Uni/thesis/data/newsubtact.csv")


In [16]:
# Ids of the groups that survived cleaning; a set gives fast membership tests.
valid_group_ids = set(groups_df["peerid"])
# Keep only membership rows whose groupID is one of the cleaned groups.
filtered_membership = membership_df[membership_df["groupID"].isin(valid_group_ids)]


In [17]:
# Distinct group count in the membership table before and after restricting
# it to the cleaned groups, plus how many groups the filter removed.
total_before = membership_df["groupID"].nunique()
total_after = filtered_membership["groupID"].nunique()
print(f"تعداد گروه‌ها قبل از فیلتر: {total_before}")
print(f"تعداد گروه‌ها بعد از فیلتر: {total_after}")
print(f"چند گروه حذف شدند؟ {total_before - total_after}")


تعداد گروه‌ها قبل از فیلتر: 658037
تعداد گروه‌ها بعد از فیلتر: 201249
چند گروه حذف شدند؟ 456788


In [None]:
# filtered_membership.to_csv("../data/processed/membership_filtered.csv", index=False)


In [8]:
# df =pd.read_csv("/mnt/d/Uni/thesis/dissertation/telegram_community_detection/data/processed/groups_clean.csv")

In [20]:
# Reload the filtered membership table from disk. The cell that writes this
# file is commented out in this notebook, so the CSV must already exist from
# an earlier run. This rebinds `df`, which previously held the group table.
df = pd.read_csv("../data/processed/membership_filtered.csv")


In [23]:
from sklearn.preprocessing import LabelEncoder

# Separate encoders so group and user ids each get their own dense 0..n-1 index space.
group_encoder = LabelEncoder()
user_encoder = LabelEncoder()

# Integer codes aligned row-by-row with `df`; they are used as sparse-matrix indices.
group_ids = group_encoder.fit_transform(df["groupID"])
user_ids = user_encoder.fit_transform(df["userID"])


In [24]:
from scipy.sparse import coo_matrix

# Sparse user x group incidence matrix: entry (u, g) is 1 when user u is a member of group g.
rows = user_ids
cols = group_ids
# A NumPy array avoids materialising one Python int object per membership row.
data = np.ones(len(df), dtype=np.int64)

# The explicit shape keeps construction valid even when `df` has no rows.
membership_matrix = coo_matrix(
    (data, (rows, cols)),
    shape=(len(user_encoder.classes_), len(group_encoder.classes_)),
).tocsr()  # the CSR conversion sums entries that share the same (user, group) index

# A (user, group) pair listed more than once would otherwise carry a value > 1
# and inflate every overlap count computed from this matrix; count each
# membership exactly once.
membership_matrix.data[:] = 1
# Hand back a COO matrix, the same format this cell produced before.
membership_matrix = membership_matrix.tocoo()


In [25]:
# (groups x users) @ (users x groups): entry (i, j) accumulates, over all users,
# the product of that user's entries for groups i and j. With a single entry of 1
# per (user, group) pair this is the number of users the two groups share, and
# the diagonal holds each group's own member count.
group_overlap = membership_matrix.T @ membership_matrix


In [27]:
# Lookup array mapping each encoded group index (a row/column of the overlap
# matrix) back to its original groupID, built once so later cells can index it
# directly instead of calling inverse_transform per element.
index_to_groupID = group_encoder.inverse_transform(np.arange(len(group_encoder.classes_)))


In [28]:
# import numpy as np

# min_common_members = 5
# group_overlap.setdiag(0)  # حذف self-loops

# coo = group_overlap.tocoo()
# edges = [
#     (group_encoder.inverse_transform([i])[0],  # real groupID i
#      group_encoder.inverse_transform([j])[0],  # real groupID j
#      w)
#     for i, j, w in zip(coo.row, coo.col, coo.data)
#     if w >= min_common_members
# ]
import numpy as np

# Minimum number of shared members required for two groups to be linked by an edge.
min_common_members = 5
group_overlap.setdiag(0)  # remove self-loops

coo = group_overlap.tocoo()

# Entries that reach the threshold (both (i, j) and (j, i) are still present here).
mask = coo.data >= min_common_members
rows = coo.row[mask]
cols = coo.col[mask]
weights = coo.data[mask]

# The graph is undirected, so keep each pair once by taking only i < j.
upper_triangle = rows < cols

# Translate indices to real group ids with array indexing instead of a
# per-element Python loop; .tolist() yields plain Python ints.
edges = list(zip(
    np.asarray(index_to_groupID)[rows[upper_triangle]].astype(np.int64).tolist(),
    np.asarray(index_to_groupID)[cols[upper_triangle]].astype(np.int64).tolist(),
    weights[upper_triangle].astype(np.int64).tolist(),
))



In [29]:
import networkx as nx

# Undirected graph: nodes are group ids, and each edge weight is the number of
# members the two groups have in common.
G = nx.Graph()
G.add_weighted_edges_from(edges)

# Report the graph size (node count, then edge count).
print(f"📌 تعداد گره‌ها: {G.number_of_nodes()}")
print(f"📌 تعداد یال‌ها: {G.number_of_edges()}")


📌 تعداد گره‌ها: 92679
📌 تعداد یال‌ها: 6629782
