In [3]:
import pandas as pd
# import pyspark.pandas as ps
import glob
import re
import string
import json
from tqdm import tqdm
from datetime import datetime
import os
import emoji
import igraph as ig

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import spacy
from spacy.lang.de.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

2024-06-29 15:42:11.432441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [33]:
# Load the raw export, then keep only English-language posts that actually have a description.
ins_data = pd.read_csv('ins_data.csv')
ins_en = (
    ins_data
    .loc[ins_data['languageCode'] == 'en']
    .dropna(subset=['description'])
)
print(ins_en.shape)
ins_en.head(3)

(50752, 8)


Unnamed: 0,platformId,id,date,updated,type,languageCode,description,postUrl
2,3396383103500594894_3074529058,11435815|3396383103500594894,2024-06-22 23:53:58,2024-06-25 18:04:22,photo,en,"Israeli strikes kill at least 42 in Gaza, encl...",https://www.instagram.com/p/C8iYHAVSq7O/
3,3396382676744653258_5458166344,12755625|3396382676744653258,2024-06-22 23:53:15,2024-06-28 01:38:34,video,en,OT7 Quanny was spotted with Donald Trump in Ph...,https://www.instagram.com/p/C8iYAy4p8nK/
6,3396373958294098270_40015109842,21034332|3396373958294098270,2024-06-22 23:35:48,2024-06-25 13:36:41,photo,en,An opinion poll conducted by the Israeli Chann...,https://www.instagram.com/p/C8iWB7MgdFe/


In [8]:
# Use a hardware accelerator when one is available.
# Preference order: Apple-silicon MPS, then CUDA, then plain CPU, so the notebook
# runs on any machine instead of assuming an MPS-capable Mac.
import torch

# `torch.backends.mps` does not exist on older torch builds, hence the getattr guard.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

mps


In [26]:
def find_hashtags(text):
    """Return every hashtag found in ``text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters
    (letters, digits or underscore). The leading ``#`` is kept in each result.
    """
    hashtag_pattern = re.compile(r"#[\w]+")
    return hashtag_pattern.findall(text)

def find_mentions(text):
    """Return every ``@mention`` found in ``text``, in order of appearance.

    A mention is an ``@`` followed by one or more ASCII letters, digits,
    underscores, dots or hyphens. Because dots are allowed, a trailing
    sentence period directly after a handle is part of the match.
    """
    return [match.group(0) for match in re.finditer(r"@[a-zA-Z0-9_.-]+", text)]

def find_urls(text):
    """Return every http/https URL found in ``text``, in order of appearance.

    A match starts at ``http://`` or ``https://`` and extends over the
    characters accepted by the pattern below. Note that ``[$-_@.&+]`` contains
    the character *range* ``$`` to ``_``; that range is what allows ``/``,
    ``:``, ``?`` and ``=`` inside a URL. Characters outside the accepted set
    (whitespace, ``#``, ``~`` and others) end the match.
    """
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    matches = url_pattern.findall(text)
    return matches

def pre_processing(content):
    """Normalise a raw post description into a space-separated string of lemmas.

    Steps:
      1. Replace every punctuation character (ASCII punctuation plus curly
         quotes and en/em dashes) with a single space. This happens before
         tokenisation, so hashtags, mentions and URLs are split into their
         alphanumeric pieces.
      2. Run the text through the module-level spaCy pipeline ``nlp``.
      3. Drop stop-word and whitespace tokens, lower-case the lemma of the rest.

    Returns the surviving lemmas joined by single spaces.
    """
    extra_marks = '’–—' + '“”'
    punctuation_class = f"[{re.escape(string.punctuation + extra_marks)}]"
    spaced_text = re.sub(punctuation_class, " ", content)

    lemmas = []
    for tok in nlp(spaced_text):
        # Skip stop words and tokens that consist only of whitespace.
        if tok.is_stop or tok.is_space:
            continue
        lemmas.append(tok.lemma_.lower())

    return ' '.join(lemmas).strip()

In [46]:
# for each description, use CountVectorizer to build a dictionary of word counts, return the dictionary
def get_word_counts(description):
    """Count how often each token occurs in a single description.

    The text is tokenised by a default-configured ``CountVectorizer``
    (lower-cased, tokens of two or more word characters).

    Parameters
    ----------
    description : str
        The (already cleaned) text of one post.

    Returns
    -------
    dict[str, int]
        Mapping of token -> number of occurrences. Empty when the text
        contains no token the vectorizer accepts.
    """
    count_vect = CountVectorizer()
    try:
        X = count_vect.fit_transform([description])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenisation, e.g. for an empty or emoji-only description.
        # One such row must not abort the whole apply() run.
        return {}

    # X[0, idx] is a NumPy integer scalar. Cast to a built-in int so the dict's
    # text form stays `{'word': 1}` when the frame is written to CSV; under
    # NumPy 2 the scalar repr is `np.int64(1)`, which cannot be parsed back.
    word_counts = {word: int(X[0, idx]) for word, idx in count_vect.vocabulary_.items()}

    return word_counts

In [48]:
def process_row(row):
    """Derive all text features for one post.

    Reads ``row['description']`` and returns a five-element ``pd.Series``:
    cleaned description, hashtags, mentions, URLs (all three extracted from
    the raw text) and the word counts of the cleaned description.
    """
    raw_text = row['description']
    cleaned_text = pre_processing(raw_text)
    return pd.Series([
        cleaned_text,
        find_hashtags(raw_text),
        find_mentions(raw_text),
        find_urls(raw_text),
        get_word_counts(cleaned_text),
    ])

# Register tqdm's progress-reporting variant of apply(), then run process_row
# once per row (axis=1) and label the five resulting feature columns.
tqdm.pandas(desc='Processing rows')
results = ins_en.progress_apply(process_row, axis=1).set_axis(
    ['clean_description', 'hashtags', 'mentions', 'urls', 'word_counts'],
    axis=1,
)

# Attach the derived features to the original columns (aligned on the row index).
ins_en_processed = pd.concat((ins_en, results), axis='columns')


Processing rows: 100%|██████████| 50752/50752 [12:19<00:00, 68.63it/s] 


In [96]:
# Renumber the rows 0..n-1 (discarding the old index) and save without an index column.
ins_en_processed = ins_en_processed.reset_index(drop=True)
ins_en_processed.to_csv('ins_en_processed.csv', index=False)

In [23]:
# for idx, row in tqdm(ins_en.iterrows(), total=ins_en.shape[0]):
#     ins_en.at[idx, 'clean_description'] = pre_processing(row.description)
#     ins_en.at[idx, 'hashtags'] = find_hashtags(row.description)
#     ins_en.at[idx, 'mentions'] = find_mentions(row.description)
#     ins_en.at[idx, 'urls'] = find_urls(row.description)
#     ins_en.at[idx, 'word_counts'] = get_word_counts(ins_en.at[idx, 'clean_description'])

  0%|          | 0/50758 [00:00<?, ?it/s]

2
Israeli strikes kill at least 42 in Gaza, enclave’s government media office says @sightmagazine1 #IsraelHamasconflict #Israel #Hamas #Gaza #Palestinians #AlShati #AlTuffah 

https://www.sightmagazine.com.au/news/israeli-strikes-kill-at-least-42-in-gaza-enclaves-government-media-office-says/
israeli strike kill 42 gaza enclave s government medium office say sightmagazine1 israelhamasconflict israel hamas gaza palestinians alshati altuffah https www sightmagazine com au news israeli strike kill 42 gaza enclave government medium office say





# Jaccard Similarity

In [2]:
from itertools import combinations, permutations
import math

In [4]:
# Reload the processed posts from disk. After the CSV round-trip the
# 'word_counts' column holds the text form of each dict (a string), so it has
# to be parsed again before the counts can be used.
ins_en_processed = pd.read_csv('ins_en_processed.csv')

In [5]:
def get_jaccard_similarities(combination, word_counts=None):
    """Compute a count-weighted Jaccard-style similarity between two posts.

    The score is::

        (occurrences, in both posts, of the words the posts share)
        ----------------------------------------------------------
                 (all word occurrences in both posts)

    so it is 1.0 when both posts use exactly the same vocabulary and 0.0 when
    they share no word. (This differs from the min/max form of weighted
    Jaccard; the formula is kept as originally written.)

    Parameters
    ----------
    combination : tuple
        Pair of keys ``(first, second)`` identifying the two posts in
        ``word_counts``.
    word_counts : mapping or sequence, optional
        Container indexed by the keys in ``combination``. Each entry is either
        a ``dict`` of word -> count or the text form of such a dict (as read
        back from CSV). Defaults to ``ins_en_processed['word_counts']``.

    Returns
    -------
    float
        Similarity in ``[0, 1]``; ``0.0`` when both posts have no words at all.

    Raises
    ------
    ValueError, SyntaxError
        If a text entry is not a plain Python literal.
    """
    import ast  # local import keeps this cell self-contained

    if word_counts is None:
        word_counts = ins_en_processed['word_counts']

    def _as_dict(raw):
        # literal_eval only accepts Python literals, so text loaded from a file
        # can never execute code (unlike eval). Already-parsed dicts pass through.
        return ast.literal_eval(raw) if isinstance(raw, str) else raw

    word_counts1 = _as_dict(word_counts[combination[0]])
    word_counts2 = _as_dict(word_counts[combination[1]])

    # Occurrences of shared words, counted in both posts.
    shared_words = word_counts1.keys() & word_counts2.keys()
    intersection_sum = sum(word_counts1[word] + word_counts2[word] for word in shared_words)

    # Merging both dicts and summing is the same as adding the two totals.
    union_sum = sum(word_counts1.values()) + sum(word_counts2.values())

    # Two posts without any counted word would otherwise divide by zero.
    if union_sum == 0:
        return 0.0

    return intersection_sum / union_sum

In [10]:
# Score every unordered pair of posts and collect the result as an edge list
# (id1, id2, jaccard_sim).
# NOTE: the number of pairs is n*(n-1)/2. For the 50,752 posts processed above
# that is roughly 1.3 billion pairs, all of which are appended to one in-memory
# list; consider sampling or thresholding before running on the full frame.
n = len(ins_en_processed)
total_combinations = math.comb(n, 2)

# Build the index-label -> post id mapping once, instead of doing two pandas
# Series lookups for every pair inside the loop.
post_ids = ins_en_processed['id'].to_dict()

graph_data = []

for combination in tqdm(combinations(ins_en_processed.index, 2), total=total_combinations, desc='Processing combinations'):
    id1 = post_ids[combination[0]]
    id2 = post_ids[combination[1]]
    jaccard_sim = get_jaccard_similarities(combination)
    graph_data.append((id1, id2, jaccard_sim))

graph_data = pd.DataFrame(graph_data, columns=['id1', 'id2', 'jaccard_sim'])