Finding Co-Occuring Hashtags to Expand Dataset

In [1]:
import pandas as pd
import json
import os
from os import listdir
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import text


In [2]:
with open("../util/creds.txt", "r") as credsfile:
    username = credsfile.readline().strip()
    password = credsfile.readline().strip()

In [None]:
# for each i, find out which party it applies to
# for each party, get all i and concatenate into a string
# remove messy strings. don't worry about spelling errors.
# then do tfidf

In [3]:
parties = [ "aap", "aimim", "bjp", "congress", "samajwadi"]

In [4]:
# https://pythontic.com/pandas/serialization/postgresql

hashtags_df = []
# Replace the placeholders with your database credentials and connection details
db_url = f'postgresql://{username}:{password}@localhost:5432/moj'

# Create the engine
engine = create_engine(db_url)

dbConnection = engine.connect();

for party in parties:
    query_str = 'SELECT i, tagName FROM hashtags WHERE i IN (SELECT i FROM hashtags WHERE tagName = \'' + party + '\')'
    df = pd.read_sql(text(query_str), dbConnection);
    hashtags_df.append(df)
# pd.set_option('display.expand_frame_repr', False);

# Print the DataFrame

for df in hashtags_df:
    print(len(df))
# print(dataFrame);

# Close the database connection

dbConnection.close();

1233
914
30955
5966
883


In [5]:
for i, df in enumerate(hashtags_df):
    df["party"] = parties[i]    

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
dataset = pd.concat(hashtags_df)
docs = dataset.groupby('party').tagname.apply(lambda x: ' '.join(x)).reset_index()
corpus = docs.tagname
corpus_index = list(docs.party)


In [85]:
# basic term frequency
tf_dataset = dataset.groupby(["party", "tagname"]).count().reset_index()
for party in parties:
    tf = " ".join(list(tf_dataset[tf_dataset["party"] == party].nlargest(10, columns=["i"]).tagname))
    print(f"{party}: {tf}")

aap: aap bjp viral moj congress like politics ravan up BVM
aimim: aimim asaduddinowaisi owaisi trending india moj politics news viral reels
bjp: bjp modi india hindu narendramodi viral trending rss ram TopGiftedVideos
congress: congress rahulgandhi bjp modi trending TrendingOnMoj viral MojBlue TopGiftedVideos india
samajwadi: samajwadi akhileshyadav samajwadiparty akhilesh trending TopGiftedVideos viral moj yadav CricketFever


In [86]:
with open("../clean/tfidf/remove_words.txt") as datafile:
    stop_words = datafile.read().split("\n")

In [87]:
# check tf idf for majlis in aimim 
print(len(corpus[1]))

clean_corpus = []
for i,party in enumerate(parties):
    rm_stop_words = [x for x in corpus[i].split() if x not in stop_words]
    clean_corpus.append(" ".join(rm_stop_words))


8545


In [69]:
# sample tfidf

test_corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data' ]

docs = pd.DataFrame({"sentence": test_corpus})
corpus = docs.sentence
corpus_index = range(3)


In [89]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split())    # include words that are in at least 2 documents 21k to 5.8k words
# vectorizer = TfidfVectorizer(max_df=0.8, stop_words=stop_words, tokenizer=lambda x: x.split())    # include words that are in at least 2 documents 21k to 5.8k words
X = vectorizer.fit_transform(corpus)
print(X.shape)

tfidf_df = pd.DataFrame(X.toarray(), index=corpus_index, columns=vectorizer.get_feature_names_out())
temp = tfidf_df.stack().reset_index()
temp = temp.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})
top_tfidf = temp.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)
top_tfidf

(5, 4927)




Unnamed: 0,document,term,tfidf
74,aap,aap,0.416916
651,aap,bjp,0.306052
212,aap,amadavad,0.123915
236,aap,anand,0.123915
534,aap,bhagwan,0.123915
795,aap,bvm,0.123915
1101,aap,disk_jockey,0.123915
1103,aap,dj,0.123915
1158,aap,education_system,0.123915
1240,aap,famous,0.123915


In [94]:
tfidf_df["aimim"]

aap          0.006653
aimim        0.688947
bjp          0.002000
congress     0.003740
samajwadi    0.012657
Name: aimim, dtype: float64

In [99]:
len(clean_corpus[corpus_index].split())

28752

In [104]:
0.212369 * 28752 / 518 -1

10.787709436293436

In [101]:
np.e **10.787709436293436

48421.995226010295

In [102]:
6 / 48422

0.0001239106191400603

In [111]:
word = "anand"
corpus_index = 0
num_docs = 5
tf = len([x for x in clean_corpus[corpus_index].split() if x == word]) / len(clean_corpus[corpus_index].split())
 
print(tf)

wc =0
for i in range(num_docs):
    if word in clean_corpus[i].split():
        wc+=1
df = wc / num_docs
idf = np.log(num_docs+1/ df+1) + 1
print(idf)
print(tf*idf)

0.009532062391681109
3.1400661634962708
0.029931306584453185


In [109]:
print(len([x for x in clean_corpus[corpus_index].split() if x == "ram"]))
print(len(clean_corpus[corpus_index].split()))

243
28752


In [106]:
word = "rss"
corpus_index = 2
num_docs = 5
tf = len([x for x in clean_corpus[corpus_index].split() if x == word]) / len(clean_corpus[corpus_index].split())

print(tf)

wc =0
for i in range(num_docs):
    if word in clean_corpus[i].split():
        wc+=1
df = wc / num_docs
idf = np.log(num_docs+1/ df+1) + 1
print(idf)
print(tf*idf)

0.008660267111853089
3.03688192726104
0.026300208677239808


In [39]:
# use the tokenizer to avoid the vectorizer doing weird tokenizing of hindi/other language words 
# https://stackoverflow.com/questions/61654839/passing-non-english-text-in-tfidfvectorizer-of-scikit-learn
vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=lambda x: x.split())    # include words that are in at least 2 documents 21k to 5.8k words
# vectorizer = TfidfVectorizer(max_df=0.8, stop_words=stop_words, tokenizer=lambda x: x.split())    # include words that are in at least 2 documents 21k to 5.8k words
X = vectorizer.fit_transform(corpus)
print(X.shape)

tfidf_df = pd.DataFrame(X.toarray(), index=corpus_index, columns=vectorizer.get_feature_names_out())
temp = tfidf_df.stack().reset_index()
temp = temp.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})
top_tfidf = temp.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)
top_tfidf



(5, 4927)


Unnamed: 0,document,term,tfidf
74,aap,aap,0.416916
651,aap,bjp,0.306052
212,aap,amadavad,0.123915
236,aap,anand,0.123915
534,aap,bhagwan,0.123915
795,aap,bvm,0.123915
1101,aap,disk_jockey,0.123915
1103,aap,dj,0.123915
1158,aap,education_system,0.123915
1240,aap,famous,0.123915


In [None]:
temp.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(20).to_csv("hashtag_tfidf-top20.csv")

In [132]:
top_tfidf.to_csv("hashtag_tfidf-7-13.csv")

In [133]:
for party in parties:    
    tf = " ".join(list(top_tfidf[top_tfidf["document"] == party].nlargest(10, columns=["tfidf"]).term))
    print(f"{party}: {tf}")

aap: aap amadavad anand bhagwan bvm disk_jockey dj education_system famous festival
aimim: asaduddinowaisi owaisi news akbaruddinowaisi reelsvideos ms_a08 majlis msmotivational shortsvideos muslim
bjp: hindu narendramodi hinduism mahadev ram rss harharmahadev sanatandharma hanuman jaishreeram
congress: rahulgandhi primejourney rahulnahirukega bjpfails narendramodi amazing news incindia godimedia wow
samajwadi: samajwadi akhileshyadav samajwadiparty akhilesh yadav sapa dimpleyadav lucknow real-reel-reel-reels-reels samajwadipartyofficial


In [135]:
top_tfidf[top_tfidf["document"] == 'bjp']

Unnamed: 0,document,term,tfidf
11384,bjp,hindu,0.26825
12684,bjp,narendramodi,0.22126
11397,bjp,hinduism,0.201636
12226,bjp,mahadev,0.197135
13196,bjp,ram,0.186804
13336,bjp,rss,0.186804
11330,bjp,harharmahadev,0.182733
13426,bjp,sanatandharma,0.181833
11301,bjp,hanuman,0.157662
11738,bjp,jaishreeram,0.154673


In [2]:
import pandas as pd
temp = pd.read_json("../../data/processed/flows/summer/modi/modi-7-13.json")
print(temp.post_date_string.min())
print(temp.post_date_string.max())

2023-05-15
2023-07-13


In [6]:
len(temp[temp["post_month"] == 5])

35

### Checking Co-occurrance

In [8]:
dataset.head()

Unnamed: 0,i,tagname,party
0,3086464014,aamaadmiparty,aap
1,3086464014,aap,aap
2,3086464014,arvindkejriwal,aap
3,3086464014,punjab,aap
4,3086464014,viralposts,aap


In [None]:
# co-occurrence
#  

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\archi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import numpy as np
from nltk.tokenize import word_tokenize


def build_co_occurrence_matrix(corpus,window_size):
    #build unique words

    unique_words = set(dataset.tagname.unique())

    # unique_words=set()
    # for text in corpus:
    #     for word in word_tokenize(text):
    #         unique_words.add(word)

    word_search_dict={word:np.zeros(shape=(len(unique_words))) for word in unique_words}
    word_list=list(word_search_dict.keys())  
    for text in corpus:
        text_list=text.split()
        for idx,word in enumerate(text_list):
            #pick word in the size range
            i=max(0,idx-window_size)
            j=min(len(text_list)-1,idx+window_size)
            search=[text_list[idx_] for idx_ in range(i,j+1)]
            search.remove(word)
            for neighbor in search:
                # get neighbor idx in word_search_dict
                nei_idx=word_list.index(neighbor)
                word_search_dict[word][nei_idx]+=1
    return word_search_dict

In [12]:
corpus = docs.tagname
coo_dict=build_co_occurrence_matrix(corpus,window_size=2)
test_co_df = pd.DataFrame(coo_dict,index=coo_dict.keys()).astype('int')
test_co_df["aimim"].nlargest(30)

asaduddinowaisi     46
trending            30
aimim               24
viral               22
moj                 21
owaisi              18
india               13
akbaruddinowaisi    12
akhileshyadav        8
assadudinowaisi      7
asdadudinowaisi      6
msmotivational       6
uttarpradesh         6
short                6
shortsvideos         6
aap                  5
asaduddinowai        5
asaduddin            5
shorts               5
aimim-jindabad       5
reelsvideos          5
akbaruddin           5
politics             5
congress             4
atiqahmed            4
samajwadiparty       4
muslim               4
ambedkar             4
ahir                 3
world                3
Name: aimim, dtype: int32

In [13]:
test_co_df["yadav"].nlargest(30)

samajwadi               6
uttarpradesh            6
akhilesh                5
viral                   4
india                   4
yadavbrand              4
yadavji                 4
yaduvanshi              3
sp                      2
ahir                    2
vanchitbahujanaghadi    2
bahujansamajparty       2
upelection              2
bahujan                 2
politics                2
pawar                   1
-party-party            1
uttarakhand             1
ahirregiment            1
akhileshyadav           1
TrendingOnMoj           1
dimpleyadav             1
sapa                    1
cmakhileshyadav         1
aimim                   1
party                   1
viralreels              1
Bhaijaan                1
HitWicket               1
attitude                1
Name: yadav, dtype: int32

In [38]:
len(dataset[(dataset["tagname"] == "modi")])

692

In [34]:
final_co_df = pd.DataFrame()
for party in parties:
    temp = test_co_df[party].nlargest(35)

    final_co_df[f"{party}_hashtag"] = temp.index
    final_co_df[f"{party}_count"] = temp.values
final_co_df.to_csv("cooccurence_counts.csv")
final_co_df

Unnamed: 0,aap_hashtag,aap_count,aimim_hashtag,aimim_count,bjp_hashtag,bjp_count,congress_hashtag,congress_count,samajwadi_hashtag,samajwadi_count
0,bjp,60,asaduddinowaisi,46,modi,514,rahulgandhi,272,samajwadi,36
1,congress,21,trending,30,congress,240,bjp,240,samajwadiparty,29
2,viral,16,aimim,24,hanuman,206,narendramodi,94,akhileshyadav,18
3,up,12,viral,22,viral,195,modi,86,akhilesh,17
4,amadavad,11,moj,21,moj,187,godimedia,63,TopGiftedVideos,16
5,delhi,11,owaisi,18,india,172,trending,63,moj,11
6,gujarat,11,india,13,politics,157,moj,51,lucknow,11
7,festival,11,akbaruddinowaisi,12,news,136,election,45,congress,9
8,anand,11,akhileshyadav,8,yogi,127,primejourney,45,PartyTime,8
9,arvindkejriwal,11,assadudinowaisi,7,krishna,113,amazing,44,reels,8
