### Attempts at Modeling. All final topic modeling is in the analysis/moj_stm folder.

### Validate + Translate

In [14]:
import pandas as pd
import numpy as np
import regex as re
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import text
import os
import string

In [45]:
# Load the database credentials: line 1 is the username, line 2 the password.
with open("../util/creds.txt", "r") as credsfile:
    username, password = (credsfile.readline().strip() for _ in range(2))

In [46]:
# Pull the whole audio_transcripts table into a DataFrame.
db_url = f'postgresql://{username}:{password}@localhost:5432/moj'

engine = create_engine(db_url)

query = text('select * from audio_transcripts')

# The context manager releases the connection even if the query raises,
# which the previous explicit connect()/close() pair did not.
with engine.connect() as dbConnection:
    df = pd.read_sql(query, dbConnection)

Create Inputs for Translation

In [44]:
# Write the captions out in batches of ten rows per file; every caption is
# followed by a "==" separator line.
# NOTE(review): the range starts at 10 and stops before 3200, while the reading
# loop below covers 0..3270 — confirm the remaining files are produced elsewhere.
for page in range(10, 3200, 10):
    batch_path = f"translations/audio/text_files/text_files_{page}.txt"
    with open(batch_path, "w", encoding="utf-8") as outfile:
        outfile.writelines(f"{caption}\n==\n" for caption in df.text[page:page+10])

Move Translation Output into CSV

In [30]:
# For every batch file, record how many "\n,\n"-separated segments it holds.
lens = []
for page in range(0, 3280, 10):
    batch_path = f"translations/audio/text_files/text_files_{page}.txt"
    with open(batch_path, "r", encoding="utf-8") as datafile:
        text = datafile.read()
    # Number of pieces produced by split(sep) is always count(sep) + 1.
    lens.append(text.count("\n,\n") + 1)

In [31]:
# How many batch files have each segment count.
pd.Series(lens).value_counts()

10    261
1      57
9      10
dtype: int64

In [52]:
# Build one slot per row. Batches with exactly ten segments are loaded directly;
# every other batch gets ten blank placeholders and its starting row offset is
# remembered so later cells can fill it in.
translations = []
ones_index = []
nines_index = []
for page, page_len in enumerate(lens):
    offset = page * 10
    if page_len != 10:
        translations.extend([""] * 10)
        # One-segment batches go to ones_index, anything else to nines_index.
        (ones_index if page_len == 1 else nines_index).append(offset)
        continue
    with open(f"translations/audio/text_files/text_files_{offset}.txt", "r", encoding="utf-8") as datafile:
        text = datafile.read().split("\n,\n")
    translations.extend(text)

In [82]:
# Re-read the single-segment batches, this time splitting on "==".
ones_text = []
for page in ones_index:
    with open(f"translations/audio/text_files/text_files_{page}.txt", "r", encoding="utf-8") as datafile:
        text = datafile.read().split("==")
    # Anything other than exactly ten pieces loses its final piece.
    ones_text.append(text if len(text) == 10 else text[:-1])

In [87]:
# Copy the ten recovered segments of each "ones" batch into its reserved slots.
for page, segments in zip(ones_index, ones_text):
    for i in range(10):
        translations[page+i] = segments[i]

In [118]:
# Load the remaining (non-10, non-1 segment) batches, split on "\n,\n".
nines_text = []
for page in nines_index:
    with open(f"translations/audio/text_files/text_files_{page}.txt", "r", encoding="utf-8") as datafile:
        text = datafile.read().split("\n,\n")
    nines_text.append(text)

# The first nine segments are written to slots page+1 .. page+9; slot `page`
# itself keeps its blank placeholder.
# NOTE(review): presumably the first row of each such batch had no translation — confirm.
for page, segments in zip(nines_index, nines_text):
    for i in range(9):
        translations[page+i+1] = segments[i]

In [128]:
# Count how many slots hold each of the three "empty-looking" values.
temp = pd.Series(translations)
for placeholder in ('', '\n,', '\n\n'):
    print(len(temp[temp == placeholder]))

58
6
11


In [135]:
# Index labels where the translation is '' but the transcript is not ''.
# The last two subtractions cannot remove anything: a slot equal to '' is never
# also equal to '\n\n' or '\n,'.
(set(temp[temp == ''].index) - set(df[df.transcript == ""].index) - set(temp[temp == '\n\n'].index) - set(temp[temp == '\n,'].index))

{242, 1459, 2530}

In [None]:
# Translation values found at the rows whose transcript is the empty string.
temp[df[df.transcript == ""].index].value_counts()

        55
\n\n    11
\n,      6
dtype: int64

In [None]:
# Attach the assembled translations (aligned on df's index) before saving.
# Without this, a top-to-bottom run writes the CSV before any cell has added the
# `translation` column that readers of this file rename to `text`.
df["translation"] = pd.Series(translations)
df.to_csv("translations/audio/audio_translations_final.csv")

Clean the Translations

In [27]:
# Clean characters, etc from captions
from cleantext.sklearn import CleanTransformer
# Alternative input (the full translation set):
# combined = pd.read_csv("translations/audio/audio_translations_final.csv").rename(columns={"translation": "text"})
combined = (
    pd.read_csv("translations/audio/missing_audio_files_5_4.csv")[["filename", "translation"]]
    .fillna("")
    .rename(columns={"translation": "text"})
)
cleaner = CleanTransformer(no_punct = True,
                           lower=True,
                           no_emoji=True,
                           no_line_breaks=True,
                           no_urls=True,
                           normalize_whitespace=True,
                           to_ascii=False)

# Raw strings: "\s" in a normal string literal is an invalid escape sequence.
exp_remove_hashtags = r"#+[^\s]+"
exp_remove_mentions = r"@+[^\s]+"

# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave mentions/hashtags untouched.
cleaned_without_mentions = combined.text.str.replace(exp_remove_mentions, "", regex=True)
cleaned_without_hashtags = cleaned_without_mentions.str.replace(exp_remove_hashtags, "", regex=True)
cleaned_final = cleaner.transform(cleaned_without_hashtags)
combined['clean_caption'] = cleaned_final

punct_to_remove = ['$', '+', '<', '=', '>', '^', '`', '|', '~']
# extra cleaning. because these characters were missed for some reason
for punct in punct_to_remove:
    # regex=False: several of these characters are regex metacharacters and
    # must be matched literally.
    combined['clean_caption'] = combined['clean_caption'].str.replace(punct, '', regex=False)
combined['clean_caption'] = combined['clean_caption'].str.replace(r"\s+", ' ', regex=True)     # normalize white space
combined['clean_caption'] = combined['clean_caption'].str.strip()
# Same result as reset_index(inplace=True), without mutating in place.
combined = combined.reset_index()

# get_caption_lang(combined)

In [143]:
# Add the assembled translations to df as a new column (aligned on index).
df["translation"] = pd.Series(translations)
# NOTE(review): this writes `combined`, not the `df` modified on the line above —
# confirm that is intended.
combined.to_csv("translations/audio/audio_translations_final_clean.csv")

In [29]:
def remove_non_ascii(text):
    """Return `text` keeping only the characters found in `string.printable`."""
    allowed = frozenset(string.printable)
    return ''.join(ch for ch in text if ch in allowed)

In [30]:
# Derive a column with everything outside string.printable stripped from the cleaned caption.
combined['text_non_ascii'] = combined.clean_caption.apply(remove_non_ascii)

In [182]:
# Persist the cleaned frame (now including text_non_ascii); overwrites the earlier save to the same path.
combined.to_csv("translations/audio/audio_translations_final_clean.csv")

### Analysis

Set Up

In [1]:
import pandas as pd
import numpy as np
import regex as re
import os
import string

In [2]:
# Load the cleaned translations, keep the five columns used below, and expose
# the ASCII-only text under the name `translation`.
df = (
    pd.read_csv("translations/audio/audio_translations_final_clean.csv")
    [["filename", "transcript", "lang", "text", "text_non_ascii"]]
    .rename(columns={"text_non_ascii": "translation"})
)

In [3]:
filename_mapping = pd.read_csv("download_csvs/id_filename_mapping.csv")
# Left-merge on the columns the two frames share and count missing filenames.
# NOTE(review): if `filename` is the join key it comes from the left frame, so this
# count reflects the mapping file rather than unmatched rows — confirm what is being checked.
filename_mapping.merge(df, how="left").filename.isna().value_counts()   # check


False    3221
Name: filename, dtype: int64

In [4]:
# Keep every row of the id/filename mapping; rows without a match in df get NaN columns.
# Rebinds `df`, so re-running this cell merges the already-merged frame.
df = filename_mapping.merge(df, how="left")

In [6]:
# How many rows have no translation after the left merge.
df.translation.isna().value_counts()

False    3108
True      113
Name: translation, dtype: int64

Analysis

In [7]:
from nltk.corpus import stopwords
# English stop words from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

In [46]:
# Whitespace-tokenise every translation. str(x) turns a missing value into the
# single token "nan", which the embedding cell later filters on.
data = df.translation.apply(lambda x: str.split(str(x)))
# Lower-case each token *before* the stop-word test so capitalised stop words
# are removed too; a set makes each membership test O(1) instead of a list scan.
stop_word_set = set(stop_words)
df['no_stopwords'] = data.apply(
    lambda x: " ".join([word for word in map(str.lower, x) if word not in stop_word_set])
)

In [47]:
# One row per token across all documents, then count occurrences of each token.
word_counts = df['no_stopwords'].str.split(expand=True).stack().value_counts()

In [12]:
# Tokens ranked 31st to 60th by frequency (positional slice).
word_counts.sort_values(ascending=False).iloc[30:60]

name      185
want      180
said      178
time      176
india     175
say       172
every     170
hey       167
good      166
tell      164
work      164
world     158
going     158
b         157
bye       157
become    156
made      153
life      152
eat       152
im        152
make      152
love      147
first     147
ram       147
given     141
ke        140
bad       136
father    135
many      131
much      127
dtype: int64

Word Embeddings

In [19]:
from gensim.models import Word2Vec

In [86]:
# Keep rows whose stop-word-filtered text is not the literal "nan" and split
# each remaining document into a list of tokens.
has_text = df.no_stopwords != 'nan'
dataset = df.loc[has_text, 'no_stopwords'].apply(str.split)

In [93]:
# Train word embeddings on the tokenised translations.
# A fixed seed and a single worker make the result repeatable between runs;
# with several workers thread scheduling changes the outcome.
# NOTE(review): gensim also asks for PYTHONHASHSEED to be set for full determinism — confirm if needed.
model = Word2Vec(sentences=dataset, vector_size=35, window=5, min_count=1, workers=1, seed=141)

In [102]:
# Twenty nearest neighbours of "muslim" in the trained embedding space.
model.wv.most_similar('muslim', topn=20)

[('many', 0.9966540932655334),
 ('water', 0.9957720637321472),
 ('years', 0.9955828189849854),
 ('next', 0.9953979849815369),
 ('said', 0.9951282739639282),
 ('allah', 0.9951192736625671),
 ('three', 0.9949244856834412),
 ('great', 0.9949066042900085),
 ('way', 0.9948316812515259),
 ('keep', 0.9948297142982483),
 ('live', 0.994773268699646),
 ('even', 0.9947198033332825),
 ('person', 0.9946987628936768),
 ('given', 0.9946883320808411),
 ('since', 0.9946839213371277),
 ('last', 0.9946224689483643),
 ('taking', 0.9945462346076965),
 ('problem', 0.9945346713066101),
 ('bjp', 0.9944618940353394),
 ('minister', 0.9944290518760681)]

Topic Model

In [103]:
from collections import Counter, defaultdict
from gensim import corpora
from gensim.models import LdaModel
# Corpus-wide token counts. Counter replaces the hand-written counting loop and
# the comprehension variable does not leave a stray global named `text` behind.
frequency = Counter(token for doc in dataset for token in doc)

# Drop tokens that occur only once in the whole corpus.
texts = [
    [token for token in doc if frequency[token] > 1]
    for doc in dataset
]

# Map tokens to integer ids and convert each document to bag-of-words form.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]

In [110]:
# Fit a 5-topic LDA model. random_state pins the initialisation so the topics
# shown are the same on every run.
lda_model = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, random_state=141)
lda_model.show_topics()

[(0,
  '0.049*"om" + 0.034*"people" + 0.014*"baza" + 0.013*"bada" + 0.011*"aap" + 0.007*"hai" + 0.007*"give" + 0.007*"talk" + 0.006*"b" + 0.005*"police"'),
 (1,
  '0.027*"agar" + 0.013*"b" + 0.009*"ub" + 0.008*"first" + 0.008*"like" + 0.008*"modi" + 0.007*"oh" + 0.007*"ji" + 0.007*"grow" + 0.007*"also"'),
 (2,
  '0.015*"agar" + 0.014*"baad" + 0.014*"go" + 0.009*"hai" + 0.008*"aa" + 0.008*"hey" + 0.007*"day" + 0.007*"modi" + 0.007*"good" + 0.006*"work"'),
 (3,
  '0.023*"aab" + 0.016*"take" + 0.014*"one" + 0.009*"taken" + 0.009*"ada" + 0.009*"bar" + 0.008*"people" + 0.008*"im" + 0.008*"bad" + 0.007*"like"'),
 (4,
  '0.032*"come" + 0.027*"ag" + 0.011*"badan" + 0.011*"dont" + 0.009*"today" + 0.009*"k" + 0.008*"one" + 0.008*"people" + 0.007*"baga" + 0.007*"like"')]

BERTopic

In [None]:
# pip install bertopic


In [118]:
# Export the stop-word-filtered text column (with its index) to CSV.
df.no_stopwords.to_csv("translations/audio/bert_input.csv")

In [119]:
# Display the merged frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,i,filename,transcript,lang,text,translation,no_stopwords
0,3151895074,6P0OR443POT74XKY1x22TWkZNmPmLySjprbZ,,,,,
1,3133842580,vP4j5223PjTGjVkr9pBBuELplmjmP7sXoW54,"इक लडकी आखे बोले की, हम आप कुच आते हैं हमने क...",hi,"A girl said with her eyes, we come to you, w...",a girl said with her eyes we come to you we co...,girl said eyes come come party come bjps kuch
2,3160379976,wPNGm99VPGTEv5P09kddTRZwjdRDJ4HLjWxZ,"अग वर सारे बही इस वीटियो को देखो, देखो योगी ज...",hi,"Hey everyone, watch this video, see what Yog...",hey everyone watch this video see what yogi ji...,hey everyone watch video see yogi ji saying pl...
3,3142237093,2r7wN33WrwU5Xd0L7PbbIKZxp5wRK7SpBvBo,,,,,
4,3133160363,xPYr5BBLPrTOx6PADjeeuXd0wb9QyRf3Q4NY,,,,,
...,...,...,...,...,...,...,...
3216,3137317224,5rwR5EEnrRUd0X6N2Z11T27NVPJbPOspPGpW,٧ ا Strand a a बीजेपी ने पूरी कोशिष की ती की ...,hi,٧ ا Strand a a BJP tried its best to take th...,strand a a bjp tried its best to take this e...,strand bjp tried best take election national l...
3217,3142020177,xPYr5BBLPrTOx6PADjeeuXd0V63BZRf3xjgY,येडो एरप्पा को अप तु दे एलेक्षन पूरे तरीके से...,hi,Yeddo Erappa was completely kept away from t...,yeddo erappa was completely kept away from the...,yeddo erappa completely kept away party electi...
3218,3141273341,geyK7EEbeKcKLjdR2EkkhLGlg716oOhOLdRn,करनातक सरकार में मंत्रियो और कुंगरे सद्छ मलिक...,hi,Karnataka government ministers and Kungre Sa...,karnataka government ministers and kungre sadh...,karnataka government ministers kungre sadh mal...
3219,3142371153,11QA4nnK1AHERwnV6AyyTblPJE9nQETN63wR,ब्रहु� .. Decky आप break ब intent पर द सरखार ...,hi,"Brother.. Decky, you broke with the intentio...",brother decky you broke with the intention of ...,brother decky broke intention sarkars wife bha...


In [None]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# NOTE(review): this fits BERTopic on the 20 Newsgroups dataset, not on this
# project's translations — presumably a library smoke test; confirm it is still needed.
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

# Default BERTopic configuration; rebinds `docs`, `topic_model`, `topics` and `probs`.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

Final Topic Models

In [None]:
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
# Load the labelled translations and keep the non-missing ASCII-only texts.
dataset = pd.read_csv("audio_translations_final_labels.csv")
docs = dataset.text_non_ascii.dropna()
more_stopwords = "thank thanks watch like watching subscribe channel video videos".split()
data = docs.apply(lambda x: str.split(str(x)))
# Lower-case each token before testing it against the extra stop words so that
# capitalised variants ("Thanks", "Subscribe") are removed as well.
extra_stopword_set = set(more_stopwords)
cleaned_docs = data.apply(
    lambda x: " ".join([word for word in map(str.lower, x) if word not in extra_stopword_set])
)
# Drop documents that became empty after filtering.
cleaned_docs = cleaned_docs[cleaned_docs.apply(len) != 0]
# Remember the surviving row labels of `dataset` before the index is reset.
final_indices = cleaned_docs.index
docs = cleaned_docs.reset_index()["text_non_ascii"]

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech


In [None]:
# Configure and fit the main BERTopic model on `docs`.
# Sentence embeddings are computed once here and passed to fit_transform below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Dimensionality reduction; random_state is fixed for this step.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=141)


# Clustering step.
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Bag-of-words step: English stop words removed, unigrams and bigrams that
# appear in at least two documents.
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))


# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
# `probs` is not read again in the cells visible here.
topics, probs = topic_model.fit_transform(docs, embeddings)

# Show topics
topic_model.get_topic_info()



In [None]:
# Compute revised topic assignments via reduce_outliers, then refresh the
# model's topics with them and show the updated summary.
new_topics = topic_model.reduce_outliers(docs, topics)
topic_model.update_topics(docs, topics=new_topics)
topic_model.get_topic_info()

In [None]:
# Per-document topic information; the next cell reads its Topic and Document columns.
topic_df = topic_model.get_document_info(docs)

In [None]:
# Documents assigned to topic 0, re-indexed from zero.
# NOTE(review): presumably topic 0 is the political cluster — confirm against the topic table above.
political_docs = topic_df[topic_df.Topic==0].reset_index().Document
len(political_docs)

In [None]:
# WORKS!!
# Second BERTopic model, fitted only on `political_docs`.
# This cell rebinds `representation_model`, `umap_model` and `topic_model`, so the
# first model is no longer reachable under those names afterwards.
from bertopic.representation import KeyBERTInspired

representation_model = KeyBERTInspired()

# Smaller neighbourhood (n_neighbors=5) than the first model's UMAP step.
umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=141)

test_vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(embedding_model=embedding_model,
                        umap_model=umap_model,
                       vectorizer_model=test_vectorizer_model,
                       representation_model=representation_model)
test_topics,test_probs = topic_model.fit_transform(political_docs)
topic_model.get_topic_info()

In [None]:
# Same outlier-reduction steps as before, applied to whichever model `topic_model`
# currently names, using the political documents and their topics.
new_topics = topic_model.reduce_outliers(political_docs, test_topics)
topic_model.update_topics(political_docs, topics=new_topics)
topic_model.get_topic_info()

Qualitative

In [13]:
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import text
from moviepy.editor import *
import pandas as pd

In [14]:
# Read the database credentials (username on line 1, password on line 2).
with open("creds.txt", "r") as credsfile:
    username = credsfile.readline().strip()
    password = credsfile.readline().strip()
# Get posts from database
db_url = f'postgresql://{username}:{password}@localhost:5432/moj'

engine = create_engine(db_url)

# Posts from month 6, plus month 7 up to (not including) day 15.
query = text('select * from \"posts\" where post_month = 6 OR (post_month = 7 and post_day < 15)')

# The context manager releases the connection even if the query raises.
with engine.connect() as dbConnection:
    df = pd.read_sql(query, dbConnection)

In [16]:
# Random 10% sample of the posts; no random_state is set, so rows differ per run.
df.sample(frac=0.1)

Unnamed: 0,i,a,ad,adult,approved,attributedvideourl,audioid,authorid,authoridstatus,b,...,v,w,y,post_day,post_month,post_year,post_date_string,party,collected_on,updated_on
394,3151052209,50259541921,0,0,1,https://cdn4.sharechat.com/contents/moj_315105...,870d7e3b-1266-4a69-ae0d-dd76153e21c9,50259541921,502595419212,https://cdn4.sharechat.com/42aca430-d398-47e5-...,...,https://cdn4.sharechat.com/contents/moj_315105...,1080,https://cdn4.sharechat.com/1e24ca27_1674017181...,16,6,2023,2023-06-16,modi,6-18,7-13
2140,3179383351,97524522611,0,0,1,https://cdn4.sharechat.com/contents/moj_317938...,fe64af1c-3267-43b0-b99e-2843872dc977,97524522611,975245226112,https://cdn4.sharechat.com/4150e3ca-edbf-4a6d-...,...,https://cdn-tc.sharechat.com/contents/moj_3179...,640,https://cdn4.sharechat.com/35ec8618_1688300187...,8,7,2023,2023-07-08,aaap,7-21,7-21
2349,3140541100,70317235831,0,0,1,https://cdn4.sharechat.com/contents/moj_314054...,985a27d6-51fd-42d3-9f47-4b8876e707f3,70317235831,703172358312,https://cdn4.sharechat.com/675d997a-6f17-418e-...,...,https://cdn-tc.sharechat.com/contents/moj_3140...,1080,https://cdn4.sharechat.com/d3d239a_16893336998...,8,6,2023,2023-06-08,yadav,7-14,7-21
1833,3169489071,15368643811,0,0,1,https://cdn4.sharechat.com/contents/moj_316948...,9d392ac0-64a0-47b5-800f-b09af34cd3d7,15368643811,153686438112,https://cdn4.sharechat.com/27b621e3-1c33-4615-...,...,https://cdn-tc.sharechat.com/contents/moj_3169...,720,https://cdn4.sharechat.com/tt_15368643811.jpeg...,30,6,2023,2023-06-30,aaap,7-21,7-21
2020,3174480787,76023385411,0,0,1,https://cdn4.sharechat.com/contents/moj_317448...,31a66621-ea86-4dc8-be07-3889af7b1ad7,76023385411,760233854112,https://cdn4.sharechat.com/5c65f0b3-a711-453b-...,...,https://cdn4.sharechat.com/contents/moj_317448...,1088,,4,7,2023,2023-07-04,narendramodi,7-13,8-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,3168363408,34345078831,0,0,1,https://cdn4.sharechat.com/contents/moj_316836...,b270303f-c84d-4c93-9f51-1b2c5cb82670,34345078831,343450788312,https://cdn4.sharechat.com/212de331-c6ca-49d1-...,...,https://cdn-tc.sharechat.com/contents/moj_3168...,720,https://cdn4.sharechat.com/3428b0d2_1687628901...,29,6,2023,2023-06-29,modi,7-20,7-26
832,3159732894,56445617811,0,0,1,https://cdn4.sharechat.com/contents/moj_315973...,0665164b-0116-4744-9058-0b854ae75172,56445617811,564456178112,https://cdn4.sharechat.com/ab8460ba-dbcd-4ffa-...,...,https://cdn4.sharechat.com/contents/moj_315973...,480,,23,6,2023,2023-06-23,owaisi,7-10,8-16
1681,3141239067,38059355911,0,0,1,https://cdn4.sharechat.com/contents/moj_314123...,12cb742e-fe3b-4745-9108-5aedcf6f733d,38059355911,380593559112,https://cdn4.sharechat.com/df1cedbc-4ba9-4c0b-...,...,https://cdn-tc.sharechat.com/contents/moj_3141...,480,https://cdn4.sharechat.com/tt_38059355911.jpeg...,8,6,2023,2023-06-08,bjp,6-24,7-26
2920,3161201837,74544887831,0,0,1,https://cdn4.sharechat.com/contents/moj_316120...,92e9bf1e-01a9-4e2c-9d47-bf966d184533,74544887831,745448878312,https://cdn4.sharechat.com/9d60f157-3804-46b4-...,...,https://cdn-tc.sharechat.com/contents/moj_3161...,1080,https://cdn4.sharechat.com/2b1eec8b_1689266801...,24,6,2023,2023-06-24,yogi,7-21,7-21


In [9]:
# Read the database credentials (username on line 1, password on line 2).
with open("creds.txt", "r") as credsfile:
    username = credsfile.readline().strip()
    password = credsfile.readline().strip()
# Get OCR rows from database
db_url = f'postgresql://{username}:{password}@localhost:5432/moj'

engine = create_engine(db_url)

query = text('select * from ocr')

# The context manager releases the connection even if the query raises.
with engine.connect() as dbConnection:
    ocrdf = pd.read_sql(query, dbConnection)

In [4]:
# Labelled audio translations from CSV.
audiodf = pd.read_csv("translations/audio/audio_translations_final_labels.csv")

In [5]:
filename_mapping_df = pd.read_csv("download_csvs/id_filename_mapping.csv")
# Inner merge on filename (the default `how`), so rows without a mapping are dropped.
# Rebinds `audiodf`; re-running this cell merges the already-merged frame again.
audiodf= audiodf.merge(filename_mapping_df, on="filename")

In [6]:
# Row count after the merge.
len(audiodf)

3206

In [7]:
# How many post ids appear in audiodf; audiodf.i is converted to str for the comparison.
df.i.isin(audiodf.i.apply(str)).value_counts()

True     3206
False      15
Name: i, dtype: int64

In [8]:
# Post ids that have no row in audiodf.
missing_ids = df[~df.i.isin(audiodf.i.apply(str))].i

In [9]:
# Filenames belonging to the missing post ids.
filename_mapping_df[filename_mapping_df.i.apply(str).isin(missing_ids)].filename

0       6P0OR443POT74XKY1x22TWkZNmPmLySjprbZ
3       2r7wN33WrwU5Xd0L7PbbIKZxp5wRK7SpBvBo
4       xPYr5BBLPrTOx6PADjeeuXd0wb9QyRf3Q4NY
5       8rZK0YYLrKU4BdrywPEESGR1WX1k2OUvXOD1
40                    32eb982c_1687584296231
50      2r7wN33WrwU5Xd0L7PbbIKZr9RZQJ3IplNnJ
58      5rwR5EEnrRUd0X6N2Z11T27NklwNYmcpmBwx
59      4rAQ0LLGrQUKdw6yVXpphPl2eDjLrlINKpJk
414     RV4L655WVLI9EjP0r2ppHd25Lgj6bmIwDxBG
711     11QA4nnK1AHERwnV6AyyTblPJ7r1AVUN9eD6
1255    OGV76559G7iLOWp75K00TmAdR3AgnBUVXZ43
1288    RV4L655WVLI9EjP0r2ppHd2ZGWbL2wUw2LyZ
1400    Nv3d655JvdTVOZ1ly0wwTvZgW1GE41UANPkQ
1903    Zm0ZD559mZCjx85k06XXFXybw5mvQ9Cdx6mj
1918    Drldx22ZrdUpXW2ENbPPFAKGxxgrlQSwr0xL
Name: filename, dtype: object

In [33]:
# Inspect the id column of the mapping (length and dtype).
filename_mapping_df.i

0       3151895074
1       3133842580
2       3160379976
3       3142237093
4       3133160363
           ...    
3216    3137317224
3217    3142020177
3218    3141273341
3219    3142371153
3220    3147376196
Name: i, Length: 3221, dtype: int64

In [12]:
# Load the combined translation file, attach the mapping columns via an inner
# merge on filename, and write the result (index included) to a new CSV.
absolute_final_df = pd.read_csv("translations/audio/audio_translations_all_5_4.csv")
absolute_final_df.merge(filename_mapping_df, on="filename").to_csv("translations/audio/audio_translations_3221.csv")

In [13]:
# Inspect the id column of audiodf (length and dtype).
audiodf.i

0       3150579325
1       3136427586
2       3138105939
3       3178049783
4       3169401772
           ...    
3201    3148856806
3202    3149848389
3203    3149583772
3204    3135317529
3205    3132130872
Name: i, Length: 3206, dtype: int64