In [None]:
!pip install bertopic



In [None]:
import pandas as pd
from bertopic import BERTopic
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Extra words to ignore on top of NLTK's English stop-word list.
custom_stopwords = {
    'dont', 'think', 'people', 'know', 'japanese', 'thing', 'really', 'one',
    'went', 'well', 'said', 'time', 'thats', 'got', 'didnt', 'wasnt', 'things',
    'yeah', 'lot', 'say', 'remember', 'much', 'even', 'go', 'something',
    'guess', 'see', 'oh', 'u', 'kind', 'used', 'us', 'would',
}

# Make sure the stop-word corpus is available locally, then merge the NLTK
# list and the custom list into the single lookup set used by clean_text.
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) | custom_stopwords

# A token is any maximal run of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')
# Shared stemmer instance, reused for every transcript.
p_stemmer = PorterStemmer()

def clean_text(text):
    """Normalise one transcript into a space-separated string of stems.

    Steps: lowercase, split into ``\\w+`` tokens, drop stop words and any
    token containing a digit, Porter-stem what is left, and drop stems that
    are themselves stop words.

    Args:
        text: The raw transcript as a string.

    Returns:
        A single string of the surviving stems joined by one space. The
        string is empty when every token was filtered out.
    """
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        if token in stop_words or any(char.isdigit() for char in token):
            continue
        stem = p_stemmer.stem(token)
        # Stemming can map an inflected word back onto a stop word
        # (e.g. "going" -> "go"). Filtering only before stemming let those
        # through, which is why "go" showed up among the top topic terms
        # despite being listed in custom_stopwords. Re-check the stem.
        if stem in stop_words:
            continue
        stems.append(stem)
    return ' '.join(stems)

# Load the transcripts; the text lives in the column labelled '0'.
df = pd.read_csv('All_category_3.csv')
transcripts = df['0'].dropna()

# Run every non-missing transcript through the cleaning pipeline.
cleaned_transcripts = list(map(clean_text, transcripts))

# Fit BERTopic, asking it to end up with no more than 15 topics.
# NOTE(review): no random seed is set anywhere in this cell, so the fitted
# topics may differ between runs - confirm whether that is acceptable.
topic_model = BERTopic(language="english", nr_topics=15)
topics, probs = topic_model.fit_transform(cleaned_transcripts)

# Show the full {topic id: [(term, weight), ...]} mapping as the cell output.
topic_model.get_topics()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{-1: [('interview', 0.03564989612414551),
  ('famili', 0.02735624906496101),
  ('like', 0.023188694246710266),
  ('get', 0.020152490971007183),
  ('go', 0.019117989117878598),
  ('father', 0.01845781301225887),
  ('american', 0.017853080060934355),
  ('camp', 0.01783960786324188),
  ('parent', 0.017523711001407715),
  ('back', 0.01706226013308937)],
 0: [('father', 0.024102183243119843),
  ('like', 0.022124815478746944),
  ('famili', 0.02189988326442143),
  ('get', 0.01753474131799858),
  ('happen', 0.017405733468839824),
  ('case', 0.017239151908489524),
  ('question', 0.016411576713974363),
  ('go', 0.016361398756419204),
  ('back', 0.016270865877917914),
  ('talk', 0.01530045294912201)],
 1: [('camp', 0.06327291002607602),
  ('area', 0.036309913135418224),
  ('barrack', 0.035988177359812065),
  ('block', 0.03554153489733309),
  ('room', 0.02858759516638631),
  ('live', 0.026374260984657835),
  ('hotel', 0.02613433152007483),
  ('els', 0.02419851659818977),
  ('train', 0.022295945429

In [None]:
# Per-topic summary table from the fitted model; the displayed columns are
# Topic, Count, Name, Representation and Representative_Docs.
freq = topic_model.get_topic_info()
freq

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3640,-1_interview_famili_like_get,"[interview, famili, like, get, go, father, ame...",[um want ask allud mention prison assembl cent...
1,0,13533,0_father_like_famili_get,"[father, like, famili, get, happen, case, ques...","[interview happen famili, father reason fbi ca..."
2,1,3150,1_camp_area_barrack_block,"[camp, area, barrack, block, room, live, hotel...","[camp, camp, camp]"
3,2,1142,2_caucasian_american_ethnic_white,"[caucasian, american, ethnic, white, student, ...","[caucasian, caucasian, caucasian]"
4,3,780,3_farm_land_money_farmer,"[farm, land, money, farmer, paid, sell, market...","[busi leas farm properti farm, take farm like ..."
5,4,254,4_lake_tule_water_river,"[lake, tule, water, river, bath, jerom, term, ...","[interview place tule lake, soon mention tule ..."
6,5,100,5_quit_long_gener_record,"[quit, long, gener, record, gordon, public, pr...","[quit quit handl problem, quit quit handl prob..."
7,6,74,6_tension_friction_physic_examin,"[tension, friction, physic, examin, weak, caus...","[mention tension, happen tension, happen tension]"
8,7,54,7_rumor_fred_beat_owen,"[rumor, fred, beat, owen, valley, apart, heard...","[rumor around area, rumor go around, rumor go ..."
9,8,50,8_base_build_custom_popul,"[base, build, custom, popul, schedul, devast, ...","[base, base, base]"


In [None]:
# Examples: pull the (term, weight) lists for topics 0 and 1 and show them
# side by side, separated by a row of asterisks.
topic0, topic1 = (topic_model.get_topic(topic_id) for topic_id in (0, 1))
(
    'Topic0',
    topic0,
    '**********************************************',
    'Topic 1',
    topic1,
)

('Topic0',
 [('father', 0.024102183243119843),
  ('like', 0.022124815478746944),
  ('famili', 0.02189988326442143),
  ('get', 0.01753474131799858),
  ('happen', 0.017405733468839824),
  ('case', 0.017239151908489524),
  ('question', 0.016411576713974363),
  ('go', 0.016361398756419204),
  ('back', 0.016270865877917914),
  ('talk', 0.01530045294912201)],
 '**********************************************',
 'Topic 1',
 [('camp', 0.06327291002607602),
  ('area', 0.036309913135418224),
  ('barrack', 0.035988177359812065),
  ('block', 0.03554153489733309),
  ('room', 0.02858759516638631),
  ('live', 0.026374260984657835),
  ('hotel', 0.02613433152007483),
  ('els', 0.02419851659818977),
  ('train', 0.022295945429379856),
  ('car', 0.022057728568167064)])

In [None]:
# Render BERTopic's bar-chart view of the fitted model for up to 15 topics.
topic_model.visualize_barchart(top_n_topics= 15)