# GitH_train_model

## Data

In [None]:
import pandas as pd


In [None]:
!unzip ResPapers.zip

In [None]:
# Load ResPapers.csv into a DataFrame; the "ABSTRACT" column is read from it below.
data = pd.read_csv('ResPapers.csv')

In [None]:
# Collect the abstracts as a plain Python list of documents, then display the
# number of rows in the dataset.
# NOTE(review): assumes every ABSTRACT value is a string (no missing values) — confirm.
abstract_column = data["ABSTRACT"]
docs = abstract_column.tolist()
len(data)

### Text processing

We use the tokenizer from scikit-learn's CountVectorizer to build the vocabulary, retaining only words that appear at
least 5 times in the dataset.

In [None]:
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

### Tokenize

In [None]:
# Build the vocabulary: keep every token that occurs at least MIN_FREQUENCY
# times across all documents.
MIN_FREQUENCY = 5

# Tokenizer with CountVectorizer's default token pattern.
tokenizer = CountVectorizer().build_tokenizer()

# Lowercase before tokenizing: CountVectorizer lowercases documents by default
# when it later matches them against a fixed vocabulary, so mixed-case entries
# would never be matched, and case variants of the same word ("The"/"the")
# would have their counts split around the frequency threshold.
token_counts = collections.Counter()
for doc in tqdm(docs):
    token_counts.update(tokenizer(doc.lower()))

vocab = [word for word, frequency in token_counts.items() if frequency >= MIN_FREQUENCY]
len(vocab)

### Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Create embeddings
# Load the pretrained all-MiniLM-L6-v2 sentence-transformer and encode every
# document in `docs`, showing a progress bar while encoding. `model` is reused
# later as the topic model's embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(docs, show_progress_bar=True)

In [None]:
# Persist the document embeddings to 'embeddings.npy' so they can be reloaded
# instead of being recomputed.
import numpy as np

np.save('embeddings.npy', embeddings)

In [None]:
# Load embeddings
# Reads back the array written to 'embeddings.npy' by the save cell and rebinds
# `embeddings` to it. Relies on `np` having been imported in the save cell.
embeddings = np.load('embeddings.npy')

### Train model

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

# Prepare sub-models

# Dimensionality reduction: project the embeddings down to 5 components using
# cosine distance; random_state is fixed so repeated runs give the same layout.
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)

# Clustering of the reduced embeddings.
# NOTE(review): prediction_data=False keeps the clusterer lighter, but HDBSCAN
# may then be unable to assign clusters to unseen documents — confirm this is
# acceptable if the saved model will be used to transform new documents.
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20)

# Topic representation: term counts restricted to the precomputed vocabulary.
# NOTE(review): with a fixed vocabulary, ngram_range=(1, 3) only matters if
# `vocab` contains multi-word entries — confirm whether n-grams are intended.
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english", ngram_range=(1, 3))

# Fit on the documents together with their precomputed embeddings so BERTopic
# does not re-encode every document with `model` a second time.
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
).fit(docs, embeddings)

In [None]:
# Save the fitted topic model under the path 'ResPapers_model'.
topic_model.save('ResPapers_model')

In [None]:
# Display the fitted model's per-topic summary (the cell's last expression is rendered as output).
topic_model.get_topic_info()

### Visualize

In [None]:
# Render BERTopic's bar-chart visualization of the fitted topics.
topic_model.visualize_barchart()