# setup

In [2]:
!pip install bertopic
#!pip install bertopic --no-build-isolation --no-binary :all: # in case of conflicting with numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.12.0-py2.py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 4.5 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 7.2 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 59.0 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml<6.0
  Downloading PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
[K     |████████████████████████████████| 662 kB 70.6 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 6.1 MB/s 
Collectin

# load data

In [3]:
#@title Drive Downloader

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os

download_with_pydrive = True #@param {type:"boolean"}  

class Downloader(object):
    def __init__(self, use_pydrive):
        self.use_pydrive = use_pydrive

        if self.use_pydrive:
            self.authenticate()
        
    def authenticate(self):
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        self.drive = GoogleDrive(gauth)
    
    def download_file(self, file_id, file_dst):
        if self.use_pydrive:
            downloaded = self.drive.CreateFile({'id':file_id})
            downloaded.FetchMetadata(fetch_all=True)
            downloaded.GetContentFile(file_dst)
        else:
            !gdown --id file_dst

downloader = Downloader(download_with_pydrive)

In [4]:
current_directory = os.getcwd()
save_path = os.path.join(os.path.dirname(current_directory), "data")
os.makedirs(save_path, exist_ok=True)
model_path = os.path.join(save_path, 'sent_tweet_data.csv')

In [5]:
downloader.download_file("1k8HXp4vUBBbVqKu3kBYnu-HLDQdAdzrf", file_dst=model_path)

In [7]:
import pandas as pd

In [8]:
data = pd.read_csv("/data/sent_tweet_data.csv")
data.head()

Unnamed: 0,Tweet_Id,Tweet,sentiment_analysis,sentiment,score
0,1.598104e+18,@elonmusk I thunk you're an irresponsible for ...,"('negative', 0.8987942934036255)",negative,0.898794
1,1.598104e+18,"Apparently ""I've had COVID for over a week and...","('negative', 0.5844972133636475)",negative,0.584497
2,1.598103e+18,@snakewasright @EllenOl00313689 @iTruthSearch ...,"('negative', 0.7049756646156311)",negative,0.704976
3,1.598103e+18,They're dropping the sequel to COVID baby wooo...,"('neutral', 0.5086733102798462)",neutral,0.508673
4,1.598101e+18,@1goodtern They drop dead one by one of post C...,"('negative', 0.47382649779319763)",negative,0.473826


## positive data 

In [9]:
# get the tweet text and sentiment columns whre sentiment is positive as list
positive_data = data[data["sentiment"] == "positive"]
positive_data.head()


Unnamed: 0,Tweet_Id,Tweet,sentiment_analysis,sentiment,score
8,1.598098e+18,"My friend spencer has covid, good thing he ate...","('positive', 0.48467978835105896)",positive,0.48468
16,1.598092e+18,"""I had my baby shower here. Very big space, th...","('positive', 0.9333032369613647)",positive,0.933303
23,1.598088e+18,"Nat de covid 😭✨ Se cuide, baby https://t.co/yE...","('positive', 0.7633681893348694)",positive,0.763368
36,1.598079e+18,Get Well Soon my baby 🥹🍼 I know you're strong ...,"('positive', 0.947133481502533)",positive,0.947133
75,1.598065e+18,There’s a chosen family baby I know who’s just...,"('positive', 0.5353631377220154)",positive,0.535363


In [10]:
positive_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45263 entries, 8 to 446181
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Tweet_Id            45263 non-null  float64
 1   Tweet               45263 non-null  object 
 2   sentiment_analysis  45263 non-null  object 
 3   sentiment           45263 non-null  object 
 4   score               45263 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.1+ MB


## Some Tweet Cleaning

In [11]:
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
URL_REGEX = re.compile('http(s)?:\/\/t.co\/\w+')
MENTION_REGEX = re.compile('@\w+')

def clean_tweet(tweet):
    # remove mentions, the pound sign, and replace urls with URL token
    tweet = re.sub(URL_REGEX, '', tweet)
    tweet = re.sub(MENTION_REGEX, '', tweet)
    tweet = tweet.replace('#', '')
    tweet = re.sub(r'\d+', '', tweet)
    return tweet.strip()

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)


def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))


def remove_emojis(text):
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    
    return wierd_pattern.sub(r'', text)

In [13]:
def tweet_cleaner(tweet):
    tweet = clean_tweet(tweet)
    tweet = remove_emojis(tweet)
    tweet = remove_punctuation(tweet)
    tweet = remove_stopwords(tweet)
    tweet = lemmatizing(tweet)
    return tweet

In [14]:
positive_data["cleaned_tweet"] = positive_data["Tweet"].apply(lambda s: tweet_cleaner(s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_data["cleaned_tweet"] = positive_data["Tweet"].apply(lambda s: tweet_cleaner(s))


In [15]:
positive_data.head()

Unnamed: 0,Tweet_Id,Tweet,sentiment_analysis,sentiment,score,cleaned_tweet
8,1.598098e+18,"My friend spencer has covid, good thing he ate...","('positive', 0.48467978835105896)",positive,0.48468,My friend spencer covid good thing ate pound e...
16,1.598092e+18,"""I had my baby shower here. Very big space, th...","('positive', 0.9333032369613647)",positive,0.933303,I baby shower Very big space owner accommodati...
23,1.598088e+18,"Nat de covid 😭✨ Se cuide, baby https://t.co/yE...","('positive', 0.7633681893348694)",positive,0.763368,Nat de covid Se cuide baby
36,1.598079e+18,Get Well Soon my baby 🥹🍼 I know you're strong ...,"('positive', 0.947133481502533)",positive,0.947133,Get Well Soon baby I know youre strong COVID w...
75,1.598065e+18,There’s a chosen family baby I know who’s just...,"('positive', 0.5353631377220154)",positive,0.535363,There ’ chosen family baby I know ’ cuddliest ...


In [16]:
positive_tweets = positive_data["cleaned_tweet"].tolist()

In [17]:
len(positive_tweets)

45263

In [18]:
from bertopic import BERTopic

[Topic Model Spesifications:](https://www.sbert.net/docs/pretrained_models.html#other-models) 

In [19]:
topic_model = BERTopic(verbose=True, embedding_model="all-mpnet-base-v2", min_topic_size=70,  calculate_probabilities=False)

In [20]:
topics, probs = topic_model.fit_transform(positive_tweets)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1415 [00:00<?, ?it/s]

2022-12-21 21:23:48,153 - BERTopic - Transformed documents to Embeddings
2022-12-21 21:25:16,745 - BERTopic - Reduced dimensionality
2022-12-21 21:25:23,546 - BERTopic - Clustered reduced embeddings


In [21]:
freq = topic_model.get_topic_info()

In [22]:
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,1992,-1_covid_baby_vaccine_book
1,0,41548,0_covid_baby_pregnant_birth
2,1,250,1_awwrrite_pau_hana_another
3,2,218,2_florence_nurses_nightingale_nurse
4,3,170,3_walrus_zoo_announced_tiger
5,4,162,4_offer_protection_best_lilybass
6,5,143,5_movement_kicksstillcount_usually_midwives
7,6,143,6_offer_protection_best_woman
8,7,140,7_mondayfriday_helpline_ampm_relating
9,8,117,8_meet_ciara_win_vogue


In [23]:
topic_model.get_topic(0)  # Select the most frequent topic

[('covid', 0.04586265892117062),
 ('baby', 0.03314259972855368),
 ('pregnant', 0.024139047972674486),
 ('birth', 0.023760891825363326),
 ('pregnancy', 0.02295626374194037),
 ('amp', 0.018665354216312942),
 ('vaccine', 0.017568640569704797),
 ('get', 0.015928794610321467),
 ('year', 0.014696236993559942),
 ('got', 0.014054257190675113)]

In [24]:
topic_model.get_topic(1)

[('awwrrite', 0.6084702700411074),
 ('pau', 0.257108074572489),
 ('hana', 0.25680978192665505),
 ('another', 0.21293691820284497),
 ('day', 0.1884803404768704),
 ('made', 0.1843241796892119),
 ('time', 0.17003288596742347),
 ('ot', 0.0929946293205013),
 ('baby', 0.08554083317551507),
 ('eth', 0.08462769867456921)]

In [25]:
topic_model.get_topic(2)

[('florence', 0.18872762630670584),
 ('nurses', 0.15930879072892584),
 ('nightingale', 0.14483905373101733),
 ('nurse', 0.14115859120708202),
 ('anniversary', 0.12863400591356125),
 ('international', 0.112623319909238),
 ('internationalnursesday', 0.10163086417742599),
 ('day', 0.07854208273888744),
 ('birth', 0.07559745420752954),
 ('th', 0.07319681215245964)]

# Topic Visualization

In [26]:
topic_model.visualize_topics()

In [27]:
topic_model.visualize_barchart()

In [28]:
topic_model.visualize_heatmap()

In [29]:
topic_model.visualize_term_rank()

In [30]:
topic_model.visualize_hierarchy()

## Save/Load BERTopic model

In [31]:
topic_model.save("positive_model")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [32]:
topic_model = BERTopic.load("positive_model")
