In [125]:
import json
import pandas as pd
import numpy as np

In [148]:
class Config:
    """Central place for the knobs used by the rest of the notebook.

    Attributes:
        link_to_dataset: path of the input file handed to the reader.
        only_english: flag forwarded to the preprocessing step.
        only_retweeted: flag forwarded to the preprocessing step.
        KforKMeans: number of clusters requested from k-means.
    """

    def __init__(self):
        # Input data
        self.link_to_dataset = 'decahose_polls_2021-08_100k.txt'

        # Row filters (set together: English on, retweet-only off)
        self.only_english, self.only_retweeted = True, False

        # Clustering
        self.KforKMeans = 25


# Single shared settings object read by the cells below.
config = Config()

In [67]:
class Reader:
    """Loads a JSON-lines file (one JSON object per line) into a DataFrame."""

    def __init__(self, link_to_dataset):
        # Path of the file to load; stored exactly as given.
        self.link_to_dataset = link_to_dataset

    def read(self):
        """Parse every non-blank line as JSON and return the records as a DataFrame.

        Returns:
            pandas.DataFrame with one row per parsed line (empty if the file
            has no records).

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        # Decode explicitly as UTF-8 instead of relying on the platform default,
        # which breaks on emoji / non-ASCII text on some systems.
        with open(self.link_to_dataset, encoding='utf-8') as f:
            # Stream the file line by line (no readlines() copy) and skip blank
            # lines, e.g. a trailing newline at the end of the file.
            polls = [json.loads(line) for line in f if line.strip()]

        return pd.DataFrame.from_records(polls)

In [116]:
class Preprocessor:
    """Trims uninformative columns from a tweets DataFrame and filters its rows.

    The DataFrame passed in is never modified; every step rebinds ``self.df``
    to a new frame, so re-running the preprocessing cell is safe.

    Args:
        df: DataFrame with at least a ``text`` column (and a ``lang`` column
            when English-only filtering is requested).
        only_english: default for ``preprocess_df``; keep only rows whose
            ``lang`` is ``'en'``.
        only_retweeted: default for ``preprocess_df``; keep only rows whose
            text starts with ``'RT'``.
    """

    def __init__(self, df, only_english = True, only_retweeted = False):
        self.df = df
        # Defaults used by preprocess_df() when it is called without arguments.
        self.only_english = only_english
        self.only_retweeted = only_retweeted

    @staticmethod
    def _is_uninformative(column):
        """Return True if the column is entirely null or has at most one distinct value."""
        if column.notnull().sum() == 0:
            return True
        try:
            return column.nunique() <= 1
        except TypeError:
            # nunique() hashes the values; columns holding dicts/lists are
            # unhashable, so they cannot be judged here and are kept.
            return False

    def preprocess_df(self, only_english = None, only_retweeted = None):
        """Drop uninformative columns, add the ``RT`` flag and apply the row filters.

        Args:
            only_english: keep only rows with ``lang == 'en'``. ``None`` means
                "use the value given to the constructor" (default ``True``).
            only_retweeted: keep only rows whose text starts with ``'RT'``.
                ``None`` means "use the value given to the constructor"
                (default ``False``).

        Raises:
            KeyError: if ``text`` (or ``lang`` when filtering by language) is
                not a column of the DataFrame.
        """
        if only_english is None:
            only_english = self.only_english
        if only_retweeted is None:
            only_retweeted = self.only_retweeted

        # Columns read further down must survive the pruning step; otherwise a
        # dataset that is e.g. entirely English would lose 'lang' (a single
        # distinct value) and the language filter would raise KeyError.
        protected = {'text'}
        if only_english:
            protected.add('lang')

        to_drop = [
            col for col in self.df.columns
            if col not in protected and self._is_uninformative(self.df[col])
        ]
        for col in to_drop:
            print(f"dropping {col}")
        self.df = self.df.drop(columns = to_drop)

        # Flag retweets by their textual prefix; missing text counts as "not a retweet".
        self.df = self.df.assign(RT = self.df['text'].str.startswith('RT', na = False))
        if only_retweeted:
            self.df = self.df[self.df['RT']]

        if only_english:
            self.df = self.df[self.df['lang'] == 'en']

    def preprocess_text(self):
        """Placeholder for text cleaning; currently does nothing."""
        # TODO
        return

    def display_info(self):
        """Print the first five rows and the shape of the current DataFrame."""
        print()
        print(self.df.head(5))
        print(f"\nShape: {self.df.shape}")

    def get_dataframe(self, csv_path = './processed_polls.csv'):
        """Save the current DataFrame as CSV, print a summary and return it.

        Args:
            csv_path: where to write the CSV. Pass ``None`` to skip writing.

        Returns:
            The current (processed) DataFrame.
        """
        if csv_path is not None:
            self.df.to_csv(csv_path)
        self.display_info()
        return self.df



In [117]:
# Load the dataset named in the config into a DataFrame.
df = Reader(config.link_to_dataset).read()

In [118]:
# Pass the filter flags to preprocess_df(), the method that applies them.
preprocessor = Preprocessor(df)
preprocessor.preprocess_df(
    only_english = config.only_english,
    only_retweeted = config.only_retweeted,
)
preprocessor.preprocess_text()

# get_dataframe() also writes a CSV copy and prints a short summary.
df = preprocessor.get_dataframe()

dropping quote_count
dropping contributors
dropping reply_count


dropping favorite_count
dropping retweeted
dropping coordinates
dropping retweet_count
dropping favorited
dropping geo
dropping filter_level

   truncated                                               text  \
0      False  RT @CaesarsSports: Thursday Dingers is coming ...   
1      False  RT @Openly: Should sports bodies allow interse...   
2      False                       RT @TheSkyBlueHub: Defenders   
5      False                 How often do you get your haircut?   
6      False                              Let's settle a debate   

   is_quote_status  in_reply_to_status_id                   id  \
0            False                    NaN  1422682179906199556   
1            False                    NaN  1422682226488188940   
2            False                    NaN  1422682282679226368   
5            False                    NaN  1422682359430885382   
6            False                    NaN  1422682375780188164   

                                            entities   tim

In [108]:
# (rows, columns) of the DataFrame returned by the preprocessor.
df.shape

(48865, 29)

In [121]:
# Preview the tweet text column.
df.text

0        RT @CaesarsSports: Thursday Dingers is coming ...
1        RT @Openly: Should sports bodies allow interse...
2                             RT @TheSkyBlueHub: Defenders
5                       How often do you get your haircut?
6                                    Let's settle a debate
                               ...                        
99986    RT @kpopidolvoting: Free‼️ Free‼️ Free‼️\n\n5k...
99988    Do I drop Jalen Hurts and pick up Mac Jones? 1...
99993    RT @CosmicAwards: Who's the artist of the mont...
99996    ✨ Intuition Test 2! ✨ \n\nAnother adventure! W...
99997    RT @MikeAdamOnAir: What’s your favorite new so...
Name: text, Length: 48865, dtype: object

In [143]:
class SentenceTransformerEmbedding:
    """Turns texts into embedding vectors with a SentenceTransformer model.

    Args:
        model_name: name of the model handed to ``SentenceTransformer``.
            Defaults to ``'paraphrase-MiniLM-L6-v2'``.
    """

    def __init__(self, model_name = 'paraphrase-MiniLM-L6-v2'):
        # Imported here so that defining the class does not require the package.
        from sentence_transformers import SentenceTransformer
        self.model_name = model_name
        self.embedder = SentenceTransformer(model_name)

    def embed(self, texts):
        """Encode ``texts`` and return the embeddings.

        Args:
            texts: sequence of strings (list, array or pandas Series).

        Returns:
            Whatever the embedder's ``encode`` returns for the texts; the same
            object is also kept on ``self.embeddings``.
        """
        # Convert to a plain positional array so the encoder never sees pandas
        # index labels (a filtered DataFrame column keeps its original labels).
        texts = np.array(texts)
        self.embeddings = self.embedder.encode(texts)

        return self.embeddings

In [144]:
# Embed the text of every row in df; one embedding per text.
embeddings = SentenceTransformerEmbedding().embed(df.text)

In [145]:
# Shape of the embedding matrix returned by embed().
embeddings.shape

(48865, 384)

In [159]:
class KMeansClustering():
    """Thin wrapper around scikit-learn's KMeans for clustering embeddings.

    Args:
        k: number of clusters.
        random_state: seed handed to KMeans (default 0, for reproducible runs).
    """

    def __init__(self, k = 25, random_state = 0):
        # Imported here so that defining the class does not require scikit-learn.
        from sklearn.cluster import KMeans
        self.clusterer = KMeans(n_clusters = k, random_state = random_state)

    def fit(self, embeddings):
        """Fit k-means on ``embeddings`` and keep the labels and cluster centers."""
        self.clusterer.fit(embeddings)
        self.cluster_assignments = self.clusterer.labels_
        self.cluster_centers = self.clusterer.cluster_centers_

    def sample_from_cluster(self, df, cluster_number, n = 10, random_state = None):
        """Return up to ``n`` randomly chosen texts assigned to one cluster.

        Must be called after ``fit``. ``df`` needs a ``text`` column and one row
        per fitted embedding, in the same order.

        Side effect: writes the cluster labels into ``df['cluster']``.

        Args:
            df: DataFrame whose rows correspond to the fitted embeddings.
            cluster_number: label of the cluster to sample from.
            n: maximum number of texts to return.
            random_state: optional seed for a reproducible sample.

        Returns:
            List of texts; shorter than ``n`` if the cluster has fewer rows and
            empty if no row carries that label.
        """
        df['cluster'] = self.cluster_assignments
        df_cluster = df[df['cluster'] == cluster_number]

        # DataFrame.sample raises ValueError when asked for more rows than
        # exist, so cap the request at the cluster size.
        n_take = min(n, len(df_cluster))
        if n_take == 0:
            return []

        texts = list(df_cluster.sample(n_take, random_state = random_state)['text'])
        return texts

In [160]:
# Cluster the embeddings into config.KforKMeans groups.
clusterer = KMeansClustering(k = config.KforKMeans)
clusterer.fit(embeddings)

In [167]:
# Inspect a random sample of texts assigned to cluster 2 (default sample size is 10).
clusterer.sample_from_cluster(df, 2)

['RT @Jual_Kupon_TTA: 💥LAST GIVEAWAY💥\n\n💢Winner takes call💢\n\n10 coupons contain 30 tickets\n\nRules : FOLLOW &amp; RETWEET\n\nEND : 18 Hours',
 'RT @Jual_Kupon_TTA: 💥LAST GIVEAWAY💥\n\n💢Winner takes call💢\n\n10 coupons contain 30 tickets\n\nRules : FOLLOW &amp; RETWEET\n\nEND : 18 Hours',
 'RT @Jual_Kupon_TTA: 💥LAST GIVEAWAY💥\n\n💢Winner takes call💢\n\n10 coupons contain 30 tickets\n\nRules : FOLLOW &amp; RETWEET\n\nEND : 18 Hours',
 'RT @Jual_Kupon_TTA: 💥LAST GIVEAWAY💥\n\n💢Winner takes call💢\n\n10 coupons contain 30 tickets\n\nRules : FOLLOW &amp; RETWEET\n\nEND : 18 Hours',
 'RT @Jual_Kupon_TTA: 🌟[GIVEAWAY]🌟\n\n💫FIRST PLACE WINNER GET 7 COUPON ISI 30 TIKET💫\n\n💫SECOND PLACE GET 3 COUPON ISI 30 TIKET💫\n\nRULES : FOLLOW…',
 'RT @Jual_Kupon_TTA: 💥LAST GIVEAWAY💥\n\n💢Winner takes call💢\n\n10 coupons contain 30 tickets\n\nRules : FOLLOW &amp; RETWEET\n\nEND : 18 Hours',
 'RT @Jual_Kupon_TTA: 🔥[Another Giveaway]🔥\n\nFirst place gets 7 coupons isi 30 tickets\n\nSecond place gets 3 coupons i

In [None]:
def plot_dendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted hierarchical model and plot it.

    Args:
        model: fitted object exposing ``children_``, ``distances_`` and
            ``labels_``.
            NOTE(review): presumably a scikit-learn AgglomerativeClustering
            fitted so that ``distances_`` is populated - confirm at the call site.
        **kwargs: forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``
            (e.g. ``truncate_mode``, ``p``, ``no_plot``).

    Returns:
        The dictionary returned by ``scipy.cluster.hierarchy.dendrogram``.
    """
    # `dendrogram` is not imported anywhere else in the notebook; import it
    # locally so the function is self-contained.
    from scipy.cluster.hierarchy import dendrogram

    # Create the counts of samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to an earlier merge, whose size
                # has already been stored in `counts`.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Linkage rows are [child_a, child_b, distance, sample_count].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram.
    return dendrogram(linkage_matrix, **kwargs)