## -Welcome to our Twitter Bot Topic Modeler-

In [24]:
import pandas as pd

### 1. Loading the raw data

In [25]:
df = pd.read_csv('data/processed.csv')
print(df.shape)
print(df.columns)
print(df.head())

(237487, 2)
Index(['labels', 'text'], dtype='object')
  labels                                               text
0    bot  @aprilPINKie Justice isn’t going anywhere right?!
1    bot  Last game at DKR in 2022! Longhorns for Christ...
2    bot  @CFBONFOX Texas goes 4-0 and makes the big 12 ...
3    bot                      @hookemcowboys It’s from 2020
4    bot  @CFBONFOX The Texas Longhorns. Nothing better ...


##### (a) Remove non-bot (human) tweets

In [26]:
bot_tweets = df[(df['labels'] == 'bot')]
print(bot_tweets.shape)
print(bot_tweets.head())

(14581, 2)
  labels                                               text
0    bot  @aprilPINKie Justice isn’t going anywhere right?!
1    bot  Last game at DKR in 2022! Longhorns for Christ...
2    bot  @CFBONFOX Texas goes 4-0 and makes the big 12 ...
3    bot                      @hookemcowboys It’s from 2020
4    bot  @CFBONFOX The Texas Longhorns. Nothing better ...


### 2. Data Cleaning
Here we will tidy up the tweets. Remove hashtags, links, @mentions, and any punctuation and emojis and marking if a tweet is a retweet. We will also be converting those words into to their root form in the process called rooting/lemmatization.

Our topic model needs need these things removed before converting it into vectors.

In [27]:
import nltk
import emoji # python3 -m pip install emoji --upgrade
from nltk.tokenize import RegexpTokenizer
# run the following if you get an SSL error
# "/Applications/Python 3.10/Install Certificates.command"
import re
import string


In [28]:
stopWords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
#patterns
mentions_pattern =re.compile(r'@\w*')
url_pattern = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
hashtags_pattern = re.compile(r'#\w*')
reserved_words_pattern = re.compile(r'(RT|rt|FAV|fav|VIA|via)')
empty_spaces = re.compile(r'\s{2,}|\t')

In [29]:
def tweet_cleaner(text):

    text = str(text) + ""
    #set text to lowercase
    text = text.lower()

    #remove mentions
    text = re.sub(pattern=mentions_pattern, repl='', string=text)

    #remove urls
    text = re.sub(pattern=url_pattern, repl='', string=text)

    #remove links
    text = re.sub(r'http(s)?\S+', '', text)

    #remove @ tags of users
    text = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)  # remove retweet
    text = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)  # remove tweeted at
    
    #remove twitter reserved_words
    text = re.sub(pattern=reserved_words_pattern, repl='', string=text)

    #remove hashtags
    #text = re.sub(pattern=hashtags_pattern, repl='', string=text)

    #remove emojis
    text = emoji.replace_emoji(text, "")

    #remove words with less than 3 characters
    text_list = text.split(' ')
    for x in text_list:
        if len(x) <= 2:
            text_list.remove(x)
    text = ' '.join(text_list)

    #remove stopwords and apply rooting
    if len(text) > 0:
        text = "the " + text
        newText = [word for word in text.split(' ')
                            if word not in stopWords]
        newText = [word_rooter(word) if '#' not in word else word
                            for word in newText]
        text = ' '.join(newText)

    #remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation)) #this removes hashtag symbols as well
    text = re.sub('[' + '!"$%&\'’()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`' + ']+', '', text)

    #remove numbers
    text_list = text.split(' ')
    for x in text_list:
        if x.isnumeric():
                text_list.remove(x)
    text = ' '.join(text_list)
    #text = re.sub('([0-9]+)', '', text)
        
    #remove words with 2 or less characters
    text_list = text.split(' ')
    for x in text_list:
        if len(x) <= 2:
            text_list.remove(x)
    text = ' '.join(text_list)

    #remove empty spaces
    text = re.sub(pattern=empty_spaces, repl=' ', string=text)
    text = re.sub('\s+', ' ', text)

    #remove empty lines
    text = re.sub('\s+', ' ', text)
    
    return text.strip()


In [30]:
bot_tweets['cleaned_bot_tweets']  = bot_tweets['text'].apply(lambda x: tweet_cleaner(x))
bot_tweets = bot_tweets[(bot_tweets['cleaned_bot_tweets'].str.len() > 2)]
print(bot_tweets.shape)
print(bot_tweets)

(13599, 3)
       labels                                               text  \
0         bot  @aprilPINKie Justice isn’t going anywhere right?!   
1         bot  Last game at DKR in 2022! Longhorns for Christ...   
2         bot  @CFBONFOX Texas goes 4-0 and makes the big 12 ...   
4         bot  @CFBONFOX The Texas Longhorns. Nothing better ...   
5         bot  @UTBarstool Unfortunately we don’t play for tw...   
...       ...                                                ...   
226413    bot  @prof_mirya Why doesn’t gd give us a November ...   
226414    bot  @SumitaPahwa @alixabeth Absolutely lovely Xmas...   
226416    bot  @prof_mirya I would watch a demo of you packin...   
226417    bot  @prof_mirya “ Dream of living out in the woods...   
226418    bot                         @dannagal It’s the smolder   

                                       cleaned_bot_tweets  
0                               justic isnt anywher right  
1       last game dkr longhorn christ park lot open…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bot_tweets['cleaned_bot_tweets']  = bot_tweets['text'].apply(lambda x: tweet_cleaner(x))


In [31]:
bot_tweets.drop(columns='text', inplace=True)
bot_tweets.drop(columns='labels', inplace=True)
print(bot_tweets.shape)
print(bot_tweets.head())

(13599, 1)
                                  cleaned_bot_tweets
0                          justic isnt anywher right
1  last game dkr longhorn christ park lot open…so...
2  texa goe make big championship end rematch bam...
4             texa longhorns noth better burnt orang
5                          unfoun dont play two week


### 3. Topic Modelling
In this stage we will build a fitted topic model for the bot tweets. Model will be our LDA (Latent Dirichlet Allocation) algorithm model object which will holds parameters such as the number of topics that we provided it when we made it. Model also it also stores functions such as the fitting method. Once it is fit it will store fitted parameters that will tell us how valuable different words are in various topics. 

##### Lets take a look at the most repeated tweets

In [32]:
print("There are " + str(bot_tweets["cleaned_bot_tweets"].unique().shape[0]) + " unique tweets.")
most_repeated_tweets = bot_tweets.groupby(['cleaned_bot_tweets']).size().reset_index(name='count').sort_values('count', ascending=False).head(10)
print("Most repeated tweets:")
print(most_repeated_tweets)

There are 12436 unique tweets.
Most repeated tweets:
                                      cleaned_bot_tweets  count
8448   s4s lt3 view free sex video collection updat e...     47
6093   lt3 view free sex video collection updat every...     46
6934                                                nice     42
2932   environment plan essenti oper everi project he...     40
2009                                     congratulations     37
9637                                               thank     37
9855                                           thank you     33
5296   join intern conferec upadsd cairo egypt abstra...     30
1953                                           congratul     30
11463                                                wow     25


In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#highest max_df is 0.045
#lowest min_df is ...
#Research shows that 25% of blog posts are made up of stop words so well set max_df to 0.25
vectorizer = CountVectorizer(max_df=0.25, min_df=0.005, token_pattern='\w+|\$[\d\.]+|\S+') # used to transform text to vector form

dtm = vectorizer.fit_transform(bot_tweets["cleaned_bot_tweets"]).toarray()

featureNames = vectorizer.get_feature_names_out() # what word each column in the matric represents

numberOfTopicsToDisplay = 15

model = LatentDirichletAllocation(n_components=numberOfTopicsToDisplay, random_state=0)

model.fit(dtm)

def print_topics(model, featureNames, noTopWords):
    topic_map = {}
    for topicIdx, topic in enumerate(model.components_):
        topic_map["Topic %d list" % (topicIdx + 1)]= ['{}'.format(featureNames[i])
                        for i in topic.argsort()[:-noTopWords - 1:-1]]
        topic_map["T%d weights" % (topicIdx + 1)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-noTopWords - 1:-1]]
    return pd.DataFrame(topic_map)

noTopWords = 10
print_topics(model, featureNames, noTopWords)

Unnamed: 0,Topic 1 list,T1 weights,Topic 2 list,T2 weights,Topic 3 list,T3 weights,Topic 4 list,T4 weights,Topic 5 list,T5 weights,...,Topic 11 list,T11 weights,Topic 12 list,T12 weights,Topic 13 list,T13 weights,Topic 14 list,T14 weights,Topic 15 list,T15 weights
0,aificialintelligence,169.1,one,403.8,video,319.1,paper,393.8,new,378.4,...,help,172.4,use,431.0,like,342.0,would,246.9,get,430.5
1,tech,169.1,day,336.7,free,229.1,accept,172.1,year,238.3,...,everi,147.1,que,284.1,peopl,251.8,think,238.4,find,218.5
2,cloud,155.0,hope,169.3,lt3,217.1,full,154.1,see,227.6,...,project,146.8,para,161.1,make,243.1,want,203.9,still,215.8
3,robot,132.1,del,127.1,view,140.1,science,130.1,pleas,190.8,...,link,146.3,vegaswap,141.1,way,223.4,even,152.8,know,138.1
4,technology,117.3,code,124.0,updat,136.9,research,129.8,look,179.8,...,happi,137.0,crypto,134.1,that,161.2,train,144.1,now,137.1
5,iot,110.1,nice,122.1,work,129.7,good,129.2,work,171.0,...,plan,122.5,blockchain,113.1,take,160.7,model,122.5,let,119.5
6,ai,107.1,today,104.1,sex,127.1,publish,127.1,data,163.8,...,may,120.5,defi,108.1,talk,157.1,like,118.5,come,114.4
7,machinelearning,106.1,first,95.9,right,119.7,2022,125.1,excit,155.1,...,confer,119.4,cryptocurrency,102.1,dont,130.9,time,103.9,work,114.3
8,con,105.2,orang,93.1,everyday,112.1,web,119.1,share,151.1,...,meet,119.1,market,99.5,life,130.9,move,92.1,this,109.1
9,cloudcomputing,100.1,news,88.9,improv,106.4,phd,119.1,present,147.1,...,use,115.0,vga,86.1,need,130.5,know,86.3,play,100.5
