In [1]:
# Libraries
import re
import os
import nltk
import requests
import unidecode
import unicodedata
import contractions
import pandas as pd
import datetime as dt
from dotenv import load_dotenv
from collections import Counter
from nltk.corpus import stopwords
from googletrans import Translator

import warnings
warnings.filterwarnings('ignore')

In [2]:
# loadinng credenttials as environmen variables
load_dotenv('/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/credentials.env', 
            override = True)

True

In [3]:
# getting twitter credentials
twitter_key = os.environ.get('api_key')
twitter_secret_key = os.environ.get('api_secret_key')
bearer_token = os.environ.get('bearer_token')

In [4]:
# to get city coordinates
#geolocator = Nominatim(user_agent = 'bmartin')

# Get current date
today = dt.datetime.today()
#today = today.strftime("%Y-%m-%d %H:%M")
today = today.strftime("%Y%m%d_%H_%M")
today

'20221101_20_05'

## 1. Twitter API

### 1.1 About the Twitter API

The Twitter API can be used to retrieve and analyze data, as well as engage with the conversation on Twitter. It provides access to a variety of different resources including:
- Tweets
- Users
- Direct Messages
- Lists
- Trends
- Media
- Places

The Twitter API currently consists of two supported versions, as well as different access tiers. 
- **Standard v1.1**: The legacy standard endpoints provide access to the following resources with the standard v1.1 offerings.
    
    - Get Tweet timelines
    - Curate a collection of Tweets
    - Filter realtime Tweets
    - Sample realtime Tweets
    - Manage and pull public account information
    - Get trends near a location
    - Get locations with trending topics
    - Get information about a place    
    
    
- **Twitter API v2 Early Access**: A new Twitter API is being build with a modern and more sustainable foundation as well as an improved developer experience. The first endpoints are now available within Early Access, and enable users to listen to and analyze the public conversation. Additional endpoints, features, and access levels will be released soon.
    
    - Ability to request specific objects and fields.
    - New and more detailed data objects
    - Advanced metrics return in Tweets (including impressions, video views, user profile and URL clicks)
    - Insights on Tweet topics with annotations (filter by topic using `entity` and `context` operators)
    - Improved conversation tracking
    - Academic Research product track (grants free access to full-archive search)
    - High confidence spam filtering

## 1.2 Retrieve data
### 1.2.1 Search Tweets

The [search endpoint](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent) returns Tweets from the last seven days that match a search query. Parameters are listed below:
- `query`(required): rule for matching Tweets.
- `expansions`: Expansions enable users to request additional data objects that relate to the originally returned Tweets.
- `max_results`: The maximum number of results to be returned.
- `next_token`: This parameter is used to get the next 'page' of results. 
- `place.fields`: Enables to select specific place fields that will be delivered in each returned Tweet. 
- `tweet.fields`: Enables to select specific Tweet fields that will be delivered in each returned Tweet object.
- `user.fields`: Enables to select specific user fields that will be delivered in each returned Tweet.

In [5]:
def search_tweets(query, bearer_token = bearer_token, next_token = None):    
    """
    Function to request tweets according to a specific query.
    
    Inputs:
        - query: A string that will be used to find tweets.
                 Tweets must match this string to be returned.
        - bearer_token: Security token from Twitter API.
        - next_token: ID of the next page that matches the specified query.
        
    Outputs: Dictionary (json type) with the requested data.  
    """
    
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    
    # end point
    url = f"https://api.twitter.com/2/tweets/search/recent?query={query}&"

    params = {
        # select specific Tweet fields from each returned Tweet object
        'tweet.fields': 'text,created_at,lang,possibly_sensitive', # public_metrics
        
        # maximum number of search results to be returned (10 - 100)
        'max_results': 100,
        
        # additional data that relate to the originally returned Tweets
        'expansions': 'author_id,referenced_tweets.id,geo.place_id',
        
        # select specific place fields 
        "place.fields": 'country,full_name,name',
        
        # select specific user fields
        "user.fields": 'location',
        
        # get the next page of results.
        "next_token": next_token,
    }
    
    # request
    response = requests.get(url = url, params = params, headers = headers)

    # verify successfull request
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
        
    else:
        return response.json()

In [6]:
query = "world cup"

# search term
search_tweet = search_tweets(query = query)

# 4 main keys
search_tweet.keys()

dict_keys(['data', 'includes', 'meta'])

## 2. Generate some data

After having requested tweets data, some dataframes were generated with different data.

- **tweets**: Pandas dataframe with information about tweets. The data from this dataframe is the required to perform text classification as well as to generate some visualizations. Columns:
    - `text`: Tweet content.
    - `lang`: Tweets' original language. Although some tweets are not in English, during the pre-processing process tweets are translated from the origin language to English.
    - `possibly_sensitive`: Boolean. Specifies if the tweet might be sensitive. 
    - `tweet_id`: Tweet's unique identifier.
    - `created_at`: When the tweet was created (tweeted).
    - `type`: Type of tweet: original tweet, replied (reply from another tweet), quoted or retweeted.
    
    
- **users**: Pandas dataframe with users information. Columns:
    - `user_id`: User's unique identifier.
    - `username`: User's username.
    - `name`: Name that is displayed on Twitter.
    - `location`: User's location. In Twitter there is this field in the user's biography were users can specify their location. Maybe this location is where they were born, where they currently live or just a random place. However, many users do not really include a geographical location in there, some of them just write something else such as their pronouns. So this field do not necessary specifies a geographical location.


- **places**: Pandas dataframe about places where a tweet was tweeted.
    - `country`: Country where a tweet was tweeted.
    - `full_name`: City, countrty where a tweet was tweeted.
    - `geo.place_id`: Unique identifier of location.
    - `name`: City name where a tweet was tweeted.

In [7]:
def create_dataframes(json_tweets, today):
    
    """
    Function to create and organize different data into specific data frames.
    
    Inputs:
        - json_tweets: A dictionary with tweets data.
    
    Outputs: 
        - tweets: Pandas dataframe with relevant information about tweets (to
                  further perform text classification).
                  
        - users: Pandas dataframe with users information.
        
        - places (optional): Pandas dataframe about places where users tweeted. If not a 
                  single tweets contains the place where it was tweeted, then
                  this dataframe will not be returned.
    """
        
    # Create users dataframe
    users = pd.json_normalize(json_tweets['includes']['users']).rename(columns = {"id":"user_id"})
    
    # Create df with tweet's data
    tweets = pd.json_normalize(json_tweets['data']).rename(columns = {"id":"tweet_id", 
                                                                      "geo.place_id":"geo_place_id"})
        
    # Get tweet's type
    tweets['type'] = tweets.referenced_tweets.apply(lambda x: x[0]["type"] if type(x) == list else None)
    
    # get referenced tweets ids
    tweets["ref_tweet_id"] = tweets.referenced_tweets.apply(lambda x: x[0]['id']\
                                                            if isinstance(x, list) else x)

        
    # Drop retweeted tweets and tweets with undefined anguage
    #tweets = tweets[tweets["type"] != "retweeted"].reset_index(drop = True)
    tweets = tweets[tweets["lang"] != "und"]
        
    # id to string
    tweets["tweet_id"] = tweets["tweet_id"].astype(str)
        
    ## List of users in tweets dataframe to only 
    ## keep users from tweets dataframe
    #user_list = tweets.author_id.unique()
    #users = users.loc[users.user_id.isin(user_list)].reset_index(drop = True)
        
    # id to string
    users["user_id"] = users["user_id"].astype(str)
        
    # from string to datetime
    tweets["created_at"] = pd.to_datetime(tweets["created_at"], utc = True)
        
    # Not all users enable their location when tweeting, so
    # we need to check if there are available locations for
    # the tweets returned.
    if "places" in json_tweets['includes'].keys():
        # If the field exists, create a dataframe with the corresponding data
        places = pd.json_normalize(json_tweets['includes']['places']).rename(columns = {"id":"geo_place_id"})
            
        # Drop cols
        #tweets = tweets.drop(['referenced_tweets','edit_history_tweet_ids','geo.place_id'], axis = 1)
        tweets = tweets.drop(['referenced_tweets','edit_history_tweet_ids'], axis = 1)
        return tweets, users, places
        
    else:
        # Drop cols
        tweets = tweets.drop(['referenced_tweets','edit_history_tweet_ids'], axis = 1)
        return tweets, users

In [8]:
# Check if we have tweet's location
if "places" in search_tweet['includes'].keys():
    main_tweets, main_users, main_places = create_dataframes(search_tweet, today)
    
else:
    main_tweets, main_users = create_dataframes(search_tweet, today)
    main_places = pd.DataFrame()
    
main_tweets.head(3)

Unnamed: 0,tweet_id,created_at,possibly_sensitive,lang,author_id,text,geo_place_id,type,ref_tweet_id
0,1587626896996564993,2022-11-02 02:05:23+00:00,False,en,1154111700569853955,England’s provisional 55-man World Cup squad r...,,,
1,1587626894668705794,2022-11-02 02:05:23+00:00,False,en,1491848393613885443,RT @brfootball: Diego Forlán produced one of t...,,retweeted,1.5874634314683228e+18
2,1587626891090964480,2022-11-02 02:05:22+00:00,False,en,1524886624978739200,RT @Bybit_NFT: 🔥 We are so excited to announce...,,retweeted,1.5874822066073928e+18


In [9]:
print(main_tweets.shape)
print(main_users.shape)
#print(main_places.shape)

(99, 9)
(97, 4)


### 2.1 Request more data
The `search_tweets` function was build to only resquest tweets one time, nevertheless, with the `next_token` parameter we can easily request more data. This parameter indicates that there are more "pages" or more results (tweets) that matches the query it was previously sent to Twitter API. If the `next_token` parameter is found in the returned dictionary, then it means there are more results than the ones first returned. If this parameter is missing, then there are no more tweets regarding this topic.

In [11]:
# for i in range(1, 16) # ~750
# for i in range(1, 21): # ~1000
for i in range(1, 41):
    
    # Check if there is a next token (another page)
    # that matches the desired query
    if 'next_token' in search_tweet['meta'].keys():
        print(i, search_tweet["meta"]["next_token"])

        # Collect data from next token
        new_tweets = search_tweets(query = query, next_token = search_tweet['meta']['next_token'])
        search_tweet = new_tweets

        # Check if any tweet has enabled the location,
        # so we can create the places dataframe.
        if "places" in search_tweet['includes'].keys():
            tweets, users, places = create_dataframes(search_tweet, today = today)

            # Append data to main tweets
            main_tweets = main_tweets.append(tweets)
            main_users = main_users.append(users)
            main_places = main_places.append(places)

            # Reset index
            main_tweets = main_tweets.reset_index(drop = True)
            main_users = main_users.reset_index(drop = True)
            main_places = main_places.reset_index(drop = True)

        # If any tweet has its location enabled, then only
        # create the other two dataframes.
        else: 
            tweets, users = create_dataframes(search_tweet, today = today)

            # Append data to main tweets
            main_tweets = main_tweets.append(tweets)
            main_users = main_users.append(users)

            # Reset index
            main_tweets = main_tweets.reset_index(drop = True)
            main_users = main_users.reset_index(drop = True)

    # If there are not more results regarding the
    # requested topic, then just stop requesting 
    # more data.
    else:
        break

1 b26v89c19zqg8o3fpzeme6kppzsjq765fvjcwvenluf0d
2 b26v89c19zqg8o3fpzeme6kpoi8ax7kgivprx5rohcwvx
3 b26v89c19zqg8o3fpzeme6kpoh5xczn1us9ozjo6xiqv1
4 b26v89c19zqg8o3fpzeme6kpmzm1hkqfb4nkwplxf7531
5 b26v89c19zqg8o3fpzeme6kpmyjgcuchobpscr0pns2v1
6 b26v89c19zqg8o3fpzeme6kplgrx0ao1vlnweuse0thml
7 b26v89c19zqg8o3fpzeme6kplfpf5qoxa1nsq63eav9ml
8 b26v89c19zqg8o3fpzeme6kpjy5j99mwdqw9e83e2kr5p
9 b26v89c19zqg8o3fpzeme6kpjwvbsptm9qa1fxagdi4jh
10 b26v89c19zqg8o3fpzeme6kpif3kw9n6ze0no21bwkbr1
11 b26v89c19zqg8o3fpzeme6kpgx4avw8egst546bi3oqd9
12 b26v89c19zqg8o3fpzeme6kpgvue87h5mu4588q0xjfr1
13 b26v89c19zqg8o3fpzeme6kf25chjbtfvpsm86hbzeum5
14 b26v89c19zqg8o3fpzeme6kf24a1tn0i2gz1hjg96xm65
15 b26v89c19zqg8o3fpzeme6kf0mq4v1d1qfriyunm0xxtp
16 b26v89c19zqg8o3fpzeme6kf0lnmzf01e3sw7w7hvd6rh
17 b26v89c19zqg8o3fpzeme6kez43lpkq5qc9y97w2e8y2l
18 b26v89c19zqg8o3fpzeme6kez31aailvw193f7n4evtz1
19 b26v89c19zqg8o3fpzeme6kexlh90ts3anabs7whjlxfh
20 b26v89c19zqg8o3fpzeme6kexkes8927fhkj4l76thg1p
21 b26v89c19zqg8o3fpzeme6kew2

In [12]:
print(main_tweets.shape)
print(main_users.shape)
print(main_places.shape)

(4093, 11)
(3789, 5)
(11, 4)


In [13]:
# Store the data locally
if main_places.empty:
    main_tweets.to_csv(f"/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/tweets/tweets_{today}.csv",
                       index = False)
    main_users.to_csv(f"/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/users/users_{today}.csv",
                      index = False)
    
else:
    main_tweets.to_csv(f"/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/tweets/tweets_{today}.csv",
                       index = False)
    main_users.to_csv(f"/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/users/users_{today}.csv",
                      index = False)
    main_places.to_csv(f"/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/places/places_{today}.csv",
                       index = False)

### 2.2 Clean data
Before implementing the algorithm, we should start by cleaning and pre-processing our data, in this case, the papers csv is already loaded. The pre-processing phase includes the following steps and it's performed with help of the `PreProcessor` class:

- **Remove noise:** Noise removal is about removing characters digits and pieces of text that can interfere with text analysis. Noise removal is one of the most essential text preprocessing steps.


- **Normalize text:** Text normalization is the process of transforming a text into a canonical (standard) form. For example, the word “gooood” and “gud” can be transformed to “good”, its canonical form. 


- **Tokenization:** Tokenization is a way of separating a piece of text into smaller units called tokens. In this case tokens are words (but can also be characters or subwords).


- **Stemming:** Stemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words (known as a lemma).


- **Lemmatization:** Lemmatization is a method responsible for grouping different inflected forms of words into the root form, having the same meaning. It is similar to stemming.

In [14]:
class PreProcessor:
    
    def __init__(self, regex_dict = None):
        
        # creating classes
        # stem
        self.sb = nltk.stem.SnowballStemmer('english')
        
        # lemmatize
        self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        
        # translate
        self.translator = Translator()
        
        # declare a default regex dict
        self.default_regex_dict = {'goo[o]*d':'good', '2morrow':'tomorrow', 'b4':'before', 'otw':'on the way',
                                   'idk':"i don't know", ':)':'smile', 'bc':'because', '2nite':'tonight',
                                   'yeah':'yes', 'yeshhhhhhhh':'yes', ' yeeeee':'yes', 'btw':'by the way', 
                                   'fyi':'for your information', 'gr8':'great', 'asap':'as soon as possible', 
                                   'yummmmmy':'yummy', 'gf':'girlfriend', 'thx':'thanks','nowwwwwww':'now', 
                                   ' ppl ':' people ', 'yeiii':'yes'}
        
        # if no regex_dict defined by user, then use 
        # one by default. Else, concat two regex dicts
        if regex_dict:            
            self.regex_dict = {**regex_dict, **default_regex_dict}
            
        else:
            self.regex_dict = self.default_regex_dict
    
    def translate_twt(self, pdf):
    
        """
        This function helps to translate a tweet from any 
        language to English.

        Inputs:
            - pdf: Pandas dataframe. This dataframe must have
               the following columns:
                - lang: Tweet's language.
                - clean_tweet: Partially pre-processed tweet.

        Outputs: Translated tweet from any language available 
                 in googletrans api to English.
        """

        # Check if the language of the tweet is either undefined or English
        # to avoid translation.
        if pdf["lang"] == "und" or pdf["lang"] == "en":
            pdf["translated_tweet"] = pdf["clean_tweet"]

        # Check if tweet is in Hindi. The code of Hindi language is "hi", but 
        # Twitter has defined the code as "in".
        elif pdf["lang"] == "in":
            pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src = "hi", dest = "en").text
            
        # Check if tweet is in Chinese. 
        # The api supports simplified and traditional chinese.
        elif pdf["lang"] == "zh":
            pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src = "zh-cn", dest = "en").text

        # For any other language the translator should work just fine, so the
        # api should work with the language detected by Twitter.
        else:
            try:
                pdf["translated_tweet"] = self.translator.translate(pdf["clean_tweet"], src = pdf["lang"], 
                                                                    dest = "en").text
            except (TypeError, ValueError):
                pdf["translated_tweet"] = pdf["clean_tweet"]
                
        return pdf["translated_tweet"]

    
    def removeNoise(self, pdf):
        
        """
        Function to remove noise from strings. 
        
        Inputs: A pandas dataframe with raw strings of length n.
        
        Output: A clean string where elements such as accented 
        words, html tags, punctuation marks, and extra white 
        spaces will be removed (or transform) if it's the case.
        """
        
        # to lower case
        pdf["clean_tweet"] = pdf.text.apply(lambda x: x.lower())
        
        # remove accented characters from string
        # e.g. canción --> cancion
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: unidecode.unidecode(x))
        
        # remove html tags 
        pdf["clean_tweet"] = pdf.clean_tweet.str.replace(r'<[^<>]*>', '', regex = True)
        
        # remove (match with) usernames | hashtags | punct marks | links
        # punct marks = ",.':!?;
        # do not remove: ' 
        # but remove: "
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x:' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([-.,:_;])|(https?:\/\/.*[\r\n]*)",
                                                                            "", x).split()).replace('"',''))
                
        # remove white spaces at the begining and at 
        # the end of a string
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: x.lstrip(' '))
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: x.rstrip(' '))
        
        # Translate tweet
        pdf["clean_tweet"] = pdf.apply(lambda x: self.translate_twt(x) \
                                       if pd.isnull(x.clean_tweet) == False else x, axis = 1)
        
        #pdf["clean_tweet"] = pdf.apply(lambda x: self.translate_twt(x) \
        #                               if (pd.isnull(x.clean_tweet) == False \
        #                                   and x.clean_tweet != "") else x, axis = 1)    
        
        # normalize string
        # normalize accented charcaters and other strange characters
        # NFKD if there are accented characters (????
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: unicodedata.normalize('NFKC', x)\
                                                   .encode('ASCII', 'ignore').decode("utf-8")\
                                                   if (pd.isnull(x) == False and x != "") else x)
        
        return pdf
    
    
    def textNormalization(self, pdf):
        """
        Function to normalize a string. 
        
        Inputs: A pandas dataframe with strings (of length n) that 
        will be normalized. 
        
        Outputs: A normalized string whitout noise, words in their
        (expected) correct form and with no stopwords.
        """
        
        # remove noise first
        pdf = self.removeNoise(pdf)

        # expand contractions
        # e.g. don't --> do not
        pdf['clean_tweet'] = pdf.clean_tweet.apply(lambda x: contractions.fix(x))
 
        # Normalize words
        pdf['clean_tweet'] = pdf.clean_tweet.replace(self.regex_dict)
                
        # get English stopwords    
        stop_words = stopwords.words('english')
        stopwords_dict = Counter(stop_words)
        
        # remove stopwords from string
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: ' '.join([word for word in x.split()
                                                                       if word not in stopwords_dict]))
            
        return pdf
    
    
    def wordTokenize(self, pdf):
        """
        Function to tokenize a string into words. Tokenization is a way 
        of separating a piece of text into smaller units called tokens.
        In this case tokens are words (but can also be characters or 
        subwords).
        
        Inputs: A pandas dataframe with strings (of length n) that will be tokenized. 
        
        Outputs: A list of tokenized words.
        """
        # string normalized
        #normalized = self.textNormalization(string)
        pdf = self.textNormalization(pdf)
        
        # Use word_tokenize method to split the string
        # into individual words. By default it returns
        # a list.
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: nltk.word_tokenize(x))        
        
        # Using isalpha() will help us to only keep
        # items from the alphabet (no punctuation
        # marks). 
        #pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [word for word in x if word.isalpha()])
        
        # Keep only unique elements
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: list(set(x)))

        # return list of tokenized words by row
        return pdf
    
    def phraseTokenize(self, pdf):
        
        """
        Function to tokenize a string into sentences. Tokenization is
        a way of separating a piece of text into smaller units called
        tokens. In this case tokens are phrases (but can also be words,
        characters or subwords).
        
        Inputs: A string (of length n) that will be tokenized. 
        
        Outputs: A list of tokenized sentences.
        """
        
        # pandas dataframe with strings normalized
        pdf = self.textNormalization(pdf)
        
        # Use sent_tokenize method to split the string
        # into sentences. By default it returns a list.
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: nltk.sent_tokenize(x))   
        
        return pdf 
    
    
    def stemWords(self, pdf):
        
        """
        Function to stem strings. Stemming is the process of reducing
        a word to its word stem that affixes to suffixes and prefixes 
        or to the roots of words (known as a lemma).
        
        Inputs: A raw string of length n.
        
        Output: Roots of each word of a given string.
        """
        
        # pandas dataframe with strings normalized
        pdf = self.textNormalization(pdf)
        
        # tokenized string (into words)
        pdf = self.wordTokenize(data)
            
        # reduct words to its root    
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [self.sb.stem(word) for word in x])
        
        return pdf
    
    
    def lemmatizeWords(self, pdf):
        
        """
        Function to lemmatize strings. Lemmatization is a method 
        responsible for grouping different inflected forms of 
        words into the root form, having the same meaning. It is 
        similar to stemming.
        
        Inputs: A raw string of length n.
        
        Output: Roots of each word of a given string (with better
        performance than in stemming).
        """
        unw_chars = ["(", ")", "[", "]"]
        
        # pandas dataframe with strings normalized
        pdf = self.textNormalization(pdf)
        
        # list of tokenized words (from string)
        # Here it was decided to tokenize by words
        # rather than by sentences due to it might
        # be easier to find the correct roots
        # of each word
        pdf = self.wordTokenize(pdf)
        
        # lematize word from list of tokenized words
        #lematized = [self.lemmatizer.lemmatize(word) for word in tokenized]
        pdf["clean_tweet"] = pdf.clean_tweet.apply(lambda x: [self.lemmatizer.lemmatize(word) 
                                                              for word in x if word not in unw_chars])
        
        return pdf

In [15]:
# update values
main_tweets.loc[main_tweets["possibly_sensitive"] == False, "possibly_sensitive"] = 0
main_tweets.loc[main_tweets["possibly_sensitive"] == True, "possibly_sensitive"] = 1

# get unique tweets ids from referenced tweets that were retweeted
#ref_tweets = main_tweets.ref_tweet_id.unique().tolist()
ref_tweets = main_tweets[main_tweets["type"] == "retweeted"].ref_tweet_id.unique()

# get unique tweets ids from original tweets
og_tweets = main_tweets.tweet_id.unique().tolist()

# get unique tweets that have been referenced
# i.e., retweeted, quoted, etc
both = [i for i in ref_tweets if i in og_tweets]

# generate a new dataframe without tweets that have referenced other tweets
# in order to avoid extra processing when cleaning the tweets
sample_df = main_tweets[~main_tweets["ref_tweet_id"].isin(og_tweets)]

# keep necessary columns
sample_df = sample_df[["tweet_id", "text", "lang"]].reset_index(drop = True)
print(sample_df.shape)
sample_df.head(2)

(3939, 3)


Unnamed: 0,tweet_id,text,lang
0,1587626896996564993,England’s provisional 55-man World Cup squad r...,en
1,1587626894668705794,RT @brfootball: Diego Forlán produced one of t...,en


In [16]:
# Create class object
pre_processor = PreProcessor()

# Clean data and only keep the roots of each word.
sample_df = pre_processor.lemmatizeWords(sample_df)
sample_df.head()

Unnamed: 0,tweet_id,text,lang,clean_tweet
0,1587626896996564993,England’s provisional 55-man World Cup squad r...,en,"[welbeck, brighton, newcastle, pair, striker, ..."
1,1587626894668705794,RT @brfootball: Diego Forlán produced one of t...,en,"[2010, individual, rt, forlan, ever, mast, die..."
2,1587626891090964480,RT @Bybit_NFT: 🔥 We are so excited to announce...,en,"[drop, 100, next, worth, rt, celebration, anno..."
3,1587626890419904514,@AFC_Fazeel ENGLAND HAVE NO CHANCE IN THIS WOR...,en,"[win, needed, cuptheir, league, idea, intensit..."
4,1587626877472374784,I’ve filled out the World Cup 2022 bracket for...,en,"[win, bracket, filled, 20000, chance, cup, I, ..."


In [17]:
# merge main dataframe with sample df to get the clean tweet
main_tweets = main_tweets.merge(sample_df[["tweet_id", "clean_tweet"]],
                                how = "left", on = "tweet_id")
print(main_tweets.shape)
main_tweets.head(2)

(4093, 12)


Unnamed: 0,tweet_id,created_at,possibly_sensitive,lang,author_id,text,geo_place_id,type,ref_tweet_id,withheld.copyright,withheld.country_codes,clean_tweet
0,1587626896996564993,2022-11-02 02:05:23+00:00,0,en,1154111700569853955,England’s provisional 55-man World Cup squad r...,,,,,,"[welbeck, brighton, newcastle, pair, striker, ..."
1,1587626894668705794,2022-11-02 02:05:23+00:00,0,en,1491848393613885443,RT @brfootball: Diego Forlán produced one of t...,,retweeted,1.5874634314683228e+18,,,"[2010, individual, rt, forlan, ever, mast, die..."


In [19]:
if "withheld.copyright" and "withheld.country_codes" in main_tweets.columns:
    main_tweets = main_tweets.drop(["withheld.copyright", "withheld.country_codes"], axis = 1)
    print(main_tweets.columns)

Index(['tweet_id', 'created_at', 'possibly_sensitive', 'lang', 'author_id',
       'text', 'geo_place_id', 'type', 'ref_tweet_id', 'clean_tweet'],
      dtype='object')


In [20]:
for tweet in both:
    # get the clean tweet from the original tweet
    clean_tweet = str(list(main_tweets.loc[(main_tweets["tweet_id"] == tweet), "clean_tweet"])[0])
    
    # assign the clean tweet to the "clean_tweet" column
    main_tweets.loc[(main_tweets["ref_tweet_id"] == tweet), "clean_tweet"] = clean_tweet

# delete rows where text couldn't be cleanned
main_tweets = main_tweets.dropna(subset = ["clean_tweet"]).shape
main_tweets.head()

Unnamed: 0,tweet_id,created_at,possibly_sensitive,lang,author_id,text,geo_place_id,type,ref_tweet_id,clean_tweet
0,1587626896996564993,2022-11-02 02:05:23+00:00,0,en,1154111700569853955,England’s provisional 55-man World Cup squad r...,,,,"[welbeck, brighton, newcastle, pair, striker, ..."
1,1587626894668705794,2022-11-02 02:05:23+00:00,0,en,1491848393613885443,RT @brfootball: Diego Forlán produced one of t...,,retweeted,1.5874634314683228e+18,"[2010, individual, rt, forlan, ever, mast, die..."
2,1587626891090964480,2022-11-02 02:05:22+00:00,0,en,1524886624978739200,RT @Bybit_NFT: 🔥 We are so excited to announce...,,retweeted,1.5874822066073928e+18,"[drop, 100, next, worth, rt, celebration, anno..."
3,1587626890419904514,2022-11-02 02:05:22+00:00,0,en,218768171,@AFC_Fazeel ENGLAND HAVE NO CHANCE IN THIS WOR...,,replied_to,1.5870042607047844e+18,"[win, needed, cuptheir, league, idea, intensit..."
4,1587626877472374784,2022-11-02 02:05:18+00:00,0,en,1555016766774181888,I’ve filled out the World Cup 2022 bracket for...,,,,"[win, bracket, filled, 20000, chance, cup, I, ..."


In [21]:
# Store the data locally
main_tweets.to_csv(f'/home/bmartin/Documents/github_repos/tweets_nlp_visualizations/data/clean_tweets/clean_tweets_{today}.csv', index = False)