In [9]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [11]:
# Fetch the NLTK resources used by the preprocessing step: the stop-word
# list, the WordNet data for the lemmatizer, and the Open Multilingual
# WordNet package.
for resource_name in ("stopwords", "wordnet"):
    nltk.download(resource_name)
# Kept as the cell's final expression so its boolean result is still shown.
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to /home/arnold/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/arnold/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/arnold/nltk_data...


True

In [3]:
# Search string handed to the scraper: insurance-related keywords, retweets
# excluded, geo-restricted to 30 km around Nairobi, English only, 2020-2022.
query = (
    "(insurance OR insured OR insurer OR underwriter OR insure) "
    "-is:retweet near:Nairobi within:30km lang:en "
    "since:2020-01-01 until:2022-12-31"
)
# Accumulator for the scraped rows.
tweets_list = list()

In [4]:
# Run the search and keep, for every returned tweet, the four fields used
# downstream: [date, id, rawContent, username].
#
# The list is rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot duplicate rows collected by an earlier run.
# The previously unused enumerate() index has been dropped.
tweets_list = [
    [tweet.date, tweet.id, tweet.rawContent, tweet.user.username]
    for tweet in sntwitter.TwitterSearchScraper(query).get_items()
]

In [5]:
# Turn the scraped rows into a labelled table, one row per tweet.
insurance_list = pd.DataFrame(
    data=tweets_list,
    columns=["Date", "Id", "Content", "Username"],
)
# Non-null count per column, as a quick completeness check.
insurance_list.count()

Date        2860
Id          2860
Content     2860
Username    2860
dtype: int64

In [6]:
# Make sure "Date" holds pandas datetimes, then order the tweets from
# oldest to newest.
insurance_list["Date"] = pd.to_datetime(insurance_list["Date"])
# Re-assign the sorted frame instead of sorting with inplace=True: the
# result is the same, but the cell stays a plain assignment and the sorted
# frame can be chained further if needed.
insurance_list = insurance_list.sort_values(by="Date", ascending=True)
insurance_list.head()

Unnamed: 0,Date,Id,Content,Username
2859,2020-01-02 17:30:13+00:00,1212788197161406469,"@MarvinGakunyi It has alot of humility in it, ...",dougladjuma
2858,2020-01-02 19:54:08+00:00,1212824415848804356,"HEALTHCARE FINANCING, @nhifkenya, ENHANCED NHI...",simonkigondu
2857,2020-01-02 20:23:28+00:00,1212831799052771328,This Cancer Insurance Cover by @icealion is wh...,drthuranira
2856,2020-01-03 11:18:51+00:00,1213057126802870273,@AutoxpressKenya to the rescue ⚒️ 🚗 after Tyre...,PhilipOgola
2855,2020-01-04 07:51:29+00:00,1213367330010488838,I know this may seem mean but I’m finding it R...,Sheilakari


In [14]:
# Tweets preprocessing
#
# Produces two Series aligned with `insurance_list`:
#   * `hashtags`        - list of hashtag names (without '#') per tweet
#   * `cleaned_content` - list of lower-cased, stop-word-free, lemmatized
#                         tokens per tweet

# Drop incomplete rows and duplicate tweets. Duplicates are judged on the
# four scraped columns only: this matches a whole-row comparison on the
# freshly scraped frame, and keeps the cell re-runnable after list-valued
# columns have been added (lists are unhashable, so a whole-row
# drop_duplicates() would raise TypeError).
insurance_list = insurance_list.dropna()
insurance_list = insurance_list.drop_duplicates(
    subset=["Date", "Id", "Content", "Username"]
)

# Removing URLs. Only the URL itself (up to the next whitespace) is removed;
# a greedy `.*` would also delete every word and hashtag following the URL
# on the same line.
cleaned_content = insurance_list["Content"].str.replace(
    r"https?://\S+", "", regex=True
)

# Extracting hashtags. Done after URL removal so that URL fragments
# ("...#section") are not mistaken for hashtags, and before punctuation
# removal so that the '#' marker is still present.
hashtags = cleaned_content.str.findall(r"#(\w+)")

# Inserting a space after a period that directly joins two sentences
# ("end.Start" -> "end. Start") so they are not fused into one token.
cleaned_content = cleaned_content.str.replace(
    r"([a-z])\.([A-Z])", r"\1. \2", regex=True
)

# Normalising whitespace and removing punctuation. Whitespace runs
# (including blank lines) collapse to a single space rather than to
# nothing, so the words on either side are not glued together.
cleaned_content = (
    cleaned_content
    .str.replace(r"\s+", " ", regex=True)
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.strip()
)

# Tokenizing tweets (lower-cased, elongated words shortened).
# NOTE: strip_handles=True has no effect at this point, because the '@'
# prefix was already removed together with the other punctuation above;
# user names therefore remain in the token lists as ordinary words.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
cleaned_content = cleaned_content.apply(tokenizer.tokenize)

# Removing stopwords. `stopwords_en` stays a list; the set is used only for
# constant-time membership tests.
stopwords_en = stopwords.words("english")
stopword_lookup = set(stopwords_en)

cleaned_content = cleaned_content.apply(
    lambda words: [item for item in words if item not in stopword_lookup]
)

# Lemmatizing the content (WordNet lemmatizer with its default
# part-of-speech setting).
wlm = WordNetLemmatizer()
cleaned_content = cleaned_content.apply(
    lambda words: [wlm.lemmatize(word) for word in words]
)
cleaned_content.head()

2859    [marvingakunyi, alot, humility, hope, insuranc...
2858    [healthcare, financing, nhifkenya, enhanced, n...
2857    [cancer, insurance, cover, icealion, call, dis...
2856    [autoxpresskenya, rescue, tyre, burst, tyre, i...
2855    [know, may, seem, mean, im, finding, really, h...
Name: Content, dtype: object

In [15]:
# Attach the derived Series to the tweet table as new columns, in this
# order: token lists first, hashtag lists second.
derived_columns = (
    ("Cleaned Content", cleaned_content),
    ("Hashtags", hashtags),
)
for column_label, column_values in derived_columns:
    insurance_list[column_label] = column_values

In [17]:
# Preview the first five rows, now including the derived columns.
insurance_list.head(n=5)

Unnamed: 0,Date,Id,Content,Username,Cleaned Content,Hashtags
2859,2020-01-02 17:30:13+00:00,1212788197161406469,"@MarvinGakunyi It has alot of humility in it, ...",dougladjuma,"[marvingakunyi, alot, humility, hope, insuranc...",[]
2858,2020-01-02 19:54:08+00:00,1212824415848804356,"HEALTHCARE FINANCING, @nhifkenya, ENHANCED NHI...",simonkigondu,"[healthcare, financing, nhifkenya, enhanced, n...",[UHC]
2857,2020-01-02 20:23:28+00:00,1212831799052771328,This Cancer Insurance Cover by @icealion is wh...,drthuranira,"[cancer, insurance, cover, icealion, call, dis...",[]
2856,2020-01-03 11:18:51+00:00,1213057126802870273,@AutoxpressKenya to the rescue ⚒️ 🚗 after Tyre...,PhilipOgola,"[autoxpresskenya, rescue, tyre, burst, tyre, i...","[SocialGood, DigitalHumanitarian, ItCouldBeYou]"
2855,2020-01-04 07:51:29+00:00,1213367330010488838,I know this may seem mean but I’m finding it R...,Sheilakari,"[know, may, seem, mean, im, finding, really, h...",[]
