In [49]:
# import necessary packages
import os
import glob

import pandas as pd

from nlp_pipeline import *
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from textblob import TextBlob

## Data Cleaning
Before analysis, the tweet text must be processed to limit the number of features to be input into the model as well as transform the text to an optimal format for the model to derive semantic value.

First I will clean each of the dataframes to include only those related to the products and/or event in question.

<font color='red'>Things to come back and add (running list)</font>  
<li><font color='red'>Remove hashtags and @, including characters and text (like usernames)</font></li>
<li><font color='red'>Remove URLs --> remove string contains 'http'</font></li>

In [28]:
# read in dataframe containing tweets of user accounts
user_dfs = []

path_to_json = '/Users/baka_brooks/Documents/metis-projects/project-04/data/user_tweets/'
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)

for file in file_list:
    data = pd.read_json(file)
    user_dfs.append(data)
    
user_tweets = pd.concat(user_dfs)
user_tweets.reset_index(inplace=True, drop=True)

The data pulled from the media networks and influencers in the tech industry include the last 4000 tweets from each account. To clean this data I will remove retweets and only include tweets within the time range of interest.

In [29]:
# filter retweets
no_retweets = user_tweets[~user_tweets['text'].str.contains("RT")]
no_retweets.shape

(69958, 8)

The tweets pulled based on the related hashtags already only include the tweets surrounding the topics/interest areas I am looking for. I will read in the data and combine with the user tweets, and clean further.

In [30]:
# filter user tweets to only include those directly before and after the event
users_filtered = no_retweets[no_retweets['created_at'] >= '2020-02-01']
users_filtered.shape

(10784, 8)

In [31]:
# read in dataframe containing tweets around event hashtags
hashtag_dfs = []

path_to_json = '/Users/baka_brooks/Documents/metis-projects/project-04/data/hashtag_tweets/'
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)

for file in file_list:
    data = pd.read_json(file)
    hashtag_dfs.append(data)
    
hashtag_tweets = pd.concat(hashtag_dfs)
hashtag_tweets.reset_index(inplace=True, drop=True)

In [32]:
print(hashtag_tweets.shape)
hashtag_tweets.head()

(14261, 8)


Unnamed: 0,id_str,created_at,screen_name,in_reply_to_status_id_str,in_reply_to_screen_name,favorite_count,retweet_count,text
0,1230537551418658816,2020-02-20 16:59:49+00:00,KenistonHeather,,,12,1,#samsungmembers #loveforgalaxy #unpacked #with...
1,1230465254037037056,2020-02-20 12:12:32+00:00,lunchtimetecht1,,,1,1,Ep. 52: @Apple and @SamsungMobileUS Events and...
2,1230194279554977792,2020-02-19 18:15:46+00:00,OmniGrandiose,1.230062e+18,TPfupa,2,0,@TPfupa @SamsungMobileSA @AudreyMoeng @Samsung...
3,1230144195362525184,2020-02-19 14:56:45+00:00,TheCapeTownGuy,,,34,6,"Samsung announced the Galaxy Z Flip, their new..."
4,1230053630788530176,2020-02-19 08:56:53+00:00,Samsung_CafeBTM,,,0,0,Get ready for the #GalaxyS20? Ortis Deley gets...


In [33]:
hashtag_tweets.created_at.min(), hashtag_tweets.created_at.max()

(Timestamp('2020-02-11 03:48:15+0000', tz='UTC'),
 Timestamp('2020-02-21 02:38:02+0000', tz='UTC'))

The tweets gathered via the hashtag range from February 11th to February 21st. This range is the day the event occurred, until the date the data was gathered.

### Merge dataframes and clean

In [34]:
# merge dataframes
df = pd.concat([users_filtered, hashtag_tweets], ignore_index=True)
print(df.shape)
df.head()

(25045, 8)


Unnamed: 0,id_str,created_at,screen_name,in_reply_to_status_id_str,in_reply_to_screen_name,favorite_count,retweet_count,text
0,1230661597519138816,2020-02-21 01:12:44+00:00,MKBHD,1.230659e+18,AlexRCamacho1,851,7,@AlexRCamacho1 By not shipping it
1,1230648790526386176,2020-02-21 00:21:50+00:00,MKBHD,,,8204,297,I ordered the Escobar Fold 1.\nNever got it.\n...
2,1230581789619519488,2020-02-20 19:55:36+00:00,MKBHD,1.230581e+18,harshb_,25,1,@harshb_ @SuperSaf @beebomco @howtomen @verge ...
3,1230566950989942784,2020-02-20 18:56:38+00:00,MKBHD,1.230534e+18,AlijahSimon,170,2,@AlijahSimon @jon_prosser Android n customizat...
4,1230535104214294528,2020-02-20 16:50:05+00:00,MKBHD,1.230534e+18,andrewmartonik,245,2,@andrewmartonik It’s possible my scale isn’t p...


I noticed in the `text` column there are newline characters, '\n'. These need to be removed before moving on to pre-processing.

In [35]:
# remove newline characters
df = df.replace('\n','', regex=True)

In [36]:
# remove duplicates
df = df.drop_duplicates()
df.shape

(23375, 8)

In [37]:
# export cleaned data to a pickle file
df.to_pickle("original_data.pkl")

## Data Pre-Processing
Now that the data is cleaned, I can build a corpus from the tweet text and use the NLP pipeline I created to clean, tokenize, stem, and vectorize the text.

In [38]:
# extract the corpus from the tweet dataframe
text = df['text']

In [39]:
text.head()

0                    @AlexRCamacho1 By not shipping it
1    I ordered the Escobar Fold 1.Never got it.I or...
2    @harshb_ @SuperSaf @beebomco @howtomen @verge ...
3    @AlijahSimon @jon_prosser Android n customizat...
4    @andrewmartonik It’s possible my scale isn’t p...
Name: text, dtype: object

### Process characters
The first step in processing the tweets is to clean the text. This will be done with the following steps:
- Remove numbers.
- Remove special characters, like '\n'.
- Remove punctuation.
- Convert to all lowercase letters.
- Remove double spaces.  

Once each tweet is cleaned, the text can be stemmed and tokenized to be ready for modeling.

In [40]:
def process_text(docs):
    """
    ---
    :param tweets: Pandas series of texts to pre-process.
    :return: Pandas series of cleaned text.
    """
    text_nonum = lambda x: re.sub(r'\d+', '', x)
    text_nonewline = lambda x: re.sub('\n', '', x)
    text_nopunct = lambda x: ''.join([char for char in x if char not in string.punctuation])
    text_lower = lambda x: x.lower()
    text_nodoublespace = lambda x: re.sub('\s+', ' ', x).strip()
    
    return docs.map(text_nonum).map(text_nonewline).map(text_nopunct).map(text_lower).map(text_nodoublespace)

In [41]:
text_clean = process_text(df['text'])
text_clean

0                          alexrcamacho by not shipping it
1        i ordered the escobar fold never got iti order...
2        harshb supersaf beebomco howtomen verge cnet m...
3        alijahsimon jonprosser android n customization...
4        andrewmartonik it’s possible my scale isn’t pe...
                               ...                        
25038    today galaxy s will be presented that means my...
25039    galaxys leaks right now ​ httpstcosvcgaxfqa vi...
25040    download links unlocked for everyone samsungev...
25042    samsung galaxy s and s plus launching todaycli...
25044    in honor of the galaxys launch heres a few pho...
Name: text, Length: 23375, dtype: object

In [42]:
# input text into original dataframe and export cleaned dataframe
cleaned_df = df.copy()
cleaned_df['text'] = text_clean

cleaned_df.to_pickle("corpus.pkl")

### Stemming & Tokenization
The next step is to stem each character in each tweet to its root word and to tokenize each tweet by word. During the tokenization phase, stop words will also need to be removed in order to maximize semantic value.

In [43]:
# apply the tokenizer to each tweet individually
porter_stemmer = PorterStemmer()
tweet_tokenizer = TweetTokenizer()

cleaned_text = []
for tweet in text_clean:
    cleaned_words = []
    for word in tweet_tokenizer.tokenize(tweet):
        stemmed_word = porter_stemmer.stem(word)
        cleaned_words.append(stemmed_word)
    cleaned_text.append(' '.join(cleaned_words))

In [44]:
cleaned_text[:10]

['alexrcamacho by not ship it',
 'i order the escobar fold never got iti order the escobar fold never got it would not recommend',
 'harshb supersaf beebomco howtomen verg cnet mrwhosetheboss androidcentr sundarpichai android … httpstcoayplprqq',
 'alijahsimon jonpross android n custom n stuff',
 'andrewmartonik it ’ s possibl my scale isn ’ t perfect ¯ ツ ¯',
 'arnaudducouret same weight',
 'andrewmartonik they ’ re actual both gram just weigh them',
 's ultra in the hous next to an iphon pro max that a big boi httpstcomvhxerw',
 'dnut teslarati shiagif',
 'helpertesla teslarati 😭']

In [55]:
type(cleaned_text)

list

### Document-Term Matrix
Now that the tweets are fully processed, I will create a document-term matrix. I will create a count vector as well as a TF-IDF vector and compare performance of each model.

#### Count Vectorizer

In [53]:
labels = [tweet[:30]+'...' for tweet in cleaned_text]

countVectorizer = CountVectorizer(stop_words='english')
doc_word_count = countVectorizer.fit_transform(cleaned_text)
dtm_count = pd.DataFrame(doc_word_count.toarray(), index=labels, columns=countVectorizer.get_feature_names())
dtm_count.head(10)

Unnamed: 0,aa,aaaaa,aaaaah,aaaaand,aaaawwww,aadmi,aakarsha,aamaadmiparti,aapkidilli,aapl,...,𝗮𝘄𝗮𝘆,𝗳𝗮𝗰𝘂𝗹𝘁𝘆,𝗳𝗹𝘂𝘅,𝗶𝘀,𝗻𝗼𝘁,𝘁𝗮𝗸𝗲𝘀,𝘁𝗵𝗮𝘁,𝘂𝘀𝗲𝗱,𝙃enlivenphtoalltheboy,𝙒𝙍𝙄enlivenphthirdlookat
alexrcamacho by not ship it...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
i order the escobar fold never...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
harshb supersaf beebomco howto...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alijahsimon jonpross android n...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
andrewmartonik it ’ s possibl ...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
arnaudducouret same weight...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
andrewmartonik they ’ re actua...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s ultra in the hous next to an...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dnut teslarati shiagif...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
helpertesla teslarati 😭...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# export document-term matrix to pickle file
dtm_count.to_pickle("dtm_count.pkl")

#### TF-IDF Vectorizer

In [21]:
tfidfVectorizer = TfidfVectorizer(stop_words='english')
doc_word_tfidf = tfidfVectorizer.fit_transform(cleaned_text)
dtm_tfidf = pd.DataFrame(doc_word_tfidf.toarray(), index=labels, columns=tfidfVectorizer.get_feature_names())
dtm_tfidf.head(10)

Unnamed: 0,aa,aaaaa,aaaaah,aaaaand,aaaawwww,aadmi,aakarsha,aamaadmiparti,aapkidilli,aapl,...,𝗮𝘄𝗮𝘆,𝗳𝗮𝗰𝘂𝗹𝘁𝘆,𝗳𝗹𝘂𝘅,𝗶𝘀,𝗻𝗼𝘁,𝘁𝗮𝗸𝗲𝘀,𝘁𝗵𝗮𝘁,𝘂𝘀𝗲𝗱,𝙃enlivenphtoalltheboy,𝙒𝙍𝙄enlivenphthirdlookat
alexrcamacho by not ship it...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i order the escobar fold never...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harshb supersaf beebomco howto...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alijahsimon jonpross android n...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
andrewmartonik it ’ s possibl ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arnaudducouret same weight...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
andrewmartonik they ’ re actua...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s ultra in the hous next to an...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dnut teslarati shiagif...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
helpertesla teslarati 😭...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# export document-term matrix to pickle file
dtm_tfidf.to_pickle("dtm_tfidf.pkl")