In [302]:
# import necessary packages
import os
import glob

import pandas as pd

from nlp_pipeline import *
import spacy
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Data Cleaning
Before analysis, the tweet text must be processed to limit the number of features to be input into the model as well as transform the text to an optimal format for the model to derive semantic value.

First I will clean each of the dataframes to include only those related to the products and/or event in question.

In [303]:
# read in dataframe containing tweets of user accounts
user_dfs = []

path_to_json = '/Users/baka_brooks/Documents/metis-projects/project-04/data/user_tweets/'
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)

for file in file_list:
    data = pd.read_json(file)
    user_dfs.append(data)
    
user_tweets = pd.concat(user_dfs)
user_tweets.reset_index(inplace=True, drop=True)

The data pulled from the media networks and influencers in the tech industry include the last 4000 tweets from each account. To clean this data I will remove retweets and only include tweets within the time range of interest.

In [304]:
# filter retweets
no_retweets = user_tweets[~user_tweets['text'].str.contains("RT")]
no_retweets.shape

(69958, 8)

The tweets pulled based on the related hashtags already only include the tweets surrounding the topics/interest areas I am looking for. I will read in the data and combine with the user tweets, and clean further.

In [305]:
# filter user tweets to only include those directly before and after the event
users_filtered = no_retweets[no_retweets['created_at'] >= '2020-02-01']
users_filtered.shape

(10784, 8)

In [306]:
# read in dataframe containing tweets around event hashtags
hashtag_dfs = []

path_to_json = '/Users/baka_brooks/Documents/metis-projects/project-04/data/hashtag_tweets/'
json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)

for file in file_list:
    data = pd.read_json(file)
    hashtag_dfs.append(data)
    
hashtag_tweets = pd.concat(hashtag_dfs)
hashtag_tweets.reset_index(inplace=True, drop=True)

In [307]:
print(hashtag_tweets.shape)
hashtag_tweets.head()

(14261, 8)


Unnamed: 0,id_str,created_at,screen_name,in_reply_to_status_id_str,in_reply_to_screen_name,favorite_count,retweet_count,text
0,1230537551418658816,2020-02-20 16:59:49+00:00,KenistonHeather,,,12,1,#samsungmembers #loveforgalaxy #unpacked #with...
1,1230465254037037056,2020-02-20 12:12:32+00:00,lunchtimetecht1,,,1,1,Ep. 52: @Apple and @SamsungMobileUS Events and...
2,1230194279554977792,2020-02-19 18:15:46+00:00,OmniGrandiose,1.230062e+18,TPfupa,2,0,@TPfupa @SamsungMobileSA @AudreyMoeng @Samsung...
3,1230144195362525184,2020-02-19 14:56:45+00:00,TheCapeTownGuy,,,34,6,"Samsung announced the Galaxy Z Flip, their new..."
4,1230053630788530176,2020-02-19 08:56:53+00:00,Samsung_CafeBTM,,,0,0,Get ready for the #GalaxyS20? Ortis Deley gets...


In [308]:
hashtag_tweets.created_at.min(), hashtag_tweets.created_at.max()

(Timestamp('2020-02-11 03:48:15+0000', tz='UTC'),
 Timestamp('2020-02-21 02:38:02+0000', tz='UTC'))

The tweets gathered via the hashtag range from February 11th to February 21st. This range is the day the event occurred, until the date the data was gathered.

### Merge dataframes and clean

In [309]:
# merge dataframes
df = pd.concat([users_filtered, hashtag_tweets], ignore_index=True)
print(df.shape)
df.head()

(25045, 8)


Unnamed: 0,id_str,created_at,screen_name,in_reply_to_status_id_str,in_reply_to_screen_name,favorite_count,retweet_count,text
0,1230661597519138816,2020-02-21 01:12:44+00:00,MKBHD,1.230659e+18,AlexRCamacho1,851,7,@AlexRCamacho1 By not shipping it
1,1230648790526386176,2020-02-21 00:21:50+00:00,MKBHD,,,8204,297,I ordered the Escobar Fold 1.\nNever got it.\n...
2,1230581789619519488,2020-02-20 19:55:36+00:00,MKBHD,1.230581e+18,harshb_,25,1,@harshb_ @SuperSaf @beebomco @howtomen @verge ...
3,1230566950989942784,2020-02-20 18:56:38+00:00,MKBHD,1.230534e+18,AlijahSimon,170,2,@AlijahSimon @jon_prosser Android n customizat...
4,1230535104214294528,2020-02-20 16:50:05+00:00,MKBHD,1.230534e+18,andrewmartonik,245,2,@andrewmartonik It’s possible my scale isn’t p...


I noticed in the `text` column there are newline characters, '\n'. These need to be removed before moving on to pre-processing.

In [310]:
# remove newline characters
df = df.replace('\n','', regex=True)

In [311]:
# remove duplicates
df = df.drop_duplicates()
df.shape

(23375, 8)

In [312]:
# export cleaned data to a pickle file
df.to_pickle("original_data.pkl")

## Data Pre-Processing
Now that the data is cleaned, I can build a corpus from the tweet text and use the NLP pipeline I created to clean, tokenize, stem, and vectorize the text.

In [313]:
# extract the corpus from the tweet dataframe
tweets = df['text']

In [314]:
tweets[:5]

0                    @AlexRCamacho1 By not shipping it
1    I ordered the Escobar Fold 1.Never got it.I or...
2    @harshb_ @SuperSaf @beebomco @howtomen @verge ...
3    @AlijahSimon @jon_prosser Android n customizat...
4    @andrewmartonik It’s possible my scale isn’t p...
Name: text, dtype: object

<font color='blue'><h2>TOKENIZATION WITH SPACY FIRST</h2></font>

In [315]:
nlp = spacy.load('en')

# tokenize the documents
tokens = []
for tweet in tweets[:5]:
    doc = nlp(tweet)
    token = [token.text for token in doc]
    tokens.append(token)

In [316]:
tokens    

[['@AlexRCamacho1', 'By', 'not', 'shipping', 'it'],
 ['I',
  'ordered',
  'the',
  'Escobar',
  'Fold',
  '1.Never',
  'got',
  'it',
  '.',
  'I',
  'ordered',
  'the',
  'Escobar',
  'Fold',
  '2.Never',
  'got',
  'it.0/10',
  'would',
  'not',
  'recommend'],
 ['@harshb',
  '_',
  '@SuperSaf',
  '@beebomco',
  '@howtomen',
  '@verge',
  '@CNET',
  '@Mrwhosetheboss',
  '@androidcentral',
  '@sundarpichai',
  '@Android',
  '…',
  'https://t.co/1AyPLPRQQ6'],
 ['@AlijahSimon',
  '@jon_prosser',
  'Android',
  'n',
  'customization',
  'n',
  'stuff'],
 ['@andrewmartonik',
  'It',
  '’s',
  'possible',
  'my',
  'scale',
  'is',
  'n’t',
  'perfect',
  ' ',
  '¯\\_(ツ)_/¯']]

<font color='red'><h2>TweetTokenizer</h2></font>

In [317]:
tknzr = TweetTokenizer(strip_handles=True)

In [318]:
tokens = []
for tweet in tweets[:5]:
    tokens.append(tknzr.tokenize(tweet.strip()))

In [319]:
tokens

[['By', 'not', 'shipping', 'it'],
 ['I',
  'ordered',
  'the',
  'Escobar',
  'Fold',
  '1.Never',
  'got',
  'it',
  '.',
  'I',
  'ordered',
  'the',
  'Escobar',
  'Fold',
  '2.Never',
  'got',
  'it',
  '.',
  '0/10',
  'would',
  'not',
  'recommend'],
 ['…', 'https://t.co/1AyPLPRQQ6'],
 ['Android', 'n', 'customization', 'n', 'stuff'],
 ['It',
  '’',
  's',
  'possible',
  'my',
  'scale',
  'isn',
  '’',
  't',
  'perfect',
  '¯',
  '\\',
  '_',
  '(',
  'ツ',
  ')',
  '_',
  '/',
  '¯']]

### Process characters
The first step in processing the tweets is to clean the text. This will be done with the following steps:
- Remove URLs and hyperlinks.
- Remove @ names.
- Remove numbers.
- Remove special characters, like '\n'.
- Remove punctuation.
- Convert to all lowercase letters.
- Remove double spaces.  
- Remove elongated words.
- Remove mentions of the brand and phone names.

For the first round of cleaning, I will deal with some of the more Twitter-specific processing; removing hyperlinks, usernames, and hashtags.

In [320]:
def process_text_one(docs):
    """
    ---
    :param docs: Pandas series of texts to pre-process.
    :return: Pandas series of cleaned text.
    """
    # remove URLs and hyperlinks
    text_nourl = lambda x: re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x)
    # remove @ names
    text_noname = lambda x: re.sub('(@[A-Za-z0-9_]+)', '', x)
    # remove hashtags
    text_nohash = lambda x: re.sub('(#[A-Za-z0-9_]+)', '', x)
    
    return docs.map(text_nourl).map(text_noname).map(text_nohash)

In [321]:
new_text = process_text_one(tweets)
new_text[:10]

0                                   By not shipping it
1    I ordered the Escobar Fold 1.Never got it.I or...
2                                                   … 
3                      Android n customization n stuff
4      It’s possible my scale isn’t perfect  ¯\_(ツ)_/¯
5                                         Same weight!
6     They’re actually both 222 grams, just weighed...
7    S20 Ultra in the house. Next to an iPhone 11 P...
8                                             Shia.gif
9                                                    😭
Name: text, dtype: object

For the second round of text processing I will handle the characters. This includes punctuations, numbers, new line characters, and double spaces. I will also convert all of the text to lowercase to enable simpler processing.

In [322]:
def process_text_two(docs):
    """
    ---
    :param docs: Pandas series of texts to pre-process.
    :return: Pandas series of cleaned text.
    """
    
    # remove numbers
    text_nonum = lambda x: re.sub(r'\d+', '', x)
    # remove the new line character
    text_nonewline = lambda x: re.sub('\n', '', x)
    # remove punctuation
    text_nopunct = lambda x: ''.join([char for char in x if char not in string.punctuation])
    # convert all letters to lowercase
    text_lower = lambda x: x.lower()
    # substitute multiple spaces with single space
    text_nospaces = lambda x: re.sub(r'\s+', ' ', x, flags=re.I)
    # remove all single characters
    text_single = lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x)
    
    return docs.map(text_nonum).map(text_nonewline).map(text_nopunct).map(text_lower).map(text_nospaces).map(text_single)

In [323]:
text_cleaned = process_text_two(new_text)
text_cleaned[:10]

0                                   by not shipping it
1    i ordered the escobar fold never got iti order...
2                                                   … 
3                          android customization stuff
4             it’s possible my scale isn’t perfect ¯ツ¯
5                                          same weight
6        they’re actually both grams just weighed them
7    s ultra in the house next to an iphone pro max...
8                                              shiagif
9                                                    😭
Name: text, dtype: object

Ah, did you think the cleaning was over? Now that the characters have been processed I must account for "unnatural language", AKA elongated words. The beauty of Twitter is that it is also a hotbed of slang and repeating characters, think "aaaaaahhh" and "wooooooow". Before each tweet can be analyzed, I will create a function to handle these cases.

In [324]:
def process_text_three(docs):  
    """
    ---
    :param docs: Pandas series of texts to pre-process.
    :return: Pandas series of cleaned text.
    """
    counter = 0
    ascii_lowercase = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 
                      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    # replace 'aaaaaaaahhhhh' with 'aahh'
    for letter in ascii_lowercase:
        for row_idx, doc in enumerate(docs):
            for word_idx, word in enumerate(doc):
                original_word = word
                while word != word.replace(letter*3, letter*2):
                    word = word.replace(letter*3, letter*2) 
                    docs[row_idx][word_idx] = word
    return docs

In [325]:
text_cleaner = process_text_three(text_cleaned)
text_cleaner[:10]

0                                   by not shipping it
1    i ordered the escobar fold never got iti order...
2                                                   … 
3                          android customization stuff
4             it’s possible my scale isn’t perfect ¯ツ¯
5                                          same weight
6        they’re actually both grams just weighed them
7    s ultra in the house next to an iphone pro max...
8                                              shiagif
9                                                    😭
Name: text, dtype: object

The last step in preparing the data to model is to remove mentions of the brand and phone names. Because I have already removed numerics, I will have to alter my cleaning strategy a bit to remove the correct characters.

In [326]:
def process_text_four(docs):
    """
    ---
    :param docs: Pandas series of texts to pre-process.
    :return: Pandas series of cleaned text.
    """
    stop_words = ['samsung', 'galaxy', 's ', ' s', 'plus', 'ultra', 'z', 'flip', 'unpacked']
    
    new_docs = []
    for doc in docs:
        new_doc = ''.join([word for word in doc if word not in stop_words])
        new_docs.append(new_doc.strip())
    
    return new_docs

In [327]:
text_cleanest = process_text_four(text_cleaner)
text_cleanest[:10]

['by not shipping it',
 'i ordered the escobar fold never got iti ordered the escobar fold never got it would not recommend',
 '…',
 'android customiation stuff',
 'it’s possible my scale isn’t perfect ¯ツ¯',
 'same weight',
 'they’re actually both grams just weighed them',
 's ultra in the house next to an iphone pro max thats big boi',
 'shiagif',
 '😭']

Now that each tweet is cleaned, the text can be stemmed and tokenized to be ready for modeling.

### Tokenization & Lemmatization
The next step is to lemmatize each character in each tweet to its root word and to tokenize each tweet by word. In the next step when creating the document-term matrix, stop words will also need to be removed in order to maximize semantic value.

In [328]:
wordNetLemmatizer = WordNetLemmatizer()
lemmatized_tweets = [wordNetLemmatizer.lemmatize(text) for text in text_cleanest]

In [329]:
# input text into original dataframe and export cleaned dataframe
cleaned_df = df.copy()
cleaned_df['text'] = lemmatized_tweets

cleaned_df.to_pickle("corpus.pkl")

### Document-Term Matrix
Now that the tweets are fully processed, I will create a document-term matrix. I will create a count vector as well as a TF-IDF vector and compare performance of each model.

#### Count Vectorizer

In [333]:
labels = [tweet[:30]+'...' for tweet in cleaned_df.text]

count_vectorizer = CountVectorizer(stop_words='english')
doc_word_count = count_vectorizer.fit_transform(cleaned_df.text)
dtm_count = pd.DataFrame(doc_word_count.toarray(), index=labels, columns=count_vectorizer.get_feature_names())
dtm_count.head(10)

Unnamed: 0,aa,aaaaa,aaaaah,aaaaand,aaaawwww,aadmi,aakarsha,aapl,aaron,aayush,...,𝗮𝗻𝘆,𝗮𝘄𝗮𝘆,𝗳𝗮𝗰𝘂𝗹𝘁𝘆,𝗳𝗹𝘂𝘅,𝗶𝘀,𝗻𝗼𝘁,𝘁𝗮𝗸𝗲𝘀,𝘁𝗵𝗮𝘁,𝘂𝘀𝗲𝗱,𝙒𝙍𝙄
by not shipping it...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
i ordered the escobar fold nev...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
…...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
android customiation stuff...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
it’s possible my scale isn’t p...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
same weight...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
they’re actually both grams ju...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s ultra in the house next to a...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
shiagif...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
😭...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [334]:
# export document-term matrix to pickle file
dtm_count.to_pickle("dtm_count.pkl")

#### TF-IDF Vectorizer

In [335]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
doc_word_tfidf = tfidf_vectorizer.fit_transform(cleaned_df.text)
dtm_tfidf = pd.DataFrame(doc_word_tfidf.toarray(), index=labels, columns=tfidf_vectorizer.get_feature_names())
dtm_tfidf.head(10)

Unnamed: 0,aa,aaaaa,aaaaah,aaaaand,aaaawwww,aadmi,aakarsha,aapl,aaron,aayush,...,𝗮𝗻𝘆,𝗮𝘄𝗮𝘆,𝗳𝗮𝗰𝘂𝗹𝘁𝘆,𝗳𝗹𝘂𝘅,𝗶𝘀,𝗻𝗼𝘁,𝘁𝗮𝗸𝗲𝘀,𝘁𝗵𝗮𝘁,𝘂𝘀𝗲𝗱,𝙒𝙍𝙄
by not shipping it...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i ordered the escobar fold nev...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
android customiation stuff...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
it’s possible my scale isn’t p...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
same weight...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
they’re actually both grams ju...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s ultra in the house next to a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
shiagif...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
😭...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [336]:
# export document-term matrix to pickle file
dtm_tfidf.to_pickle("dtm_tfidf.pkl")