In [46]:
# Import libraries
import csv
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Resolve the notebook's folder, then step into the sibling ../data directory
full_path = os.path.realpath("cleaning_py.ipynb")
path = os.path.dirname(full_path)
filename = os.path.basename(full_path)
os.chdir(path)
os.chdir('../data')

# Read each company's raw tweet dump into a single string
# (the utf-8-sig codec drops a leading byte-order mark when one is present)
with open("GoldmanSachsTweets.txt", mode="r", encoding='utf-8-sig') as f:
    gs_string = f.read()

with open("HomeDepotTweets.txt", mode="r", encoding='utf-8-sig') as f:
    hd_string = f.read()

with open("AppleTweets.txt", mode="r", encoding='utf-8-sig') as f:
    ap_string = f.read()

In [47]:
## Define clean_tweets function to clean tweets

def clean_tweets(text_string, verbose=True):
    """Turn raw tweet text into a list of cleaned, lemmatized tokens.

    Processing order:
      1. lowercase the text;
      2. strip @mentions, #hashtags and http/www links;
      3. blank out ``()!?`` and any ``[...]`` span that sits on one line;
      4. replace every character outside ``a-z`` with a space and split
         on whitespace;
      5. drop NLTK English stop words plus a few tweet-specific ones;
      6. drop tokens that occur exactly once in the remaining text;
      7. lemmatize each token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text_string : str
        Raw text containing one or more tweets.
    verbose : bool, default True
        When True, print the final token list before returning it.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Empty input yields
        an empty list.
    """
    # Lower tweets
    text = text_string.lower()

    # Remove mentions and hashtags
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)

    # Remove links; the dot is escaped so only a literal "www." prefix matches
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)

    # Remove punctuation and bracketed spans
    text = re.sub(r"[()!?]", " ", text)
    text = re.sub(r"\[.*?\]", " ", text)

    # Replace everything that is not a lowercase letter with a space
    text = re.sub(r"[^a-z]", " ", text)

    # Tokenize tweets
    tokens = text.split()

    ## STOP WORDS

    # quiet=True keeps the downloader from logging on every call
    nltk.download('stopwords', quiet=True)
    # A set gives constant-time membership tests in the filter below
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ["false", "na", "true", "href", "rt", "twitter", "rel", "nofollow", "rr", "x"]
    )  # add tweets-specific stopwords

    # Filter out stopwords (tokens are already lowercase at this point)
    tokens = [w for w in tokens if w not in stop_words]

    # Filter out words that only occurred once, counting each token a single time
    counts = Counter(tokens)
    tokens = [w for w in tokens if counts[w] > 1]

    ## LEMMATIZE

    # Download necessary packages/libraries
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)

    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(w) for w in tokens]

    if verbose:
        print(tokens)

    # Hand the tokens back so callers can bind and reuse the result
    return tokens



In [48]:
# Goldman Sachs Tweets: Apply clean_tweets to gs_string
# NOTE(review): the result is bound back to gs_string, so the raw text is no longer
# available after this cell runs; re-running the cell hands the previous result
# (not the raw text) to clean_tweets.
gs_string = clean_tweets(gs_string)

['goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'like', 'big', 'like', 'deal', 'wang', 'qishan', 'good', 'goldman', 'morgan', 'stanley', 'm', 'surprised', 'market', 'q', 'result', 'goldman', 'sachs', 'changed', 'rating', 'hold', 'morgan', 'goldman', 'sachs', 'billion', 'market', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'uk', 'property', 'could', 'last', 'month', 'announces', 'b', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'sept', 'cpi', 'estimate', 'credit', 'suisse', 'barclays', 'bank', 'america', 'goldman', 'sachs', 'jp', 'morgan', 'mo', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'sept', 'cpi', 'estimate', 'credit', 'suisse', 'barclays', 'bank', 'america', 'goldman', 'sachs', 'jp', 'morgan', 'mo', 'news', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman',

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [49]:
# Home Depot Tweets: Apply clean_tweets to hd_string
# NOTE(review): the result is bound back to hd_string, so the raw text is no longer
# available after this cell runs; re-running the cell hands the previous result
# (not the raw text) to clean_tweets.
hd_string = clean_tweets(hd_string)

['koch', 'industry', 'american', 'crystal', 'sugar', 'home', 'depot', 'boeing', 'ups', 'home', 'depot', 'backed', 'hershel', 'walker', 'million', 'please', 'shop', 'lowe', 'hobby', 'lobby', 'chick', 'fil', 'home', 'depot', 'founder', 'home', 'depot', 'trump', 'supporter', 'said', 'president', 'biden', 'worse', 'jimmy', 'carter', 'cu', 'tweeted', 'home', 'depot', 'tweeted', 'think', 'w', 'time', 'home', 'depot', 'lowe', 'hobby', 'lobby', 'chick', 'home', 'depot', 'support', 'word', 'co', 'founder', 'home', 'depot', 'dumped', 'million', 'herschel', 'baby', 'daddy', 'factory', 'walker', 'odi', 'word', 'co', 'founder', 'home', 'depot', 'dumped', 'million', 'herschel', 'baby', 'daddy', 'factory', 'walker', 'odi', 'dividend', 'growth', 'stock', 'favorite', 'stock', 'getting', 'paid', 'every', 'month', 'jpm', 'jpmorgan', 'chase', 'ab', 'home', 'depot', 'backed', 'hershel', 'walker', 'million', 'please', 'shop', 'lowe', 'home', 'depot', 'backed', 'hershel', 'walker', 'million', 'please', 'shop

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [50]:
# Apple Tweets: Apply clean_tweets to ap_string
# NOTE(review): the result is bound back to ap_string, so the raw text is no longer
# available after this cell runs; re-running the cell hands the previous result
# (not the raw text) to clean_tweets.
ap_string = clean_tweets(ap_string)

['email', 'want', 'something', 'must', 'approach', 'slowly', 'calmly', 'piece', 'apple', 'carrot', 'apple', 'key', 'social', 'security', 'benefit', 'rise', 'percent', 'washington', 'chris', 'amp', 'back', 'fall', 'apple', 'cider', 'like', 'report', 'made', 'trump', 'podcast', 'good', 'morning', 'miss', 'apple', 'time', 'year', 'amp', 'day', 'amp', 'great', 'williams', 'report', 'new', 'check', 'discus', 'earn', 'app', 'would', 'like', 'pro', 'joe', 'mo', 'thick', 'thin', 'everyone', 'need', 'count', 'democratic', 'party', 'joined', 'exist', 'anymore', 'today', 'dem', 'party', 'complete', 'control', 'elitist', 'c', 'confirm', 'io', 'communicate', 'apple', 'service', 'outside', 'active', 'vpn', 'tunnel', 'worse', 'leak', 'dns', 'request', 'song', 'apple', 'music', 'say', 'trudeau', 'need', 'love', 'year', 'old', 'claim', 'doms', 'like', 'sure', 'ya', 'sport', 'let', 'get', 'apple', 'juice', 'democratic', 'party', 'joined', 'exist', 'anymore', 'today', 'dem', 'party', 'complete', 'control

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [36]:
# # ## Count vectorizer

# # Initialize count vectorizer
# vectorizer=CountVectorizer()   

# # Run count vectorizer
# Xs  =  vectorizer.fit_transform(filtered_temp)   
# print(type(Xs))

# ## EXPLORE THE OBJECT ATTRIBUTES

# # # View vocabulary dictionary
# print("vocabulary = ",vectorizer.vocabulary_)

# # col_names
# col_names=vectorizer.get_feature_names_out()

<class 'scipy.sparse.csr.csr_matrix'>
vocabulary =  {'goldman': 49, 'sachs': 100, 'laporta': 71, 'tomorrow': 121, 'like': 73, 'big': 12, 'deal': 33, 'wang': 128, 'qishan': 92, 'good': 51, 'morgan': 82, 'stanley': 114, 'surprised': 117, 'market': 77, 'result': 98, 'changed': 21, 'rating': 94, 'hold': 56, 'billion': 13, 'uk': 124, 'property': 91, 'could': 26, 'last': 72, 'month': 81, 'announces': 6, 'sept': 104, 'cpi': 29, 'estimate': 40, 'credit': 30, 'suisse': 116, 'barclays': 10, 'bank': 9, 'america': 3, 'jp': 67, 'mo': 80, 'news': 84, 'scotia': 101, 'nomura': 85, 'bmo': 14, 'cibc': 23, 'bofa': 15, 'get': 45, 'make': 76, 'hydrogen': 58, 'generation': 44, 'grow': 53, 'trillion': 123, 'per': 89, 'year': 131, 'going': 47, 'one': 87, 'amp': 4, 'loses': 75, 'gain': 43, 'scotiabank': 102, 'soon': 109, 'view': 127, 'day': 32, 'job': 66, 'hiredinny': 55, 'urrent': 125, 'activity': 1, 'indicator': 61, 'give': 46, 'country': 27, 'economy': 38, 'know': 69, 'hour': 57, 'reckon': 96, 'sac': 99, 'm

In [37]:
## SAVE FILES
# Switch to the ../data folder next to the notebook's directory before writing
full_path = os.path.realpath("cleaning_py.ipynb")
path, filename = os.path.split(full_path)
os.chdir(path)
os.chdir('../data')

# NOTE(review): filtered_temp is not defined in any cell shown here; confirm it
# holds the cleaned Goldman Sachs tokens before running this cell.
rows = zip(filtered_temp) # wrap each item in a 1-tuple so it becomes its own CSV row

# newline='' lets csv.writer control line endings itself (without it, extra blank
# rows appear on Windows); the explicit encoding avoids the platform default.
with open('cleaned_goldman_sachs_tweets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(rows)
    # NOTE(review): this appends the same items again as one comma-joined line after
    # the per-item rows; kept as-is, confirm whether both layouts are wanted.
    f.write(','.join(filtered_temp))