In [66]:
# Import libraries
import os
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
import csv

# Set file path
full_path = os.path.realpath("cleaning_py.ipynb")
path, filename = os.path.split(full_path)
os.chdir(path)
os.chdir('../data')

# Open files and save as corpus
with open("GoldmanSachsTweets.txt", "r", encoding='utf-8-sig') as f:
    gs_string = f.read()

with open("HomeDepotTweets.txt", "r", encoding='utf-8-sig') as f:
    hd_string = f.read()

with open("AppleTweets.txt", "r", encoding='utf-8-sig') as f:
    ap_string = f.read()

In [67]:
def clean_tweets(string):
    temp = string.lower()

    # Remove hashtags and mentions
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)

    # Remove links
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"www.\S+", "", temp)

    # Remove punctuation
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)

    # Remove non-alpha numbers
    temp = re.sub("[^a-z]"," ", temp)

    # Tokenize tweets
    temp = temp.split()

    ## STOP WORDS

    # Set stopwords
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    new_stopwords = ["false", "na", "true", "href", "rt", "twitter", "rel", "nofollow", "rr", "x"] # add tweets-specific stopwords
    stop_words = nltk.corpus.stopwords.words('english') 
    stop_words.extend(new_stopwords)

    # Filter stopwords
    filtered_temp = [w for w in temp if not w.lower() in stop_words]
    
    # filtered_temp = []

    # for w in temp:
    #     if w not in stop_words:
    #         filtered_temp.append(w)

    # Filter out words that only occurred once
    once = [x for x in filtered_temp if filtered_temp.count(x) == 1]
    for word in list(filtered_temp):  # iterating on a copy since removing will mess things up
        if word in once:
            filtered_temp.remove(word)

    ## LEMMATIZE
    nltk.download('wordnet')
    nltk.download('omw-1.4')

    lemma = WordNetLemmatizer()

    filtered_temp = '\n'.join([lemma.lemmatize(w) for w in filtered_temp])

    filtered_temp = filtered_temp.split()
    return filtered_temp

In [68]:
# ## Define clean_tweets function to clean tweets

# def clean_tweets(text_string):

#     # Lower tweets
#     text_string = text_string.lower()

#     # Remove hashtags and mentions
#     text_string = re.sub("@[A-Za-z0-9_]+","", text_string)
#     text_string = re.sub("#[A-Za-z0-9_]+","", text_string)

#     # Remove links
#     text_string = re.sub(r"http\S+", "", text_string)
#     text_string = re.sub(r"www.\S+", "", text_string)

#     # Remove punctuation
#     text_string = re.sub('[()!?]', ' ', text_string)
#     text_string = re.sub('\[.*?\]',' ', text_string)

#     # Remove non-alpha numbers
#     text_string = re.sub("[^a-z]"," ", text_string)

#     # Tokenize tweets
#     text_string = text_string.split()

#     ## STOP WORDS

#     # Set stopwords
#     nltk.download('stopwords')
#     stop_words = set(stopwords.words('english'))
#     new_stopwords = ["false", "na", "true", "href", "rt", "twitter", "rel", "nofollow", "rr", "x"] # add tweets-specific stopwords
#     stop_words = nltk.corpus.stopwords.words('english') 
#     stop_words.extend(new_stopwords)

#     # Filter out stopwords
#     text_string = [w for w in text_string if not w.lower() in stop_words]
    
#     # Filter out words that only occurred once
#     once = [x for x in text_string if text_string.count(x) == 1]
#     for word in list(text_string):  # iterating on a copy since removing will mess things up
#         if word in once:
#             text_string.remove(word)

#     ## LEMMATIZE

#     # Download necessary packages/libraries
    
#     nltk.download('wordnet')
#     nltk.download('omw-1.4')

#     lemma = WordNetLemmatizer()
#     text_string = '\n'.join([lemma.lemmatize(w) for w in text_string])

#     # Split
#     text_string = text_string.split()
#     print(text_string)



In [69]:
# Goldman Sachs Tweets: Apply clean_tweets to gs_string
gs_list = clean_tweets(gs_string)

# Home Depot Tweets: Apply home_depot to d_string
hd_list = clean_tweets(hd_string)

# Apple Tweets: Apply clean_tweets to ap_string
ap_list = clean_tweets(ap_string)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-

In [70]:
print(gs_list)

['goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'like', 'big', 'like', 'deal', 'wang', 'qishan', 'good', 'goldman', 'morgan', 'stanley', 'm', 'surprised', 'market', 'q', 'result', 'goldman', 'sachs', 'changed', 'rating', 'hold', 'morgan', 'goldman', 'sachs', 'billion', 'market', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'uk', 'property', 'could', 'last', 'month', 'announces', 'b', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'sept', 'cpi', 'estimate', 'credit', 'suisse', 'barclays', 'bank', 'america', 'goldman', 'sachs', 'jp', 'morgan', 'mo', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman', 'sachs', 'laporta', 'tomorrow', 'sept', 'cpi', 'estimate', 'credit', 'suisse', 'barclays', 'bank', 'america', 'goldman', 'sachs', 'jp', 'morgan', 'mo', 'news', 'goldman', 'sachs', 'goldman', 'sachs', 'laporta', 'tomorrow', 'goldman',

In [71]:
# # ## Count vectorizer

# # Initialize count vectorizer
# vectorizer=CountVectorizer()   

# # Run count vectorizer
# Xs  =  vectorizer.fit_transform(filtered_temp)   
# print(type(Xs))

# ## EXPLORE THE OBJECT ATTRIBUTES

# # # View vocabulary dictionary
# print("vocabulary = ",vectorizer.vocabulary_)

# # col_names
# col_names=vectorizer.get_feature_names_out()

In [72]:
## SAVE FILES

# Set file path
full_path = os.path.realpath("cleaning_py.ipynb")
path, filename = os.path.split(full_path)
os.chdir(path)
os.chdir('../data')

# Goldman Sachs
rows = zip(gs_list) # turn columns into rows
with open('cleaned_goldman_sachs_tweets.csv', 'w') as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)
    f.write(','.join(gs_list))

# Home Depot
rows = zip(hd_list) # turn columns into rows
with open('cleaned_home_depot_tweets.csv', 'w') as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)
    f.write(','.join(hd_list))

# Apple
rows = zip(ap_list) # turn columns into rows
with open('cleaned_apple_tweets.csv', 'w') as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)
    f.write(','.join(ap_list))