# Cleaning Record Data (Tweets) in Python

In [1]:
## Import libraries
import os
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
import csv
import pandas as pd

# Set file path
# Resolve the notebook's file name against the current working directory,
# switch to the folder that contains it, then step into the sibling data folder.
full_path = os.path.realpath("cleaning_py.ipynb")
path, filename = os.path.split(full_path)
os.chdir(path)
os.chdir('../data')

# Open files and save as corpus
# Each file is read in full (a leading BOM is stripped by utf-8-sig) and split
# into a list with one entry per line.
tweet_lines = []
for tweet_file in (
    "GoldmanSachsTweets.txt",
    "HomeDepotTweets.txt",
    "AppleTweets.txt",
    "accenturetweets.txt",
    "BMSTweets.txt",
):
    with open(tweet_file, "r", encoding='utf-8-sig') as f:
        tweet_lines.append(f.read().splitlines())

# Same order as the file names above.
gs, hd, ap, ac, bms = tweet_lines

In [2]:
# Convert to dataframe
# Each list of lines becomes a single-column DataFrame (the column is labelled 0).
gs, ap, hd, ac, bms = (
    pd.DataFrame(lines) for lines in (gs, ap, hd, ac, bms)
)

In [3]:
# Remove rows that are not Tweets
# Observation from the raw files: every row from the first line that is exactly
# "FALSE" onward does not contain a tweet, so each frame is cut off at that row.

def _drop_from_first_false(frame, marker='FALSE'):
    """Return `frame` without the first row equal to `marker` and all rows after it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column 0 holds one line of text per row.
    marker : str, default 'FALSE'
        Exact cell value that marks the start of the rows to discard.

    Returns
    -------
    pandas.DataFrame
        The rows located before the first marker row. When no row equals
        `marker`, `frame` is returned unchanged, so re-running this cell on
        already-trimmed data is a no-op rather than an IndexError.
    """
    is_marker = frame[0].eq(marker)
    if not is_marker.any():
        return frame
    # argmax on a boolean array yields the position of the first True, which
    # keeps the slice below positional regardless of the index labels.
    first_marker_pos = int(is_marker.to_numpy().argmax())
    return frame.drop(frame.index[first_marker_pos:])


gs = _drop_from_first_false(gs)    # Goldman Sachs
ap = _drop_from_first_false(ap)    # Apple
hd = _drop_from_first_false(hd)    # Home Depot
ac = _drop_from_first_false(ac)    # Accenture
bms = _drop_from_first_false(bms)  # BMS

In [11]:
# Create function that cleans tweets

# Tweet-specific tokens removed in addition to NLTK's English stop words.
_TWEET_STOPWORDS = ("false", "na", "true", "href", "rt", "twitter", "rel", "nofollow", "rr", "x")

# Lazily built (stop-word set, lemmatizer) pair shared by every clean_tweets call.
_CLEANING_RESOURCES = None


def _get_cleaning_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use.

    The NLTK packages are requested once here instead of once per tweet, and the
    stop words are kept in a set so membership checks are constant time.
    """
    global _CLEANING_RESOURCES
    if _CLEANING_RESOURCES is None:
        for package in ('stopwords', 'wordnet', 'omw-1.4'):
            nltk.download(package, quiet=True)
        stop_words = set(stopwords.words('english'))
        stop_words.update(_TWEET_STOPWORDS)
        _CLEANING_RESOURCES = (stop_words, WordNetLemmatizer())
    return _CLEANING_RESOURCES


def clean_tweets(temp):
    """Normalise one tweet into a space-separated string of lemmatized words.

    Steps, in order: lowercase; strip @mentions and #hashtags; strip links
    starting with "http" or "www."; blank out (), !, ? and any [bracketed]
    text; replace every character that is not a lowercase a-z letter with a
    space; split on whitespace; drop English and tweet-specific stop words;
    lemmatize the remaining words with WordNet.

    Parameters
    ----------
    temp : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmatized words joined by single spaces (an empty
        string when nothing survives).
    """
    stop_words, lemma = _get_cleaning_resources()

    temp = temp.lower()

    # Remove hashtags and mentions. The character class includes "_" so that a
    # handle such as @user_name is removed whole instead of leaving "name".
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)

    # Remove links (the dot after "www" is escaped so it only matches a literal dot)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"www\.\S+", "", temp)

    # Remove punctuation and anything enclosed in square brackets
    temp = re.sub(r'[()!?]', ' ', temp)
    temp = re.sub(r'\[.*?\]', ' ', temp)

    # Replace everything that is not a lowercase ASCII letter with a space
    temp = re.sub(r"[^a-z]", " ", temp)

    # Tokenize on whitespace and filter stop words (text is already lowercase)
    filtered_temp = [w for w in temp.split() if w not in stop_words]

    # Lemmatize and rebuild a single string
    return ' '.join(lemma.lemmatize(w) for w in filtered_temp)


# Apply function to various corpora and save as new variables
# Column 0 of each frame holds the raw tweet text; clean_tweets is applied to
# every entry and the result is a Series of cleaned strings.

# Goldman Sachs tweets
gs_clean = gs[0].apply(clean_tweets)

# Home Depot tweets
hd_clean = hd[0].apply(clean_tweets)

# Apple tweets
ap_clean = ap[0].apply(clean_tweets)

# Accenture tweets
acc_clean = ac[0].apply(clean_tweets)

# BMS tweets
bms_clean = bms[0].apply(clean_tweets)

# Display the cleaned Goldman Sachs tweets as the cell output
gs_clean

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-

0                         goldman sachs laporta tomorrow
1                         goldman sachs laporta tomorrow
2      hagnews leader like financial big shark like d...
3                  wang qishan girlfriend gao yanyan sit
4      apology sir john good former minister successf...
                             ...                        
893                                        credit suisse
894                                         bank america
895                                                 cibc
896    student short term anxious long term confident...
897    breaking news goldman sachs research best cryp...
Name: 0, Length: 898, dtype: object

In [12]:
# Count vectorizer to examine word frequency

# Build the vectorizer; fitting it yields a matrix with one row per tweet and
# one column per word, each cell holding that word's count in that tweet.
vectorizer = CountVectorizer()

# Fit on the cleaned Goldman Sachs tweets and report the type of the result
Xs = vectorizer.fit_transform(gs_clean)
print(type(Xs))

## EXPLORE THE OBJECT ATTRIBUTES

# Show the vocabulary learned during fitting
print("vocabulary = ", vectorizer.vocabulary_)

# Keep the feature (word) names for later use as column names
col_names = vectorizer.get_feature_names_out()

<class 'scipy.sparse.csr.csr_matrix'>


In [23]:
## SAVE FILES

# Set file path
# Same directory dance as at the top of the notebook: resolve the notebook name
# against the current working directory, go there, then into ../data.
full_path = os.path.realpath("cleaning_py.ipynb")
path, filename = os.path.split(full_path)
os.chdir(path)
os.chdir('../data')

# Write each cleaned Series to its own CSV file in the current directory.
for cleaned_series, csv_name in (
    (gs_clean, 'cleaned_goldman_sachs_tweets.csv'),   # Goldman Sachs
    (hd_clean, 'cleaned_home_depot_tweets.csv'),      # Home Depot
    (ap_clean, 'cleaned_apple_tweets.csv'),           # Apple
    (acc_clean, 'cleaned_accenture_tweets.csv'),      # Accenture
    (bms_clean, 'cleaned_bms_tweets.csv'),            # BMS
):
    cleaned_series.to_csv(csv_name)