# Code for cleaning Opinion Tweet Data

Cleaner function adapted from Professor James Hickman's Wikipedia crawler.

In [31]:
# Import relevant packages
import pandas as pd
import numpy as np
import nltk
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Download every NLTK resource this notebook relies on, so that it runs from a
# fresh environment (Restart Kernel -> Run All) instead of stopping with a
# LookupError on the first missing resource:
#   words         -> English vocabulary used to filter out non-words
#   stopwords     -> base English stop-word list
#   wordnet       -> data behind WordNetLemmatizer
#   omw-1.4       -> required alongside wordnet by some NLTK releases
#   vader_lexicon -> lexicon behind SentimentIntensityAnalyzer
#   punkt         -> tokenizer models used by nltk.tokenize.word_tokenize
#   punkt_tab     -> the same models in the format newer NLTK releases load
# An identifier unknown to the installed NLTK index is reported by
# nltk.download() as an error message (it does not raise by default), so
# listing both punkt variants is harmless.
for resource in ("words", "stopwords", "wordnet", "omw-1.4",
                 "vader_lexicon", "punkt", "punkt_tab"):
    nltk.download(resource)

# Import raw tweet data (first CSV column is used as the index)
# NOTE(review): encoding="unicode_escape" also interprets backslash sequences
# inside the tweet text -- confirm this is the intended decoding for this file.
op_df = pd.read_csv("../../data/00-raw-data/Opinion-Tweets-1011.csv",index_col=0,encoding="unicode_escape")
# English vocabulary as a set for fast membership tests in the cleaner
words = set(nltk.corpus.words.words())

# Initialize models
sid = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\alexp\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [32]:
# Stop-word set: NLTK's English list combined with terms that show up
# constantly on the NYCT twitter feed and carry no value for the analysis
mystopwords = set(stopwords.words('english')) | {
    "Northbound", "Southbound", "southbound", "northbound",
    "both", "directions", "while", "running",
    "problems", "delays", "delayed", "delay",
    "st", "ave", "av", "rd",
    "train", "trains", "near",
    'i', 'd', 's', 't', 'need',
}

In [44]:
# ADAPTED FROM PROFESSOR HICKMAN'S CLEANER FUNCTION
# Define tweet cleaning function
def twtclean(tweet):
    """Normalize one raw tweet into a space-separated string of English lemmas.

    Steps:
      1. Lowercase the text and replace every character other than a-z, 0-9
         and space with a space.
      2. Tokenize, drop tokens found in ``mystopwords``, lemmatize the
         remaining tokens with ``lemmatizer`` and drop one-character lemmas.
      3. Keep only lemmas that are present in the ``words`` vocabulary.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV field) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet, or ``""`` when nothing survives the filters.
    """
    # Missing / non-text values cannot be iterated character by character;
    # treat them as empty so the caller can drop the row later.
    if not isinstance(tweet, str):
        return ""

    # Chars to keep
    keep = " abcdefghijklmnopqrstuvwxyz0123456789"
    normalized = "".join(
        char.lower() if char.lower() in keep else " " for char in tweet
    )

    # Single pass over the tokens. The text contains no punctuation at this
    # point, so no punctuation special-casing is needed.
    kept = []
    for token in nltk.tokenize.word_tokenize(normalized):
        # Stop words are matched against the raw token, before lemmatization
        if token in mystopwords:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Drop one-character leftovers and anything outside the vocabulary
        if len(lemma) > 1 and lemma in words:
            kept.append(lemma)
    return " ".join(kept)

## Clean the full text of tweets and extract sentiment scores

In [50]:
# Convert text of tweets to list
full_text = op_df['full_text'].tolist()

# Cleaned tweets (corpus) and their [compound score, category] pairs (sents).
# NOTE(review): VADER is run on the *cleaned* text. NLTK's English stop-word
# list appears to include negations such as "not"/"no", so removing stop words
# first may change the polarity of some tweets -- confirm this is intended.
corpus = []
sents = []
# For loop partially adapted from Professor James Hickman's example
for tweet in full_text:
    tweet = twtclean(tweet)
    corpus.append(tweet)

    # Sentiment analysis
    score = sid.polarity_scores(tweet)

    # Category = whichever of neg/neu/pos has the largest magnitude. The keys
    # are read by name rather than by dict position; on a tie the earlier key
    # wins, and the label stays "" when all three scores are zero.
    new_max = 0
    new_label = ""
    for key in ("neg", "neu", "pos"):
        val = abs(score[key])
        if val > new_max:
            new_max = val
            new_label = key
    sents.append([score['compound'], new_label])

# One row per tweet: cleaned text, compound sentiment, category
new_df = [[text, sent, label] for text, (sent, label) in zip(corpus, sents)]
df = pd.DataFrame(new_df, columns=["text", "sentiment", "category"])

# Drop rows whose text was cleaned down to nothing (and any other NA rows).
# Assign the result back instead of using a chained inplace replace, which
# does not modify the DataFrame under pandas Copy-on-Write.
df['text'] = df['text'].replace('', np.nan)
df = df.dropna()

# Save to csv
df.to_csv('../../data/01-modified-data/NYCT-Opinion-Tweets-Sentiments.csv',index=False)

## Document-Term Frequency Matrix (Bag-of-Words Term Counts)

In [None]:
# Convert text of tweets to list
tweets = op_df['full_text'].tolist()
# Remove urls from text
tweets = [re.sub(r'http\S+', '', x) for x in tweets]
# Use CountVectorizer to tokenize tweets and count terms.
# stop_words must be a list (recent scikit-learn versions reject other
# collections such as a set); sorting makes the argument deterministic.
# min_df=0.001 ignores terms that occur in fewer than 0.1% of the tweets.
nyct_cvec = CountVectorizer(stop_words=sorted(mystopwords),min_df=0.001)
nyct_mx = nyct_cvec.fit_transform(tweets)
nyct_array = nyct_mx.toarray()
# Create document term frequency matrix (rows = tweets, columns = terms)
nyct_dtm = pd.DataFrame(data=nyct_array,columns=nyct_cvec.get_feature_names_out())
# Save dtm to csv
nyct_dtm.to_csv('../../data/01-modified-data/Opinion-tweets-DTM.csv')

In [None]:
# Export this notebook to HTML with nbconvert; the shell exit status is the
# value of the cell's last expression
export_cmd = 'jupyter nbconvert --to html NYCT-Opinion-Tweet-Cleaning.ipynb'
os.system(export_cmd)

0