# Text Data Cleaning
This section was completed using Python.

# Twitter Sentiment Analysis Data
We will now clean the Sentiment Analysis Data. 

In [38]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import re
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import shutil
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array

In [39]:
# Load the raw tweets, keep only the text and its sentiment label, and drop
# incomplete rows. The index is reset so the rows are labelled 0..n-1 again.
df = pd.read_csv('../../../data/00-raw-data/Tweets.csv')
df_subset = (
    df[['text','sentiment']]
    .dropna()
    .reset_index(drop=True)
)

In [40]:
# Sanity check: dimensions and column names of the cleaned subset
print(df_subset.shape, df_subset.columns, sep="\n")

(27480, 2)
Index(['text', 'sentiment'], dtype='object')


Now we will clean the tweets by removing unnecessary characters and normalizing capitalization (everything is lowercased).

In [41]:
# Characters that survive cleaning, and punctuation that is turned into a space
keep="abcdefghijklmnopqrstuvwxyz "
replace=".,!;"
# Integer tag for each sentiment string
label_ids={"positive":1,"negative":0,"neutral":2}

tweets=[]
y=[]
for i in range(len(df_subset)):
    raw=df_subset["text"][i].replace("<br />","")

    # Lowercase, swap the listed punctuation for spaces and drop every other
    # character (`keep` and `replace` share no characters, so a single pass
    # applies both rules).
    filtered="".join(
        " " if char in replace else char
        for char in raw.lower()
        if char in replace or char in keep
    )
    # Collapse runs of whitespace and trim both ends
    tmp=" ".join(filtered.split())
    tweets.append(tmp)

    # Convert the sentiment string to its integer tag; any other value is skipped
    sentiment=df_subset["sentiment"][i]
    if sentiment in label_ids:
        y.append(label_ids[sentiment])

    # Show the first few tweets before and after cleaning
    if i<3:
        print(i)
        print(raw,'\n')
        print(tmp)
    

0
 I`d have responded, if I were going 

id have responded if i were going
1
 Sooo SAD I will miss you here in San Diego!!! 

sooo sad i will miss you here in san diego
2
my boss is bullying me... 

my boss is bullying me


In [42]:
# Turn the label list into a NumPy array and confirm there is one label per tweet
y = np.array(y)
n_tweets, n_labels = len(tweets), len(y)
print(n_tweets, n_labels)

27480 27480


Now we will set up a CountVectorizer so that we can further format our tweets for text classification later.

In [43]:
# Bag-of-words counts over the cleaned tweets. The vocabulary is capped at 1000
# terms (the author reports kernel crashes with much more) and English stop
# words are removed.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english", max_features=1000)
Xs = vectorizer.fit_transform(tweets)

Let's format our data!

In [44]:
# Densify the sparse count matrix. The conversion to one-hot (presence/absence)
# values happens in the following cells; binary=True in CountVectorizer would
# achieve the same thing.
X = Xs.toarray()

In [8]:
# Largest count of each term over all tweets (one value per column)
maxs = X.max(axis=0)

In [9]:
# Scale every column by its maximum and round up: any non-zero count becomes 1.0
x = np.ceil(np.true_divide(X, maxs))

In [10]:
# Term -> column-index mapping learned by the fitted vectorizer
vocab0 = vectorizer.vocabulary_

In [11]:
# Feature matrix and label vector should have the same number of rows
print(x.shape,y.shape)

(27480, 1000) (27480,)


In [12]:
# Invert the vocabulary: column index -> term
vocab1 = {index: term for term, index in vocab0.items()}

In [13]:
# Peek at the first ten column indices and their terms in the inverted vocabulary
print(list(vocab1)[:10])
print(list(vocab1.values())[:10])


[420, 329, 772, 706, 552, 432, 474, 81, 409, 60]
['id', 'going', 'sooo', 'sad', 'miss', 'interview', 'leave', 'bought', 'httpwww', 'best']


In [14]:
# Reorder the columns so the terms appearing in the most tweets come first.
# `s` holds, per original column, the number of tweets containing that term.
df2 = pd.DataFrame(x)
s = df2.sum(axis=0)
column_order = s.sort_values(ascending=False).index
df2 = df2.loc[:, column_order]
print(df2.head())

   424  449  189  332  482  210  856  509  961  329  ...  405  555  532  427  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   657  187  749  87   166  893  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 1000 columns]


In [15]:
# Relabel the reordered columns 0..n-1, show the result and the per-column
# totals, then take the matrix back out as a NumPy array.
df2.columns = pd.RangeIndex(df2.shape[1])
print(df2.head())
column_totals = df2.sum(axis=0)
print(column_totals)
x = df2.to_numpy()

   0    1    2    3    4    5    6    7    8    9    ...  990  991  992  993  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   994  995  996  997  998  999  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 1000 columns]
0      2909.0
1      2214.0
2      1964.0
3      1505.0
4      1295.0
        ...  
995      24.0
996      24.0
997      24.0
998      23.0
999      23.0
Length: 1000, dtype: float64


In [16]:
# REMAP DICTIONARY TO CORRESPOND TO NEW COLUMN NUMBERS
# df2's columns have already been renumbered 0..n-1, so iterating over them can
# no longer tell which original vocabulary column ended up in which position
# (doing so simply reproduces vocab1). Recover the ordering from `s`, the
# per-column sums in the ORIGINAL column numbering, sorted exactly the same way
# as when the columns were reordered.
sorted_original_cols = s.sort_values(ascending=False).index
vocab2 = {
    new_col: vocab1[int(old_col)]
    for new_col, old_col in enumerate(sorted_original_cols)
}
# Every column of the reordered matrix must map to exactly one term
assert len(vocab2) == df2.columns.size




In [17]:
# Reordering the columns must not change the shape of the features or the labels
print(x.shape,y.shape)

(27480, 1000) (27480,)


Now we will export our cleaned data to our data folder which we can access later in the Naive Bayes text classification section.

In [36]:
import csv
# TWEETS: write the binary feature matrix, one CSV line per tweet
csv_file_path = "../../../data/01-modified-data/tweet.csv"
with open(csv_file_path, mode='w', newline='') as file:
    csv.writer(file).writerows(x)



NameError: name 'x' is not defined

In [25]:
# SENTIMENT: save the integer labels, one value per line
csv_file_path = "../../../data/01-modified-data/sentiment.csv"
np.savetxt(fname=csv_file_path, X=y, delimiter=',')



# Using Twitter API to Pull in HKJC Tweets
Now we will access the Twitter API to get Twitter data directly relating to the HKJC, specifically the horse Golden Sixty. We will be using the tweepy package for Python.

In [61]:
import os

import tweepy

# Read the credential from the environment instead of embedding it in the
# notebook. The token that was previously committed here should be treated as
# compromised and regenerated in the Twitter developer portal.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
if not bearer_token:
    raise RuntimeError(
        "Set the TWITTER_BEARER_TOKEN environment variable before running this cell."
    )

client = tweepy.Client(bearer_token)

# Search Tweets
query = "Golden Sixty"
# context_annotations are only returned when explicitly requested via
# tweet_fields; without it the annotation check below can never be true.
tweets = client.search_recent_tweets(
    query=query, tweet_fields=["context_annotations"], max_results=100
)

# tweets.data is None when the search matches nothing
for tweet in tweets.data or []:
    print(tweet.text)
    if len(tweet.context_annotations) > 0:
        print(tweet.context_annotations)

# One row per returned tweet; the cleaning cell below reads the 'text' and 'id' columns
df = pd.DataFrame(tweets.data)

RT @HKJC_Racing: The Mile Championship winner is on the track, Namur, preparing to face Golden Sixty this Sunday! 😍

#ナミュール 🎌 | #競馬 | @LONG…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
RT @HongKong_Racing: 𝑻𝒉𝒆 𝑷𝒓𝒊𝒅𝒆 𝒐𝒇 𝑯𝒐𝒏𝒈 𝑲𝒐𝒏𝒈! 🏅6⃣0⃣

GOLDEN SIXTY sets out to make history with a third G1 Hong Kong Mile triumph.

#HKIR |…
'He's Done His Job': Hong Kong Superstar Golden Sixty, Now An 8-Year-Old, Readies For International Swansong - Horse Racing News | Paulick Report https://t.co/1NxZ35h75Q
@tantric_eden You can be my Santa! Also, you are the biggest golden hearted hoe I have had the pleasure to know. I met my first Ho sixty years ago and in that time I have known a few so I know this to be a fact! https://t.co/GdRCEdhmzb
RT @HKJC_Racing: He has over 550 Hong Kong wins and is Golden Sixty's regular rider... 🌟

Will @Vincenthocy add a first @LONGINES #IJC titl…
RT @HKJC_Racing: 🗣️ "Great train

Now we want to clean this tweet data up, so that it matches the format of the sentiment data which we will use to train a classifier. This cleaning process largely follows the same process as the cleaning for the sentiment data.

In [76]:
df_subset = df[['text','id']]
# Same character-level cleaning as for the sentiment tweets: lowercase, turn
# . , ! ; into spaces, drop everything that is not a-z or a space, and collapse
# repeated whitespace.
# The label array `y` is intentionally not reset here: these tweets are
# unlabelled, and clearing `y` would only throw away the sentiment labels.
keep="abcdefghijklmnopqrstuvwxyz "
replace=".,!;"
tweets_golden=[]
# Iterate over the values so the loop does not depend on the index being 0..n-1
for raw_text in df_subset["text"]:
    tmp=""
    for char in raw_text.replace("<br />","").lower():
        if char in replace:
            tmp+=" "
        if char in keep:
            tmp+=char
    tweets_golden.append(" ".join(tmp.split()))

# NOTE(review): despite the "uncleaned" file name, this saves the cleaned text
# (before vectorization) — confirm that is what downstream code expects.
og_tweets = pd.DataFrame(tweets_golden)

csv_file_path = "../../../data/01-modified-data/goldensixty_uncleaned.csv"

og_tweets.to_csv(csv_file_path, index=False)

In [63]:
# NOTE(review): fit_transform re-fits `vectorizer` on the Golden Sixty tweets, so it
# replaces the vocabulary learned from the sentiment tweets and the resulting
# columns do not line up with the training features. If this matrix is meant to be
# scored by a classifier trained on the sentiment data, `vectorizer.transform(...)`
# with the training vocabulary is probably what is needed — confirm before changing,
# since the cells below divide by the per-column maximum and assume it is non-zero.
Xs_golden  =  vectorizer.fit_transform(tweets_golden)   

In [64]:
# Dense counts -> 0/1 presence indicators: scale each column by its maximum
# and round up, so any non-zero count becomes 1.0
X_golden = Xs_golden.toarray()
maxs_golden = X_golden.max(axis=0)
x2 = np.ceil(np.true_divide(X_golden, maxs_golden))
# Term -> column-index mapping of the vectorizer as it is currently fitted
vocab0_2 = vectorizer.vocabulary_

In [65]:
# Invert the vocabulary: column index -> term
vocab1_2 = {index: term for term, index in vocab0_2.items()}

In [66]:
# Reorder the columns so the terms appearing in the most tweets come first.
# `s_g` holds, per original column, the number of tweets containing that term.
df2_g = pd.DataFrame(x2)
s_g = df2_g.sum(axis=0)
column_order_g = s_g.sort_values(ascending=False).index
df2_g = df2_g.loc[:, column_order_g]
print(df2_g.head())

   66   166  124  83   190  116  47   93   239  125  ...  144  123  130  131  \
0  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0  0.0   

   132  136  140  142  143  122  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  1.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  

[5 rows x 240 columns]


In [67]:
# Relabel the reordered columns 0..n-1, show the result and the per-column
# totals, then take the matrix back out as a NumPy array.
df2_g.columns = pd.RangeIndex(df2_g.shape[1])
print(df2_g.head())
column_totals_g = df2_g.sum(axis=0)
print(column_totals_g)
x_g = df2_g.to_numpy()

   0    1    2    3    4    5    6    7    8    9    ...  230  231  232  233  \
0  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0  0.0   

   234  235  236  237  238  239  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  1.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  

[5 rows x 240 columns]
0      92.0
1      82.0
2      45.0
3      30.0
4      29.0
       ... 
235     1.0
236     1.0
237     1.0
238     1.0
239     1.0
Length: 240, dtype: float64


Now we can export our data to our data folder.

In [69]:
# Write the Golden Sixty feature matrix, one CSV line per tweet
csv_file_path = "../../../data/01-modified-data/goldensixty.csv"
with open(csv_file_path, mode='w', newline='') as file:
    csv.writer(file).writerows(x_g)