Preprocessing of data

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk

from glob import glob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
df = pd.read_csv('/content/drive/MyDrive/output.csv')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45194 entries, 0 to 45193
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              45194 non-null  int64  
 1   id                      45194 non-null  int64  
 2   created_at              45194 non-null  object 
 3   entities                45192 non-null  object 
 4   referenced_tweets       41352 non-null  object 
 5   possibly_sensitive      45194 non-null  bool   
 6   public_metrics          45194 non-null  object 
 7   author_id               45194 non-null  int64  
 8   context_annotations     44793 non-null  object 
 9   edit_controls           45194 non-null  object 
 10  conversation_id         45194 non-null  int64  
 11  edit_history_tweet_ids  45194 non-null  object 
 12  lang                    45194 non-null  object 
 13  reply_settings          45194 non-null  object 
 14  text                    45194 non-null

In [10]:
df['text'][35]

'RT @AnnyeongOnglee: [ONGLEE SCHEDULE] 01 MAY 2023\n\nMarvel Studios’ Guardians of the Galaxy \nVol.3 Thailand Gala Premiere\n\n⏰ งานเริ่ม 18.00…'

In [11]:
# Remove URLs, user mentions, non-alphanumeric characters and hashtags from the tweets
df['text'] = df['text'].apply(lambda x: re.sub(r'#\S+', '', str(x))) # remove hashtags
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x))) # remove non-alphanumeric characters
df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+', '', str(x))) # remove URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'@\S+', '', str(x))) # remove user mentions

In [12]:
df['text'][35]

'RT AnnyeongOnglee ONGLEE SCHEDULE 01 MAY 2023\n\nMarvel Studios Guardians of the Galaxy \nVol3 Thailand Gala Premiere\n\n  1800'

In [13]:
# Remove punctuation and convert text to lowercase
df['text'] = df['text'].apply(lambda x: re.sub('[^\w\s]', '', x))
df['text'] = df['text'].apply(lambda x: x.lower())

In [14]:
df['text'][35]

'rt annyeongonglee onglee schedule 01 may 2023\n\nmarvel studios guardians of the galaxy \nvol3 thailand gala premiere\n\n  1800'

In [15]:
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [16]:
df['text'][35]

'rt annyeongonglee onglee schedule 01 may 2023 marvel studios guardians galaxy vol3 thailand gala premiere 1800'

In [17]:
# Remove any remaining white space
df['text'] = df['text'].apply(lambda x: x.strip())

In [19]:
df['text'][35]

'rt annyeongonglee onglee schedule 01 may 2023 marvel studios guardians galaxy vol3 thailand gala premiere 1800'

In [18]:
df.insert(loc=6, column='text_tokens', value=df['text'].apply(lambda x: word_tokenize(x)))


In [20]:
df['text'][33], df['text_tokens'][33]


('rt marvelstudios experience guardians galaxy vol 3 theaters may 5 get tickets http',
 ['rt',
  'marvelstudios',
  'experience',
  'guardians',
  'galaxy',
  'vol',
  '3',
  'theaters',
  'may',
  '5',
  'get',
  'tickets',
  'http'])

In [21]:
df.head(3)


Unnamed: 0.1,Unnamed: 0,id,created_at,entities,referenced_tweets,possibly_sensitive,text_tokens,public_metrics,author_id,context_annotations,...,lang,reply_settings,text,author,__twarc,attachments,in_reply_to_user_id,in_reply_to_user,geo,withheld
0,0,1652915575360987136,2023-05-01 05:59:17+00:00,"{'annotations': [{'start': 48, 'end': 56, 'pro...","[{'type': 'retweeted', 'id': '1652870466284183...",False,"[rt, marvelffightus, rush, check, guardians, c...","{'retweet_count': 143, 'reply_count': 0, 'like...",1573278611897196544,"[{'domain': {'id': '46', 'name': 'Business Tax...",...,en,everyone,rt marvelffightus rush check guardians come ba...,"{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,,,
1,1,1652915467718373376,2023-05-01 05:58:51+00:00,"{'annotations': [{'start': 36, 'end': 44, 'pro...","[{'type': 'retweeted', 'id': '1652711911857745...",False,"[rt, marvelstudios, friday, guardians, back, e...","{'retweet_count': 454, 'reply_count': 0, 'like...",1372656867004928000,"[{'domain': {'id': '46', 'name': 'Business Tax...",...,en,everyone,rt marvelstudios friday guardians back experie...,"{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,,,
2,2,1652915360587186176,2023-05-01 05:58:26+00:00,"{'annotations': [{'start': 37, 'end': 65, 'pro...","[{'type': 'retweeted', 'id': '1652033181942423...",False,"[rt, discussingfilm, new, clip, guardians, gal...","{'retweet_count': 970, 'reply_count': 0, 'like...",1552644578850783232,"[{'domain': {'id': '46', 'name': 'Business Tax...",...,en,everyone,rt discussingfilm new clip guardians galaxy vo...,"{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,,,


In [None]:
# Save the preprocessed data to a new file
output_filename = 'preprocessed__data'
df.to_csv(os.path.join(data_path, f'{output_filename}.csv'), index=False)

In [23]:
# Load the Twitter data
processed = pd.read_csv('/content/drive/MyDrive/preprocessed_data.csv')