In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations



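# One-time setup: uncomment and run once to download the NLTK resources
# used below (tokenizer models, stop-word list, WordNet).
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')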

# Load the raw tweet dataset into a DataFrame
data_path = './twt_emotion_analysis/data.csv'
df = pd.read_csv(data_path)

In [6]:
# Define the preprocessing function

# Patterns used by preprocess_tweet, compiled once instead of on every call.
_TWEET_NUMBER_RE = re.compile(r'Tweet #\d+:')
# Word boundaries keep "RT" inside other words (e.g. "SMART") intact.
_RETWEET_RE = re.compile(r'\bRT\b')
# The original pattern r'ed' deleted every "ed" substring and mangled words
# such as "needed" or "education"; only a standalone "ed" token is removed now.
# NOTE(review): confirm whether this artifact filter is needed at all.
_STRAY_ED_RE = re.compile(r'\bed\b')
_HASHTAG_RE = re.compile(r'#\w+')
_MENTION_RE = re.compile(r'@\w+')
_URL_RE = re.compile(r'http\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
_WHITESPACE_RE = re.compile(r'\s+')
# The original pattern escaped the caret (r'\^...'), so it looked for a literal
# '^' that can never survive the non-letter removal; this anchors at the start.
_LEADING_SINGLE_CHAR_RE = re.compile(r'^[a-zA-Z]\s+')

# Lazily-built NLTK helpers shared by all calls to preprocess_tweet.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_tweet(tweet):
    """Clean a raw tweet and return it as a string of lemmatized tokens.

    Steps, in order:
      1. strip the "Tweet #<n>:" prefix, standalone "RT" markers and
         standalone "ed" tokens;
      2. strip hashtags, @mentions and URLs;
      3. replace every non-letter character with a space and collapse
         whitespace;
      4. drop a single leading one-letter token;
      5. tokenize, lowercase, remove English stop words and lemmatize.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. the float NaN pandas uses
        for missing cells) is treated as an empty tweet.

    Returns
    -------
    str
        Space-separated processed tokens; '' when nothing is left.
    """
    # Missing values reach .apply() as NaN (a float); re.sub would raise on them.
    if not isinstance(tweet, str):
        return ''

    # "Tweet #<n>:" must be stripped before hashtags, otherwise "#<n>" would be
    # consumed by the hashtag pattern and leave "Tweet :" behind.
    tweet = _TWEET_NUMBER_RE.sub('', tweet)
    tweet = _RETWEET_RE.sub('', tweet)
    tweet = _STRAY_ED_RE.sub('', tweet)

    # Remove hashtags, mentions and links/URLs.
    tweet = _HASHTAG_RE.sub('', tweet)
    tweet = _MENTION_RE.sub('', tweet)
    tweet = _URL_RE.sub('', tweet)

    # Replace digits, punctuation and other non-letters with spaces, then
    # normalise whitespace so the start-of-string anchor below sees a letter.
    tweet = _NON_LETTER_RE.sub(' ', tweet)
    tweet = _WHITESPACE_RE.sub(' ', tweet).strip()

    # Remove a single character from the start.
    tweet = _LEADING_SINGLE_CHAR_RE.sub('', tweet)

    # Nothing left to tokenize; skip the NLTK work entirely.
    if not tweet:
        return ''

    stop_words, lemmatizer = _nltk_resources()

    # Tokenize, lowercase, drop stop words and lemmatize in a single pass.
    lowered = (token.lower() for token in word_tokenize(tweet))
    tokens = [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)
    

In [9]:
# Drop rows with no tweet text *before* preprocessing: a missing cell is a
# float NaN, which the regex-based cleaning cannot handle. (The original
# null check ran on 'Processed_tweet' after the steps that would already
# have failed, so it could never take effect.)
df = df[df['Tweets'].notna()].copy()

df['Processed_tweet'] = df['Tweets'].apply(preprocess_tweet)

# Keep only tweets that still have more than 3 words after preprocessing
# (i.e. drop those with 3 words or fewer).
has_enough_words = df['Processed_tweet'].apply(lambda text: len(text.split()) > 3)
df = df[has_enough_words]

# Drop tweets whose processed text is identical, keeping the first occurrence
df = df.drop_duplicates(subset=['Processed_tweet'])

# Persist the cleaned dataset without the index column
save_path = './twt_emotion_analysis/data_cleaned.csv'
df.to_csv(save_path, index=False)