In [3]:
import re
import pandas as pd

In [4]:
# Load the labelled tweets; the file is semicolon-separated.
df = pd.read_csv("./data/tweets_labelled.csv", sep=";")
df.head()

Unnamed: 0,id,created_at,text,sentiment
0,77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive
1,661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative
2,413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive
3,760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive
4,830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive


In [5]:
# Keep only the columns needed downstream: drop the tweet id and timestamp,
# then discard rows with a missing value in any remaining column.
# drop() already returns a new frame, so no explicit copy is required.
df_drop = df.drop(columns=["id", "created_at"]).dropna()

# Preview last: only the final expression of a cell is displayed.
df_drop.head()

In [6]:
# cleaning text
def clean_text(text: str) -> str:
  """Normalize a raw text string into lowercase ASCII words.

  Steps, in order: lowercase the input, strip URL-like tokens (anything
  starting with ``http`` or ``www`` up to the next whitespace), drop every
  non-ASCII character (emojis included), remove characters that are neither
  word characters nor whitespace, and collapse whitespace runs into single
  spaces with no leading/trailing space.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned string. Letters, digits and underscores are kept.
  """
  lowered = text.lower()

  # URL-like tokens go first, while their punctuation is still intact.
  without_urls = re.sub(r"http\S+|www\S+|https\S+", '', lowered, flags=re.MULTILINE)

  # Encoding with errors='ignore' silently discards anything outside ASCII.
  ascii_only = without_urls.encode('ascii', 'ignore').decode('ascii')

  # \w keeps letters, digits and "_"; everything else non-space is removed.
  without_punct = re.sub(r"[^\w\s]", '', ascii_only)

  # Newlines, tabs and repeated spaces all become one space.
  return re.sub(r'\s+', ' ', without_punct).strip()


In [7]:
# Clean the text of the rows that survived dropna(). Reading from df_drop
# (not the unfiltered df) keeps clean_text away from missing values, which
# are not strings and would fail on .lower(), and avoids processing rows
# that are no longer part of df_drop.
df_drop["cleaned_text"] = df_drop["text"].apply(clean_text)

df_drop.head()

Unnamed: 0,text,sentiment,cleaned_text
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,rt robertbeadles yo enter to win 1000 monarch ...
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,srilanka surcharge on fuel removed the surchar...
2,Net issuance increases to fund fiscal programs...,positive,net issuance increases to fund fiscal programs...
3,RT @bentboolean: How much of Amazon's traffic ...,positive,rt bentboolean how much of amazons traffic is ...
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,amd ryzen 4000 desktop cpus looking great and ...


In [8]:
# Encode each sentiment label as an integer class id:
# negative -> 0, neutral -> 1, positive -> 2.

label_map = {
    name: code
    for code, name in enumerate(("negative", "neutral", "positive"))
}

# Labels missing from label_map come out of .map() as NaN.
df_drop["sentiment_encoded"] = df_drop["sentiment"].map(label_map)
df_drop.head()

Unnamed: 0,text,sentiment,cleaned_text,sentiment_encoded
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,rt robertbeadles yo enter to win 1000 monarch ...,2
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,srilanka surcharge on fuel removed the surchar...,0
2,Net issuance increases to fund fiscal programs...,positive,net issuance increases to fund fiscal programs...,2
3,RT @bentboolean: How much of Amazon's traffic ...,positive,rt bentboolean how much of amazons traffic is ...,2
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,amd ryzen 4000 desktop cpus looking great and ...,2


In [9]:
# Persist the processed frame as a semicolon-separated file, without the index.
df_drop.to_csv("./data/tweets_post_process.csv", sep=";", index=False)