<a href="https://colab.research.google.com/github/carlosinator/cil-sentiment/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
pip3 install wordsegment

In [None]:
# colab auth
# google.colab is only importable inside Colab. When the notebook is run
# locally, skip this step instead of crashing; gsutil is then presumably
# relying on credentials already configured on the machine (verify locally).
try:
  from google.colab import auth
except ImportError:
  print("google.colab not available - skipping Colab authentication")
else:
  auth.authenticate_user()

In [None]:
# copy data from google cloud storage
# Each command copies one raw tweet file from the gs://cil_2023 bucket into the
# current working directory ("."), which is where BASE_PATH points by default.
!gsutil cp "gs://cil_2023/train_pos_full.txt" .
!gsutil cp "gs://cil_2023/train_neg_full.txt" .
!gsutil cp "gs://cil_2023/test_data.txt" .

In [None]:
# general imports (standard library first, then third-party)
import re
from pathlib import Path

import pandas as pd
from wordsegment import load, segment

# wordsegment has to load its word-frequency data once before segment() is used
load()

In [None]:
# Input / output locations.
# Raw files are read from BASE_PATH (the working directory by default); point
# it somewhere else, e.g. Path("./twitter-dataset/"), when running locally
# with the data in another folder.
BASE_PATH = Path()
BASE_OUT_PATH = Path("prepro_output")

# create the output folder up front so the later writes find it
BASE_OUT_PATH.mkdir(parents=True, exist_ok=True)

# raw input files
TRAIN_POS_PATH = BASE_PATH.joinpath("train_pos_full.txt")
TRAIN_NEG_PATH = BASE_PATH.joinpath("train_neg_full.txt")
TEST_PATH = BASE_PATH.joinpath("test_data.txt")

# preprocessed output files
TRAIN_POS_OUT_PATH = BASE_OUT_PATH.joinpath("train_pos_full_preprocessed_without_duplicates.txt")
TRAIN_NEG_OUT_PATH = BASE_OUT_PATH.joinpath("train_neg_full_preprocessed_without_duplicates.txt")
TEST_OUT_PATH = BASE_OUT_PATH.joinpath("test_data_preprocessed_without_duplicates.txt")

In [None]:
# read data
def _read_tweets(path):
  """Return the non-empty lines of ``path`` as a list of stripped strings.

  A plain line reader is used on purpose. ``pd.read_fwf`` treats the file as
  fixed-width columns whose boundaries are inferred from the first rows only,
  so taking column 0 can silently drop the tail of a tweet, and lines such as
  "NA" or "null" come back as NaN floats instead of text.

  Args:
    path: location of a UTF-8 text file with one tweet per line.

  Returns:
    A list with one string per non-blank line, in file order, with
    surrounding whitespace removed.
  """
  with open(path, encoding="utf-8") as f:
    stripped = (line.strip() for line in f)
    # blank lines carry no tweet and are skipped
    return [line for line in stripped if line]


tweets_train_pos = _read_tweets(TRAIN_POS_PATH)
tweets_train_neg = _read_tweets(TRAIN_NEG_PATH)
tweets_test = _read_tweets(TEST_PATH)

In [None]:
# remove duplicate strings from list and print out how many were removed
def remove_duplicates(tweets):
    """Return ``tweets`` without repeated entries, keeping first-seen order.

    The sizes before and after deduplication are printed. The input list is
    not modified.

    Args:
        tweets: list of hashable items (tweet strings).

    Returns:
        A new list containing each distinct item once, ordered by first
        occurrence in ``tweets``.
    """
    print("------------ REMOVING DUPLICATES ------------")
    print("before removing duplicates: ", len(tweets))
    # dict.fromkeys keeps insertion order, so the result is identical on every
    # run; going through set() would give an arbitrary, run-dependent order.
    tweets = list(dict.fromkeys(tweets))
    print("after removing duplicates: ", len(tweets))
    return tweets


# deduplicate every split; each call prints its own before/after counts
tweets_train_pos, tweets_train_neg, tweets_test = [
    remove_duplicates(split)
    for split in (tweets_train_pos, tweets_train_neg, tweets_test)
]

In [None]:
def unhashtag(tweet, segmenter=None):
  """Replace every hashtag in ``tweet`` with its space-separated words.

  A hashtag is a '#' followed by one or more non-whitespace characters. Only
  the hashtag itself is replaced, so the surrounding text and whitespace are
  left untouched and no trailing space is added when a hashtag ends the tweet.

  Args:
    tweet: text of a single tweet.
    segmenter: optional callable that takes the hashtag string (including the
      leading '#') and returns a list of words. Defaults to the ``segment``
      function imported from wordsegment.

  Returns:
    The tweet with each hashtag replaced by the segmenter's words joined with
    single spaces.
  """
  if segmenter is None:
    segmenter = segment
  # one regex pass handles all matches; no manual offset tracking needed
  return re.sub(r'#\S+', lambda m: ' '.join(segmenter(m.group(0))), tweet)

In [None]:
# quick manual check: the displayed result should show the hashtag split into
# separate words
unhashtag("bla bla #ihatethis bla bla")

In [None]:
# preprocessing: segment the hashtags of every split, updating each list in place

# should take about 5 min
for split in (tweets_train_pos, tweets_train_neg, tweets_test):
  # tweets without a '#' are kept as they are and never reach unhashtag
  split[:] = [unhashtag(text) if '#' in text else text for text in split]

In [None]:
# write preprocessed tweets back to file, one tweet per line
for filename, tweetlist in [(TRAIN_POS_OUT_PATH, tweets_train_pos),
                            (TRAIN_NEG_OUT_PATH, tweets_train_neg),
                            (TEST_OUT_PATH, tweets_test)]:
  # Explicit UTF-8: without it open() uses the platform's default encoding,
  # which can fail on or mangle non-ASCII tweet text on non-UTF-8 systems.
  with open(filename, 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in tweetlist)

In [None]:
# (optional) copy data to google cloud storage
!gsutil cp $TRAIN_POS_OUT_PATH "gs://cil_2023/"
!gsutil cp $TRAIN_NEG_OUT_PATH "gs://cil_2023/"
!gsutil cp $TEST_OUT_PATH "gs://cil_2023/"