# Dataset cleaning & pre-process

In [6]:
import numpy as np
import pandas as pd
import nltk.corpus

# Please delete this if you are not using google drive
from google.colab import drive
drive.mount('/content/drive')

# Please change this to your working directory
path="/content/drive/My Drive/2022NLP/project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
closed_class_stop_words = ['a','the','an','and','or','but','about','above','after','along','amid','among',\
                           'as','at','by','for','from','in','into','like','minus','near','of','off','on',\
                           'onto','out','over','past','per','plus','since','till','to','under','until','up',\
                           'via','vs','with','that','can','cannot','could','may','might','must',\
                           'need','ought','shall','should','will','would','have','had','has','having','be',\
                           'is','am','are','was','were','being','been','get','gets','got','gotten',\
                           'getting','seem','seeming','seems','seemed',\
                           'enough', 'both', 'all', 'your' 'those', 'this', 'these', \
                           'their', 'the', 'that', 'some', 'our', 'no', 'neither', 'my',\
                           'its', 'his' 'her', 'every', 'either', 'each', 'any', 'another',\
                           'an', 'a', 'just', 'mere', 'such', 'merely' 'right', 'no', 'not',\
                           'only', 'sheer', 'even', 'especially', 'namely', 'as', 'more',\
                           'most', 'less' 'least', 'so', 'enough', 'too', 'pretty', 'quite',\
                           'rather', 'somewhat', 'sufficiently' 'same', 'different', 'such',\
                           'when', 'why', 'where', 'how', 'what', 'who', 'whom', 'which',\
                           'whether', 'why', 'whose', 'if', 'anybody', 'anyone', 'anyplace', \
                           'anything', 'anytime' 'anywhere', 'everybody', 'everyday',\
                           'everyone', 'everyplace', 'everything' 'everywhere', 'whatever',\
                           'whenever', 'whereever', 'whichever', 'whoever', 'whomever' 'he',\
                           'him', 'his', 'her', 'she', 'it', 'they', 'them', 'its', 'their','theirs',\
                           'you','your','yours','me','my','mine','I','we','us','much','and/or'
                           ]

In [8]:
# Start pre-processing

nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

def process_text(text):
  
    # Remove punctuation
    import string
    for punctuation in string.punctuation:
      if punctuation!=".":
        clean_text=text.replace(punctuation,' ')
    
    # tokenizing
    tokens = word_tokenize(clean_text)

    clean_tokens = []
    for token in tokens:
        if token.isalpha():
            clean_tokens.append(token)
   
    return clean_tokens

#Lemminization

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

def lemma(term):
  lemma_list=[]
  for token in term:
    lemma=wordnet_lemmatizer.lemmatize(token)
    if lemma not in closed_class_stop_words:
      lemma_list.append(lemma)
  return lemma_list

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Dataset 1: Yelp Review Rating Labeling Dataset

In [9]:
!pip install datasets
from datasets import load_dataset



In [10]:
yelp_rating_labled_dataset = load_dataset("yelp_review_full")

Downloading builder script:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full (download: 187.06 MiB, generated: 496.94 MiB, post-processed: Unknown size, total: 684.00 MiB) to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
yelp_rating_train = pd.DataFrame(yelp_rating_labled_dataset['train'])
yelp_rating_test = pd.DataFrame(yelp_rating_labled_dataset['test'])

In [12]:
yelp_rating_train_sample = yelp_rating_train.sample(65000)
yelp_rating_test_sample = yelp_rating_test.sample(5000)

In [13]:
yelp_rating_train_sample['cleaned_tokens'] = yelp_rating_train_sample['text'].apply(process_text).apply(lemma)
yelp_rating_test_sample['cleaned_tokens'] = yelp_rating_test_sample['text'].apply(process_text).apply(lemma)

In [14]:
yelp_rating_train_sample.head()

Unnamed: 0,label,text,cleaned_tokens
481860,0,Though I heard nothing but great recommendatio...,"[Though, heard, nothing, great, recommendation..."
207303,0,"I like dueling piano bars, don't get me wrong....","[dueling, piano, bar, do, wrong, BUT, IM, NEVE..."
375253,4,We had a terrific experience at this hotel. A...,"[We, terrific, experience, hotel, All, staff, ..."
612246,3,MURRAY SAWCHUCK at THE TROPICANA.\n\nI was cur...,"[MURRAY, SAWCHUCK, THE, wa, curious, see, type..."
55163,0,"We used to come here a lot, but then not long ...","[We, used, come, here, lot, then, long, ago, v..."


In [15]:
yelp_rating_test_sample.head()

Unnamed: 0,label,text,cleaned_tokens
36951,1,I was disappointed that the basics (like on an...,"[wa, disappointed, basic, syrup, procured, ven..."
12591,0,After they cancelled my reservation for no rea...,"[After, cancelled, reservation, reason, embarr..."
34948,2,fun to watch,"[fun, watch]"
956,1,Jillian's offered great football viewing for t...,"[Jillian, offered, great, football, viewing, t..."
34517,0,I went here with a few friends and I was expec...,"[went, here, few, friend, wa, expecting, great..."


## Dataset 2: Yelp Reviews Polarity-labled Dataset

The data file is 4GB for this one and is too large. Therefore I decide to convert the rating-labled data above to polarized data by mapping rating score of 1 and 2 to negative, 4 and 5 to positive.

In [32]:
yelp_rating_train_polarized = yelp_rating_train.sample(65000)
yelp_rating_test_polarized = yelp_rating_test.sample(5000)
polarization = {1: "Negative",
         0: "Negative",
         3: "Neutral",
         4: "Positive",
         3: "Postive"}
yelp_rating_train_polarized["polar"] = yelp_rating_train_polarized["label"].map(polarization)
yelp_rating_test_polarized["polar"] = yelp_rating_test_polarized["label"].map(polarization)

In [33]:
yelp_rating_train_polarized['cleaned_tokens'] = yelp_rating_train_polarized['text'].apply(process_text).apply(lemma)
yelp_rating_test_polarized['cleaned_tokens'] = yelp_rating_test_polarized['text'].apply(process_text).apply(lemma)

In [34]:
yelp_rating_train_polarized.head()
yelp_rating_test_polarized.groupby("label").size()

label
0    1037
1     982
2    1010
3     989
4     982
dtype: int64

## Dataset 3: Rotten Tomatoes Review Polarity Labeling

In [17]:
rotten_tomatoes_dataset = load_dataset("rotten_tomatoes")

Downloading builder script:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/921 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset rotten_tomatoes_movie_review/default (download: 476.34 KiB, generated: 1.28 MiB, post-processed: Unknown size, total: 1.75 MiB) to /root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes_movie_review downloaded and prepared to /root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
rotten_tomatoes_dataset_train = pd.DataFrame(rotten_tomatoes_dataset['train'])
rotten_tomatoes_dataset_test = pd.DataFrame(rotten_tomatoes_dataset['test'])

In [19]:
rotten_tomatoes_dataset_train['cleaned_tokens'] = rotten_tomatoes_dataset_train['text'].apply(process_text).apply(lemma)
rotten_tomatoes_dataset_test['cleaned_tokens'] = rotten_tomatoes_dataset_test['text'].apply(process_text).apply(lemma)

In [20]:
rotten_tomatoes_dataset_train.head()
rotten_tomatoes_dataset_test.head()

Unnamed: 0,text,label,cleaned_tokens
0,lovingly photographed in the manner of a golde...,1,"[lovingly, photographed, manner, golden, book,..."
1,consistently clever and suspenseful .,1,"[consistently, clever, suspenseful]"
2,"it's like a "" big chill "" reunion of the baade...",1,"[big, chill, reunion, gang, guy, harmless, pra..."
3,the story gives ample opportunity for large-sc...,1,"[story, give, ample, opportunity, action, susp..."
4,"red dragon "" never cuts corners .",1,"[red, dragon, never, cut, corner]"


## Dataset 4: Tweet Emoji Labeling

In [23]:
tweet_emoji_dataset = load_dataset("tweet_eval", "emoji")

Reusing dataset tweet_eval (/root/.cache/huggingface/datasets/tweet_eval/emoji/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
tweet_emoji_dataset_train = pd.DataFrame(tweet_emoji_dataset['train'])
tweet_emoji_dataset_test = pd.DataFrame(tweet_emoji_dataset['test'])

In [25]:
tweet_emoji_dataset_train['cleaned_tokens'] = tweet_emoji_dataset_train['text'].apply(process_text).apply(lemma)
tweet_emoji_dataset_test['cleaned_tokens'] = tweet_emoji_dataset_test['text'].apply(process_text).apply(lemma)

In [26]:
tweet_emoji_dataset_train.head()
tweet_emoji_dataset_test.head()

Unnamed: 0,text,label,cleaned_tokens
0,en Pelham Parkway,2,"[en, Pelham, Parkway]"
1,The calm before...... | w/ sofarsounds @user |...,10,"[The, calm, before, sofarsounds, user, B, Hall]"
2,Just witnessed the great solar eclipse @ Tampa...,6,"[Just, witnessed, great, solar, eclipse, Tampa..."
3,This little lady is 26 weeks pregnant today! E...,1,"[This, little, lady, week, pregnant, today, Ex..."
4,"Great road trip views! @ Shartlesville, Pennsy...",16,"[Great, road, trip, view, Shartlesville, Penns..."
