## Cleaning procedure
1. Punctuation removal
2. Removal of accented characters
3. Removal of URLs
4. Removal of special characters and digits
5. Contraction and case standardisation
6. Tokenization
7. Stop word removal
8. Lemmatization



In [None]:
!pip install -q contractions

[K     |████████████████████████████████| 287 kB 30.5 MB/s 
[K     |████████████████████████████████| 110 kB 55.2 MB/s 
[?25h

In [None]:
import pandas as pd
import nltk
import contractions
import unicodedata
import re
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
stopwords = nltk.corpus.stopwords.words("english")
lemmatizer = WordNetLemmatizer()
tags = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV
}

def get_clean(text):
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")
    print(text)
    text = re.sub(r"https?:\S*", "", text)
    print(text)
    text = re.sub(r"[^a-zA-z0-9.,!?/:;\"\'\s]", "", text)
    print(text)
    text = "".join([char for char in text if char not in punctuation])
    print(text)
    text = " ".join([contractions.fix(word).lower() for word in text.split()])
    print(text)
    tokens = nltk.word_tokenize(text)
    text = [word for word in tokens if word not in stopwords]
    print(text)
    nltk_tagged = [(word, (tags[tag[0]] if tag[0] in tags else 'n')) for word, tag in nltk.pos_tag(text)]
    text = " ".join([lemmatizer.lemmatize(word, tag) for word, tag in nltk_tagged])
    print(text)
    return text


In [None]:
text = '''Britain's communications intelligence agency GCHQ has issued a statement denying it wiretapped Donald Trump during the
US Presidential campaign. See statement: https://www.samplelink.com/. '''
text2 = '''I don't know whether this link https://www.samplelink.com/ still works or not. I tried it yesterday and it did. '''
get_clean(text2)

I don't know whether this link https://www.samplelink.com/ still works or not. I tried it yesterday and it did. 
I don't know whether this link  still works or not. I tried it yesterday and it did. 
I don't know whether this link  still works or not. I tried it yesterday and it did. 
I dont know whether this link  still works or not I tried it yesterday and it did 
i do not know whether this link still works or not i tried it yesterday and it did
['know', 'whether', 'link', 'still', 'works', 'tried', 'yesterday']
know whether link still work try yesterday


'know whether link still work try yesterday'

In [None]:
df = pd.read_csv('drive/MyDrive/Sentiment Analysis/data/test.csv')

In [None]:
df["comment_text"] = df["comment_text"].apply(lambda x: get_clean(x))
df.to_csv("cleaned_test.csv")
!cp cleaned_test.csv "drive/My Drive"

## Get word embedding

In [None]:
!pip install -q "tensorflow_text==2.9.0"
!pip install -q tf-models-official==2.9.0

[K     |████████████████████████████████| 4.6 MB 6.3 MB/s 
[K     |████████████████████████████████| 2.0 MB 7.1 MB/s 
[K     |████████████████████████████████| 352 kB 75.9 MB/s 
[K     |████████████████████████████████| 118 kB 71.9 MB/s 
[K     |████████████████████████████████| 1.3 MB 56.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 64.9 MB/s 
[K     |████████████████████████████████| 238 kB 64.3 MB/s 
[K     |████████████████████████████████| 636 kB 70.7 MB/s 
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Sentiment Analysis/Sentiment Classifier/clean_data/cleaned_train.csv")
df = df[df["text"].notna()].reset_index(drop=True)

In [None]:
comments = df["text"]

(27401, 3)

In [None]:
BERT_MODEL = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1'
PREPROCESS_MODEL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

preprocess = hub.load(PREPROCESS_MODEL)
bert = hub.load(BERT_MODEL)

n = 0
embeddings = np.empty((0,256))

while (n < comments.shape[0]):
    outputs = bert(preprocess(comments.iloc[n:min(n+1000, comments.shape[0])].values))["pooled_output"]
    if (n % 2000 == 0): print(f"Finish {n}")
    embeddings = np.append(embeddings, outputs.numpy(), axis=0)
    n += 1000

np.save("train_embedding.npy", embeddings)

Finish 0
Finish 2000
Finish 4000
Finish 6000
Finish 8000
Finish 10000
Finish 12000
Finish 14000
Finish 16000
Finish 18000
Finish 20000
Finish 22000
Finish 24000
Finish 26000


In [None]:
!cp train_embedding.npy "/content/drive/MyDrive/Sentiment Analysis/Sentiment Classifier/clean_data"

In [7]:
import numpy as np

# d = np.load("/content/drive/MyDrive/Sentiment Analysis/Toxic Comment Classifier/clean_data/train_embedding.npy")
np.round(d[2][:10],3)

array([ 0.307, -0.098, -0.33 ,  0.81 ,  0.11 ,  0.985, -0.998, -0.087,
        0.22 , -0.093])