<a href="https://colab.research.google.com/github/aflah02/NLP-Albumentations-Data-Augmentation/blob/main/NLPAlbumentations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import nltk
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
import random
import pandas as pd
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def shuffleSentenceTransform(paragraph):
  """Return `paragraph` with its sentences in a random order.

  The paragraph is split into sentences with NLTK's sentence tokenizer,
  the sentences are shuffled in place using the global `random` state,
  and the result is re-joined with single spaces.
  """
  shuffled = list(nltk.tokenize.sent_tokenize(paragraph))
  # In-place shuffle; seed `random` beforehand for reproducible output.
  random.shuffle(shuffled)
  separator = " "
  return separator.join(shuffled)

In [3]:
def removeDuplicateSentences(paragraph):
  """Drop repeated sentences from `paragraph`, keeping first occurrences.

  Sentences are obtained with NLTK's sentence tokenizer and compared by
  exact string equality. The surviving sentences keep their original
  relative order and are re-joined with single spaces.
  """
  all_sentences = nltk.tokenize.sent_tokenize(paragraph)
  # dict keys are unique and remember insertion order, so this keeps the
  # first occurrence of every sentence.
  first_occurrences = dict.fromkeys(all_sentences)
  return " ".join(first_occurrences)

In [4]:
def removeNumbers(sentence):
  """Strip standalone numbers from `sentence`.

  A number is removed only when it ends at a word boundary and starts at
  a word boundary, after whitespace, or at the start of the string; an
  optional leading minus sign and the whitespace in front of the number
  are removed with it. Digits embedded in words (e.g. "b3") are kept.

  Reference: https://stackoverflow.com/a/817328/13858953
  """
  number_pattern = re.compile(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b")
  return number_pattern.sub("", sentence)

In [39]:
def removeHashtags(sentence, removeWords):
  """Remove hashtags from a tweet-like `sentence`.

  The sentence is tokenized with NLTK's `TweetTokenizer` and the tokens
  are re-joined with single spaces, so the original whitespace is not
  preserved.

  Args:
    sentence: Text to process.
    removeWords: If truthy, every token starting with "#" is dropped
      entirely. Otherwise only the leading "#" is stripped from tokens
      longer than one character, keeping the tag text.

  Returns:
    The processed tokens joined by single spaces.
  """
  tokens = TweetTokenizer().tokenize(sentence)

  if removeWords:
    # Drop the whole hashtag token, tag text included.
    kept = [token for token in tokens if token[0] != "#"]
    return " ".join(kept)

  # Keep the tag text; a lone "#" token is left untouched.
  stripped = [
    token[1:] if token[0] == "#" and len(token) > 1 else token
    for token in tokens
  ]
  return " ".join(stripped)

In [48]:
def removeUsers(sentence, keepNames):
  """Remove @-mentions from a tweet-like `sentence`.

  The sentence is tokenized with NLTK's `TweetTokenizer` and the tokens
  are re-joined with single spaces, so the original whitespace is not
  preserved.

  Args:
    sentence: Text to process.
    keepNames: If truthy, only the leading "@" is stripped from tokens
      longer than one character, keeping the user name. Otherwise every
      token starting with "@" is dropped entirely.

  Returns:
    The processed tokens joined by single spaces.
  """
  tokens = TweetTokenizer().tokenize(sentence)

  if keepNames:
    # Keep the user name; a lone "@" token is left untouched.
    unmarked = [
      token[1:] if token[0] == "@" and len(token) > 1 else token
      for token in tokens
    ]
    return " ".join(unmarked)

  # Drop the whole mention token, user name included.
  remaining = [token for token in tokens if token[0] != "@"]
  return " ".join(remaining)

In [58]:
# Top-level domains recognised by the URL pattern, in the order used by the
# upstream regex (order matters: e.g. "com" must be tried before "co").
# "Ja" is carried over verbatim from upstream; the pattern is case-insensitive.
_URL_TLDS = (
  r"com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx"
  r"|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az"
  r"|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz"
  r"|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz"
  r"|dd|de|dj|dk|dm|do|dz"
  r"|ec|ee|eg|eh|er|es|et|eu"
  r"|fi|fj|fk|fm|fo|fr"
  r"|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy"
  r"|hk|hm|hn|hr|ht|hu"
  r"|id|ie|il|im|in|io|iq|ir|is|it"
  r"|je|jm|jo|jp"
  r"|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz"
  r"|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly"
  r"|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz"
  r"|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz"
  r"|om"
  r"|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py"
  r"|qa"
  r"|re|ro|rs|ru|rw"
  r"|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz"
  r"|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz"
  r"|ua|ug|uk|us|uy|uz"
  r"|va|vc|ve|vg|vi|vn|vu"
  r"|wf|ws"
  r"|ye|yt|yu"
  r"|za|zm|zw"
)

# Regex taken from https://stackoverflow.com/a/28552670/13858953, assembled
# from pieces so that no escape sequence can be split across a line break.
_URL_PATTERN = re.compile(
  r"(?i)\b("
  # Alternative 1: "http(s):" prefix, or "domain.tld/" followed by a path.
  r"(?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:" + _URL_TLDS + r")/)"
  # Path characters, allowing up to two levels of balanced parentheses.
  r"(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+"
  # The URL must not end in trailing punctuation.
  r"""(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])"""
  # Alternative 2: bare domain such as "example.com", but not the domain
  # part of an e-mail address (no "@" directly before or after).
  r"|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:" + _URL_TLDS + r")\b/?(?!@))"
  r")"
)


def removeURLs(sentence):
  """Delete every URL found in `sentence`.

  Both scheme-prefixed URLs ("http://foo.com/bar") and bare domains with
  a known top-level domain ("example.com") are removed. Only the URL text
  itself is deleted; surrounding whitespace is left in place.

  The previous version of this pattern was wrapped across two source
  lines immediately after a backslash, which turned the intended `\\b`
  word boundary into "escaped newline + literal b" and prevented bare
  domains from ever matching in ordinary text.

  Args:
    sentence: Text to process.

  Returns:
    `sentence` with all matched URLs replaced by the empty string.
  """
  return _URL_PATTERN.sub("", sentence)

In [59]:
# Sample inputs for a quick smoke test of every transform defined above.
demo_paragraph = "<Sentence1>. <Sentence2>. <Sentence4>. <Sentence4>. <Sentence5>. <Sentence5>."
demo_hashtag_tweet = "this tweet is example #key1_key2_key3"
demo_mention_tweet = "this tweet is example @Aflah_Gamer"

# Sentence-level transforms (the shuffle output varies between runs).
print(shuffleSentenceTransform(demo_paragraph))
print(removeDuplicateSentences(demo_paragraph))

# Token-level clean-ups.
print(removeNumbers("This must not b3 delet3d, but the number at the end yes -134.411"))
print(removeHashtags(demo_hashtag_tweet, False))
print(removeHashtags(demo_hashtag_tweet, True))
print(removeUsers(demo_mention_tweet, False))
print(removeUsers(demo_mention_tweet, True))
print(removeURLs("this tweet is example @Aflah_Gamer http://foo.com/more_(than)_one_(parens)"))

<Sentence4>. <Sentence2>. <Sentence5>. <Sentence4>. <Sentence1>. <Sentence5>.
<Sentence1>. <Sentence2>. <Sentence4>. <Sentence5>.
This must not b3 delet3d, but the number at the end yes
this tweet is example key1_key2_key3
this tweet is example
this tweet is example
this tweet is example Aflah_Gamer
this tweet is example @Aflah_Gamer 
