# Import librairies

In [16]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import re
import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS
import string
import torch

import random
import numpy as np

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import data

In [17]:
df = pd.read_csv("../data/training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)

In [18]:
df = df.set_axis(["target","id","date","flag","user","text"], axis=1)


In [19]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [20]:
df = df.sample(100000)

# Pre-processing text

## Lowercase

In [21]:
df["text"] = df["text"].str.lower()

## Replace emojis with text

https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html

https://medium.com/geekculture/text-preprocessing-how-to-handle-emoji-emoticon-641bbfa6e9e7

In [22]:
# 'Emoji_Dict.p'- download link https://drive.google.com/open?id=1G1vIkkbqPBYPKHcQ8qy0G2zkoab2Qv4v


with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}



# def convert_emojis_to_word(text):
#     regex = re.compile("|".join(map(re.escape, Emoji_Dict.keys(  ))))
#     # For each match, look up the corresponding value in the dictionary
#     text = regex.sub(lambda match: " ".join(Emoji_Dict[match.group(0).replace("_"," ").replace(",","").replace(":","").split()]), text)
#     text = re.sub("\s\s+" , " ", text)
#     # for emot in Emoji_Dict:
#     #     text = re.sub(emot,r' '+emot, text)
#     #     text = re.sub("\s\s+" , " ", text)
#     #     text = re.sub(r'('+emot+')', " ".join(Emoji_Dict[emot].replace("_"," ").replace(",","").replace(":","").split()), text)
#     return text

def convert_emojis_to_word(text):
    # print(text)
    regex = re.compile("|".join(map(re.escape, Emoji_Dict.keys(  ))))
    # print(regex.sub(lambda match: Emoji_Dict[match.group(0)], text))
    # print(regex.sub(lambda match: Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," "), text))
    text = regex.sub(lambda match: Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," "), text)
    # text = regex.sub(lambda match: " ".join(Emoji_Dict[match.group(0)].replace("_"," ").replace(","," ").replace(":"," ").split()), text)
    text = re.sub("\s\s+" , " ", text)
    return text

df["text"] = df["text"].apply(convert_emojis_to_word)

## Replace abbreviations

In [23]:

abbr_dict={
    "what's":"what is",
    "what're":"what are",
    "who's":"who is",
    "who're":"who are",
    "where's":"where is",
    "where're":"where are",
    "when's":"when is",
    "when're":"when are",
    "how's":"how is",
    "how're":"how are",

    "i'm":"i am",
    "we're":"we are",
    "you're":"you are",
    "they're":"they are",
    "it's":"it is",
    "he's":"he is",
    "she's":"she is",
    "that's":"that is",
    "there's":"there is",
    "there're":"there are",

    "i've":"i have",
    "we've":"we have",
    "you've":"you have",
    "they've":"they have",
    "who've":"who have",
    "would've":"would have",
    "not've":"not have",

    "i'll":"i will",
    "we'll":"we will",
    "you'll":"you will",
    "he'll":"he will",
    "she'll":"she will",
    "it'll":"it will",
    "they'll":"they will",

    "isn't":"is not",
    "wasn't":"was not",
    "aren't":"are not",
    "weren't":"were not",
    "can't":"can not",
    "couldn't":"could not",
    "don't":"do not",
    "didn't":"did not",
    "shouldn't":"should not",
    "wouldn't":"would not",
    "doesn't":"does not",
    "haven't":"have not",
    "hasn't":"has not",
    "hadn't":"had not",
    "won't":"will not",
    
    'shoulda': 'should have',
    'gonna': 'going to', 
    'wanna': 'wanting to',
    "ain't": "is not",
    "wana": 'wanting to',
   
    
    'ngl': 'not going to lie',
    'idk': 'i do not know',
    'fyi': 'for your information',
    'tbh': 'to be honest',
    'asap': 'as soon as possible',
    'bbiab': 'be back in a bit',
    'bbl': 'be back later',
    'bbs': 'be back soon',
    'bf': 'boyfriend',
    'bff': 'best friend forever',
    'brb': 'be right back',
    'cya': 'see you',
    'faq': 'frequently asked questions',
    'ftw': 'for the win',
    'g2g': 'got to go',
    'gf': 'girlfriend',
    'gr8': 'great',
    'hru': 'how are you',
    'ight': 'alright',
    'imo': 'in my opinion',
    'imy': 'i miss you',
    'irl': 'in real life',
    'istg': 'i swear',
    'lmao': 'laughing',
    'lmk': 'let me know',
    'lol': 'laughing',
    'nvd': 'nevermind',
    'noob': 'amateur',
    ' np ': ' no problem ',
    'ofc': 'of course',
    'omg': 'i can not believe it',
    'rn': 'right now',
    'ttyl': 'talk to you later',
    ' u ': ' you ',
    'wym': 'what do you mean ?',
    ' y ': ' why ',
    'yw': 'you are welcome'
    
}


def replace_abbreviations(text):
    regex = re.compile("|".join(map(re.escape, abbr_dict.keys(  ))))
    text = regex.sub(lambda match: abbr_dict[match.group(0)], text)
    return text
df["text"] = df["text"].apply(replace_abbreviations)

## Handle punctiation

In [24]:
punctiations = {}
for elem in string.punctuation:
    punctiations[elem] = f" {elem} "
punctiations["..."] = " ... "
def handle_punctiation(text):
    regex = re.compile("|".join(map(re.escape, punctiations.keys(  ))))
    text = regex.sub(lambda match: punctiations[match.group(0)], text)
    text = re.sub("\s\s+" , " ", text)
    return text
df["text"] = df["text"].apply(handle_punctiation)

## Remove stopwords

In [25]:
def filter_preprocess(x):
    # Delete links
    x = " ".join(filter(lambda y: not(y.startswith("http")), x.split()))
    x = " ".join(filter(lambda y: not(y.endswith(".com")), x.split()))
    
    # Remove stop words
    x = " ".join(filter(lambda y: not(y in stopwords.words('english')), x.split()))
    
    x = " ".join(filter(lambda y: y[0]!="@", x.split()))
    
    return x

df["text"] = df["text"].apply(filter_preprocess)

# Tokenize and encode words

In [26]:
df["text"] = df["text"].apply(word_tokenize)

In [27]:
# Get all different words
l = set(df["text"].sum())

In [28]:
# Define vocabulary
vocab = {k: v for v, k in enumerate(l)}

In [29]:
# Encode words thanks to vocab
def voc_encode(x):
    try:
        return vocab[x]
    except:
        return 0

def encode(x):
    return list(map(voc_encode, x))
df["text"] = df["text"].apply(encode)

# Handle label

In [31]:
df["target"].value_counts()

0    50066
4    49934
Name: target, dtype: int64

# Test

In [14]:
from nltk.stem.snowball import SnowballStemmer

englishStemmer=SnowballStemmer("english")

In [18]:
englishStemmer.keys()

AttributeError: 'SnowballStemmer' object has no attribute 'keys'

In [28]:
!pip3 install emot

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
[0m