In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Relative paths of the train / test CSV files.
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

# Shared NLTK helpers used by the preprocessing functions.
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

# Words that stay in the text even though NLTK lists them as stop words.
notstopwords = {'not', 'can', 'no'}
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus, which is
# why the corpus is reached through `nltk.corpus.stopwords` here.
stopwords = set(nltk.corpus.stopwords.words('english')).difference(notstopwords)

# Regex -> replacement rules applied to every tweet, one after the other.
# Order matters (e.g. "can't" has to be rewritten before the generic "n't"
# rule), so this relies on dicts preserving insertion order (Python 3.7+).
standarizer_dict = {
    r"(http|https)?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?": " <url> ",
    r'(.)\1+': r"\1\1", # cooool --> cool; coool--> cool
    r"\'s": "",
    r"\'n": "",
    r"\'m": " am",
    # Only the stand-alone word "im"; without the word boundaries every "im"
    # substring was hit ("time" -> "t e", "him" -> "h ").
    r"\bim\b": " ",
    r"\'ve": " have",
    # \b instead of a literal leading space, so "can't" is also caught at the
    # very start of a tweet (it used to fall through to "n't" -> "ca not").
    r"\bcan\'t": "cannot",
    r"n\'t": " not",
    r"\'re": " are",
    r"\'d": " would",
    r"\'ll": " will",
    r"\.{1,1}": " ",
    r" [-+]?[.\d]*[\d]+[:,.\d]*": "",
    r"@\w+": r'  <entity> '
}

In [None]:
def preprocess_tweet(tweet):
    """Normalise a raw tweet and return its lower-cased, lemmatised tokens.

    Steps, in order: replace escaped line breaks, apply every regex rule of
    ``standarizer_dict``, strip stop words, then tokenise, POS-tag and
    lemmatise what is left.

    Args:
        tweet (str): raw tweet text.

    Returns:
        list[str]: lower-cased lemmas in order of appearance.
    """
    # str.replace returns a new string, so the result has to be re-bound
    # (the original call discarded it). "\\n" is the two-character sequence
    # backslash + n, not a real line break.
    tweet = tweet.replace("\\n", " ")
    # Standarize tweet
    for current_form, standared_form in standarizer_dict.items():
        tweet = re.sub(current_form, standared_form, tweet)
    # Remove stop words
    if stopwords:  # an empty alternation would match at every word boundary
        # Longest first, so "don't" is tried before "don"; the secondary
        # alphabetical key makes the pattern independent of set ordering.
        alternatives = sorted(stopwords, key=lambda word: (-len(word), word))
        stopword_re = re.compile(
            r'\b(' + r'|'.join(re.escape(word) for word in alternatives) + r')\b\s*',
            # The stop-word list is lower-case while the tweet is still in its
            # original case at this point; without this flag "The" / "I"
            # survive and are only lower-cased afterwards.
            re.IGNORECASE,
        )
        tweet = stopword_re.sub('', tweet)
    # Lemmatization
    # First letter of the tagger's tag -> WordNet POS. NLTK's default tagger
    # emits Penn Treebank tags, where adjectives are JJ* and adverbs RB*, so
    # testing for 'a' alone never matched an adjective.
    wordnet_pos = {'j': 'a', 'a': 'a', 'n': 'n', 'v': 'v', 'r': 'r'}
    tweet_tokens = []
    for token, tag in pos_tag(tokenizer.tokenize(tweet)):
        pos = wordnet_pos.get(tag[:1].lower())
        if pos is not None:
            lem = lemmatizer.lemmatize(token, pos)
        else:
            # Fall back to the lemmatizer's default part of speech.
            lem = lemmatizer.lemmatize(token)

        tweet_tokens.append(lem.lower())
    return tweet_tokens

def preprocess_features(df):
    """Build one-hot lookup tables for the 'class' and 'polarity' columns.

    Args:
        df (pandas.DataFrame): frame with a 'class' and a 'polarity' column.

    Returns:
        tuple[dict, dict]: ``(label_dict, polarity_dict)``. Each maps a
        distinct column value (taken in sorted order) to a 1-D integer numpy
        array containing a single 1.
    """
    # Get labels (0,1,2,3) and polarity (sadness, joy, ..)
    labels = sorted(set(df['class'].tolist()))
    polarity = sorted(set(df['polarity'].tolist()))
    # Create onehot encoding for labels and polarities. Each column gets an
    # identity matrix of its own size: sharing one matrix sized by the number
    # of labels gave polarity vectors of the wrong width and, through zip(),
    # silently dropped polarities whenever there were more of them than labels.
    label_dict = dict(zip(labels, np.eye(len(labels), dtype=int)))
    polarity_dict = dict(zip(polarity, np.eye(len(polarity), dtype=int)))
    return label_dict, polarity_dict

def preprocess_dataframe(df):
    """Tokenise the 'text' column and shorten the 'class' column.

    Both columns are overwritten on the frame that is passed in, and that
    same frame is returned.

    Args:
        df (pandas.DataFrame): frame with 'text' and 'class' columns.

    Returns:
        pandas.DataFrame: ``df`` with 'text' holding the token list produced
        by ``preprocess_tweet`` and 'class' cut at the first colon.
    """
    tokenised = df['text'].apply(preprocess_tweet)
    df['text'] = tokenised.tolist()
    # Remove suffix of class : 0: no joy can be inferred -> 0
    df['class'] = [raw_class.partition(":")[0] for raw_class in df['class'].tolist()]
    return df

def prepare_cvs_data(file_path, seed=None):
    """Load a tab-separated tweet file, shuffle its rows and preprocess it.

    Args:
        file_path (str): path of a headerless, tab-separated file whose four
            columns are named id, text, polarity and class after loading.
        seed (int, optional): when given, the shuffle is drawn from a private
            ``numpy.random.RandomState(seed)`` and is therefore reproducible.
            With the default ``None`` the global numpy RNG is used, exactly
            as before this parameter existed.

    Returns:
        pandas.DataFrame: the shuffled rows after ``preprocess_dataframe``.
        The original row index is kept, so it is no longer sorted.
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id','text','polarity','class'] # Set up column names
    rng = np.random if seed is None else np.random.RandomState(seed)
    df = df.iloc[rng.permutation(len(df))] # Random permutations
    df = preprocess_dataframe(df)
    return df

def prepare_data(file_path):
    """Load a file and return its tweets with one-hot polarities and labels.

    Args:
        file_path (str): path handed to ``prepare_cvs_data``.

    Returns:
        tuple: ``(text, polarity, label)`` -- the 'text' column of the loaded
        frame, followed by two lists with one one-hot array per row. Note the
        order: polarity comes before label.
    """
    frame = prepare_cvs_data(file_path)
    # The lookup tables are derived from the values found in this file.
    class_lookup, polarity_lookup = preprocess_features(frame)
    # Transform labels and polarities to onehot encoding
    one_hot_labels = [class_lookup[value] for value in frame['class']]
    one_hot_polarities = [polarity_lookup[value] for value in frame['polarity']]
    return frame['text'], one_hot_polarities, one_hot_labels
    

In [None]:
# prepare_data returns (text, polarity, label) in that order. The previous
# unpacking named the second value "labels" and the third "polarities", so
# the two variables held each other's data.
test_tweets, test_polarities, test_labels = prepare_data(data_test_path)
train_tweets, train_polarities, train_labels = prepare_data(data_train_path)
# Each record is a (tokens, one-hot polarity, one-hot label) triple.
# NOTE(review): before the fix the second element actually carried the label
# and the third the polarity -- check any later cell that indexes by position.
test_data = list(zip(test_tweets, test_polarities, test_labels))
train_data = list(zip(train_tweets, train_polarities, train_labels))

In [None]:
# Quick manual check of the tweet pipeline on a sample containing emoji, a
# line break, a contraction, an elongated word, a mention and hashtags.
tweet = (
    "😭😭 I \n think that you've a lot looool money ;) "
    "@Singaholic121 Good morning, love! Happy first day of fall. "
    "Let's make some awesome #autumnmemories #annabailey"
)
# Last expression of the cell: the token list is shown as the cell output.
preprocess_tweet(tweet)