## Exploration.

In [13]:
import pandas as pd
import re
import unicodedata
import numpy as np

In [6]:
# Load the raw training set from the Kaggle competition input directory.
train_df_raw = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')

In [11]:
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def clean(string):
    """Normalize a string to NFKD form and tidy up its whitespace.

    Non-breaking spaces become plain spaces, every run of whitespace is
    collapsed into a single space, and leading/trailing whitespace is removed.
    """
    normalized = unicodedata.normalize('NFKD', string)
    normalized = normalized.replace('\xa0', ' ')
    collapsed = _RE_COMBINE_WHITESPACE.sub(' ', normalized)
    return collapsed.strip()
# Build the cleaned frame from a deep copy so the raw frame is left untouched.
# Both text columns get the same treatment: cast to str, then run `clean`.
# NOTE(review): depending on the pandas version, astype(str) may turn missing
# values into the literal string 'nan' — confirm that is acceptable here.
train_df = train_df_raw.copy(deep=True).assign(
    text=lambda frame: frame['text'].astype(str).apply(clean),
    selected_text=lambda frame: frame['selected_text'].astype(str).apply(clean),
)

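We noticed that the selected text of neutral tweets tends to include every word of the original text. Is this more than a tendency? Apparently not: the check below returns `False`, so at least some neutral rows have a selected text that differs from the full text.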

In [14]:
# Keep only the rows labelled neutral.
neutral_df = train_df.loc[train_df['sentiment'] == 'neutral']
# True only if every neutral row's selected_text equals its full text.
np.all(neutral_df['text'].eq(neutral_df['selected_text']))

False

In [15]:
# Neutral rows whose selected_text is more than 10 characters shorter than the text.
neutral_df[
    neutral_df['text'].map(len)
    .sub(neutral_df['selected_text'].map(len))
    .gt(10)
]

Unnamed: 0,textID,text,selected_text,sentiment
35,4f5267ad70,"Thats it, its the end. Tears for Fears vs Eric...","Thats it, its the end. Tears for Fears",neutral
57,6086b1f016,will be back later. http://plurk.com/p/rp3k7,will be back later.,neutral
110,9c4817f73b,if u have a friendster add me!!!!!!!!! my emai...,if u have a friendster add me!!!!!!!!! my emai...,neutral
198,931a866d3f,Had nicotine replacement patch on for 4 hours....,"So far, so good, but I did sleep for most of t...",neutral
199,7a718b23ef,_Sanderson What`s with Twatter lately? Either ...,What`s with Twatter lately? Either I can`t get...,neutral
...,...,...,...,...
27114,556d273874,http://tinyurl.com/cyonct vote for Rob,vote for Rob,neutral
27140,c28465b668,Google ... show me apples ... I only want to s...,Google ... show me apples ... I only want to s...,neutral
27301,6cf2428a33,I`m like so upset with you.. haha.. you never ...,I`m like so upset with you.. haha..,neutral
27332,3a9d357027,haha i see im so bored rite now.. it seems lik...,haha i see im so bored rite now.,neutral


It looks like links are often left out of the selected text.

------------MISC------------

In [None]:
def jaccard(str1, str2, debug = False):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.
        debug: When True, print both word sets and their intersection.

    Returns:
        A float between 0.0 and 1.0. When neither string contains any word
        the union is empty; the two (empty) word sets are identical, so 1.0
        is returned instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    if debug:
        print(a)
        print(b)
        print(c)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid dividing by zero.
        return 1.0
    return float(len(c)) / union_size

In [None]:
LINK_WORDS = ['http', 'www', '.com']


def is_link(x):
    """Return True if `x` contains any of the LINK_WORDS substrings."""
    return any(marker in x for marker in LINK_WORDS)

class Tokenizer:
    """Wrap a sub-word tokenizer and record where each token starts.

    The input is split on single spaces, each piece is handed to the wrapped
    tokenizer, and the character offset of every resulting token inside the
    tokenized string is collected alongside the tokens.
    """

    def __init__(self, bert_tokenizer):
        """Store the wrapped tokenizer.

        Args:
            bert_tokenizer: Object exposing ``tokenize(str)`` that returns a
                list of token strings.
        """
        self.bert_tokenizer = bert_tokenizer

    def tokenize(self, string, strip_link = False):
        """Tokenize `string` and map every token to its character offset.

        Args:
            string: Text to tokenize; words are taken to be separated by
                single spaces.
            strip_link: When True, the words are lower-cased, every word for
                which `is_link` is true is dropped, and the remaining
                lower-cased words are re-joined with single spaces before
                tokenizing. Offsets then refer to that rebuilt string.

        Returns:
            A tuple ``(all_tokens, index_map)``: the flat list of tokens and,
            for each token, the offset at which it starts in the tokenized
            string.
        """
        if strip_link:
            # Link detection runs on lower-cased words, and those lower-cased
            # words are also what gets tokenized (original behaviour, kept).
            lowered = [word.lower() for word in string.split(' ')]
            string = ' '.join(word for word in lowered if not is_link(word))

        all_tokens = []
        index_map = []
        word_start = 0
        for word in string.split(' '):
            pieces = self.bert_tokenizer.tokenize(word)
            offset = word_start
            for position, piece in enumerate(pieces):
                index_map.append(offset)
                # Assumes the wrapped tokenizer follows the BERT WordPiece
                # convention of prefixing non-initial sub-word pieces with
                # '##'; that marker is not part of the text, so it must not
                # advance the offset.
                is_continuation = (
                    position > 0 and piece.startswith('##') and len(piece) > 2
                )
                offset += len(piece) - 2 if is_continuation else len(piece)
            all_tokens.extend(pieces)
            # Re-anchor on the real word boundary (+1 for the separating
            # space) so that a token whose length differs from the text it
            # covers (e.g. '[UNK]') cannot shift the offsets of later words.
            word_start += len(word) + 1
        return all_tokens, index_map

# NOTE(review): `bert_tokenizer` is not defined in the cells visible here — confirm it is created before this cell runs.
tokenizer = Tokenizer(bert_tokenizer)