In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
from ftfy import fix_text
import re
import os
from gensim.parsing.preprocessing import remove_stopwords
from gensim import downloader as api
from gensim.models import FastText
import warnings
import logging

In [2]:
pd.options.display.max_colwidth = None
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
    flags=re.IGNORECASE | re.UNICODE,
)

PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"
)

MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")

URL_REGEX = re.compile(
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:\/[^\)\]\}\s]*)?",
    # r"(?:$|(?![\w?!+&\/\)]))",
    # @jfilter: I removed the line above from the regex because I don't understand what it is used for, maybe it was useful?
    # But I made sure that it does not include ), ] and } in the URL.
    flags=re.UNICODE | re.IGNORECASE,
)

strange_double_quotes = [
    "«",
    "‹",
    "»",
    "›",
    "„",
    "“",
    "‟",
    "”",
    "❝",
    "❞",
    "❮",
    "❯",
    "〝",
    "〞",
    "〟",
    "＂",
]
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]

DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
SINGLE_QUOTE_REGEX = re.compile("|".join(strange_single_quotes))
HASHTAG_REGEX = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
MENTION_REGEX = re.compile("(?:^|\s)[＠ @]{1}([^\s#<>[\]|{}]+)", re.UNICODE)

In [4]:
def fix_strange_quotes(text):
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    return text

def normalize_whitespace(text):
    text = MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
    return text.strip()

def replace_urls(text, replace_with="<URL>"):
    return URL_REGEX.sub(replace_with, text)

def replace_emails(text, replace_with="<EMAIL>"):
    return EMAIL_REGEX.sub(replace_with, text)

def replace_phone_numbers(text, replace_with="<PHONE>"):
    return PHONE_REGEX.sub(replace_with, text)

def replace_hashtag(text, replace_with=''):
    return HASHTAG_REGEX.sub(replace_with, text)

def replace_mentions(text, replace_with=''):
    return MENTION_REGEX.sub(replace_with, text)

def clean_text(text):
    text = str(text)
    text = fix_text(text)
    text = fix_strange_quotes(text)
    text = replace_urls(text, replace_with='')
    text = replace_emails(text, replace_with='')
    text = replace_phone_numbers(text, replace_with='')
    text = replace_hashtag(text)
    text = replace_mentions(text)
    text = remove_stopwords(text)
    text = normalize_whitespace(text)
    return text.lower()

In [5]:
tweet_data_paths = [
    Path('../../full-corpus.csv'), 
    Path('../../TextEmotion.csv'), 
    Path('../data/test.csv'),
    Path('../data/train.csv')
]
data_columns = [
    'TweetText',
    'content',
    'text',
    'text'
]

In [6]:
processed_tweets = []
for i, data_path in enumerate(tqdm(tweet_data_paths)):
    df: pd.DataFrame = pd.read_csv(data_path)
    df.dropna(inplace=True)
    tqdm.pandas(desc=f'Cleaning {data_path.name}')
    df[data_columns[i]] = df[data_columns[i]].progress_apply(lambda x: clean_text(x))
    processed_tweets += df[data_columns[i]].tolist()
processed_tweets = [x.split() for x in processed_tweets]

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Cleaning full-corpus.csv', max=5113.0, style=ProgressStyl…




HBox(children=(FloatProgress(value=0.0, description='Cleaning TextEmotion.csv', max=40000.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Cleaning test.csv', max=3534.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Cleaning train.csv', max=21983.0, style=ProgressStyle(des…





In [7]:
print(processed_tweets[:5])

[['now', 'swype', 'iphone', 'crack.', 'iphone'], ['adding', 'carrier', 'support', 'iphone', '4s', '(just', 'announced)'], ['hilarious', 'video', '-', 'guy', 'duet', "'s", 'siri.', 'pretty', 'sums', 'love', 'affair!'], ['easy', 'switch', 'iphone.', 'see', 'ya!'], ['i', 'realized', 'reason', 'i', 'got', 'twitter', 'ios5', 'thanks']]


In [8]:
# train on text8
dataset = api.load('text8')
ft_model = FastText(sg=1, min_count=3, size=200, min_n=3, max_n=5, iter=10, workers=os.cpu_count(),
                   seed=42, max_vocab_size=100000, bucket=10000)
ft_model.build_vocab(dataset)
total_words = ft_model.corpus_total_words
ft_model.train(dataset, total_words=total_words, epochs=ft_model.epochs)

2020-06-06 15:42:27,297 : INFO : resetting layer weights
2020-06-06 15:42:27,351 : INFO : collecting all words and their counts
2020-06-06 15:42:27,354 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-06 15:42:28,168 : INFO : pruned out 0 tokens with count <=1 (before 100085, after 100085)
2020-06-06 15:42:28,200 : INFO : pruned out 46002 tokens with count <=2 (before 100302, after 54300)
2020-06-06 15:42:28,858 : INFO : pruned out 46902 tokens with count <=3 (before 100081, after 53179)
2020-06-06 15:42:29,718 : INFO : pruned out 48298 tokens with count <=4 (before 100011, after 51713)
2020-06-06 15:42:30,411 : INFO : pruned out 49635 tokens with count <=5 (before 100084, after 50449)
2020-06-06 15:42:31,353 : INFO : pruned out 50345 tokens with count <=6 (before 100150, after 49805)
2020-06-06 15:42:32,080 : INFO : pruned out 50651 tokens with count <=7 (before 100024, after 49373)
2020-06-06 15:42:32,299 : INFO : collected 67199 word types from a co

In [9]:
save_path = Path('../embeddings/fasttext/twitter/twitter_ft.model')

In [18]:
# train on twitter corpus
ft_model = FastText.load(str(save_path))
ft_model.build_vocab(processed_tweets, update=True)
total_words = ft_model.corpus_total_words
ft_model.train(processed_tweets, total_words=total_words, epochs=ft_model.epochs)

2020-06-06 15:57:43,928 : INFO : loading FastText object from ..\embeddings\fasttext\twitter\twitter_ft.model
2020-06-06 15:57:45,070 : INFO : loading wv recursively from ..\embeddings\fasttext\twitter\twitter_ft.model.wv.* with mmap=None
2020-06-06 15:57:45,070 : INFO : setting ignored attribute vectors_norm to None
2020-06-06 15:57:45,071 : INFO : setting ignored attribute vectors_vocab_norm to None
2020-06-06 15:57:45,071 : INFO : setting ignored attribute vectors_ngrams_norm to None
2020-06-06 15:57:45,072 : INFO : setting ignored attribute buckets_word to None
2020-06-06 15:57:45,072 : INFO : loading vocabulary recursively from ..\embeddings\fasttext\twitter\twitter_ft.model.vocabulary.* with mmap=None
2020-06-06 15:57:45,073 : INFO : loading trainables recursively from ..\embeddings\fasttext\twitter\twitter_ft.model.trainables.* with mmap=None
2020-06-06 15:57:45,073 : INFO : loaded ..\embeddings\fasttext\twitter\twitter_ft.model
2020-06-06 15:57:45,179 : INFO : collecting all wo

In [19]:
ft_model.save(str(save_path))

2020-06-06 15:58:11,225 : INFO : saving FastText object under ..\embeddings\fasttext\twitter\twitter_ft.model, separately None
2020-06-06 15:58:11,226 : INFO : storing np array 'vectors' to ..\embeddings\fasttext\twitter\twitter_ft.model.wv.vectors.npy
2020-06-06 15:58:11,264 : INFO : storing np array 'vectors_vocab' to ..\embeddings\fasttext\twitter\twitter_ft.model.wv.vectors_vocab.npy
2020-06-06 15:58:11,306 : INFO : not storing attribute vectors_norm
2020-06-06 15:58:11,307 : INFO : not storing attribute vectors_vocab_norm
2020-06-06 15:58:11,307 : INFO : not storing attribute vectors_ngrams_norm
2020-06-06 15:58:11,308 : INFO : not storing attribute buckets_word
2020-06-06 15:58:11,308 : INFO : storing np array 'syn1neg' to ..\embeddings\fasttext\twitter\twitter_ft.model.trainables.syn1neg.npy
2020-06-06 15:58:11,344 : INFO : storing np array 'vectors_vocab_lockf' to ..\embeddings\fasttext\twitter\twitter_ft.model.trainables.vectors_vocab_lockf.npy
2020-06-06 15:58:11,588 : INFO :

In [29]:
ft_model.wv.most_similar('runnn!!')

[('runners', 0.604508638381958),
 ('runny', 0.5834609270095825),
 ('runner', 0.5717713236808777),
 ('runnin', 0.5536907911300659),
 ('running', 0.5509806871414185),
 ('running!', 0.5410335063934326),
 ('run.', 0.5375546813011169),
 ('run', 0.53485506772995),
 ('videoconferencing', 0.5336881875991821),
 ('forerunners', 0.532096266746521)]