In [1]:
import numpy as np
import preprocessor as p
import re
import codecs
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import SpanishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from translate import Translator

In [2]:
# !pip install translate
# !pip install tweet-preprocessor

## Load Data

In [3]:
def _read_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # this notebook writes its cleaned text as UTF-8, and a locale-dependent
    # default can fail on non-ASCII characters.
    # NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        return [l.strip() for l in f]


def _read_labels(path):
    """Return the integer found on each line of the file at `path`."""
    return [int(l) for l in _read_lines(path)]


# English train / test splits.
en_train_texts = _read_lines("data/train/english_train.text")
en_train_labels = _read_labels('data/train/english_train.labels')
en_test_texts = _read_lines("data/test/english_test.text")
en_test_labels = _read_labels('data/test/english_test.labels')

# Spanish train / test splits.
sp_train_texts = _read_lines("data/train/spanish_train.text")
sp_train_labels = _read_labels('data/train/spanish_train.labels')
sp_test_texts = _read_lines("data/test/spanish_test.text")
sp_test_labels = _read_labels('data/test/spanish_test.labels')

# Mapping files, one entry per line.
en_mapping = _read_lines("data/mapping/english_mapping.txt")
sp_mapping = _read_lines("data/mapping/spanish_mapping.txt")

### Remove emoji that appear only in English or only in Spanish

In [4]:
# Label ids to drop from the Spanish and the English data respectively;
# each list is passed as `remove_ls` to remove_unmatched_label.
sp_remove_ls = [
    6, 7, 8,
    14, 16, 17,
]
en_remove_ls = [
    4, 10, 12, 14,
    15, 17, 18,
]

In [5]:
def remove_unmatched_label(text, label, remove_ls):
    """Drop every sample whose label appears in `remove_ls`.

    Args:
        text: sequence of samples, aligned by position with `label`.
        label: sequence of labels, one per sample.
        remove_ls: iterable of label values whose samples must be removed.

    Returns:
        A `(texts_removed, labels_removed)` tuple of lists holding the
        surviving samples and labels in their original order.
    """
    # Sets make both membership tests O(1); the original tested each position
    # against a list of indices, which is quadratic in the number of samples.
    labels_to_drop = set(remove_ls)
    index_to_drop = {
        index for index, value in enumerate(label) if value in labels_to_drop
    }

    texts_removed = [t for index, t in enumerate(text) if index not in index_to_drop]
    labels_removed = [l for index, l in enumerate(label) if index not in index_to_drop]

    return texts_removed, labels_removed

In [6]:
# Filter the English train and test splits with the labels in en_remove_ls.
en_train_texts_removed, en_train_labels_removed = remove_unmatched_label(
    en_train_texts,
    en_train_labels,
    en_remove_ls,
)
en_test_texts_removed, en_test_labels_removed = remove_unmatched_label(
    en_test_texts,
    en_test_labels,
    en_remove_ls,
)

In [7]:
# Filter the Spanish train and test splits with the labels in sp_remove_ls.
sp_train_texts_removed, sp_train_labels_removed = remove_unmatched_label(
    sp_train_texts,
    sp_train_labels,
    sp_remove_ls,
)
sp_test_texts_removed, sp_test_labels_removed = remove_unmatched_label(
    sp_test_texts,
    sp_test_labels,
    sp_remove_ls,
)

### Remap Spanish emoji label indices to the matching English indices

In [8]:
def rematched_sp_labels(sp_labels):
    """Translate Spanish label ids into the English label numbering.

    Ids listed in the remapping table are replaced by their target value;
    every other id is returned unchanged.

    Args:
        sp_labels: iterable of Spanish label ids.

    Returns:
        A new list with the remapped ids, in the same order.
    """
    # source id -> target id. No target is itself remapped by a later step of
    # the original sequential rewrite, so one simultaneous lookup is equivalent.
    remap = {
        11: 8,
        13: 19,
        15: 7,
        18: 16,
        12: 13,
        9: 11,
        5: 9,
    }
    return [remap.get(old_id, old_id) for old_id in sp_labels]

In [9]:
# Convert the filtered Spanish labels (train, then test) with rematched_sp_labels.
sp_train_labels_removed_rematched = rematched_sp_labels(
    sp_train_labels_removed
)
sp_test_labels_removed_rematched = rematched_sp_labels(
    sp_test_labels_removed
)

In [10]:
# Restrict p.clean() to the URL, SMILEY and MENTION options.
p.set_options(p.OPT.URL, p.OPT.SMILEY, p.OPT.MENTION)

# Translation table that deletes every character in string.punctuation.
translator = str.maketrans("", "", punctuation)

# Stemmer used for English text.
get_stem_en = EnglishStemmer()

# Union of the NLTK English and Spanish stop-word lists.
stop_words = set(stopwords.words('english')) | set(stopwords.words('spanish'))

In [11]:
def clean_text_en(texts):
    """Clean each text and reduce its words to English stems.

    Per text, in order: run `p.clean`, lower-case, drop whitespace-separated
    tokens found in `stop_words`, delete punctuation characters, tokenize with
    `word_tokenize` and stem each token with `get_stem_en`.

    Args:
        texts: iterable of raw text strings.

    Returns:
        A list with one space-joined string of stems per input text.
    """
    def _clean_one(raw):
        # Remove URL links, smileys and @user mentions, then lower-case.
        lowered = p.clean(raw).lower()
        # Stop words are filtered before punctuation is stripped.
        kept = ' '.join([tok for tok in lowered.split() if tok not in stop_words])
        without_punct = kept.translate(translator)
        stems = [get_stem_en.stem(tok) for tok in word_tokenize(without_punct)]
        return ' '.join(stems)

    return [_clean_one(raw) for raw in texts]

In [29]:
# Clean the filtered English train and test texts.
en_train_texts_rm_cleaned = clean_text_en(
    en_train_texts_removed
)
en_test_texts_rm_cleaned = clean_text_en(
    en_test_texts_removed
)

In [30]:
# Persist the cleaned English texts as UTF-8, one text per line.
for out_path, cleaned_texts in (
    ("en_train_texts_removed_cleaned.txt", en_train_texts_rm_cleaned),
    ("en_test_texts_removed_cleaned.txt", en_test_texts_rm_cleaned),
):
    with codecs.open(out_path, 'w', "utf-8") as out_fs:
        for each in cleaned_texts:
            out_fs.write(each + "\n")

In [14]:
# Stemmer instance used by clean_text_sp for Spanish text.
get_stem_sp = SpanishStemmer()

In [15]:
def clean_text_sp(texts):
    """Clean each text and reduce its words to Spanish stems.

    Per text, in order: run `p.clean`, lower-case, drop whitespace-separated
    tokens found in `stop_words`, delete punctuation characters, tokenize with
    `word_tokenize` and stem each token with `get_stem_sp`.

    Args:
        texts: iterable of raw text strings.

    Returns:
        A list with one space-joined string of stems per input text.
    """
    def _clean_one(raw):
        # Remove URL links, smileys and @user mentions, then lower-case.
        lowered = p.clean(raw).lower()
        # Stop words are filtered before punctuation is stripped.
        kept = ' '.join([tok for tok in lowered.split() if tok not in stop_words])
        without_punct = kept.translate(translator)
        stems = [get_stem_sp.stem(tok) for tok in word_tokenize(without_punct)]
        return ' '.join(stems)

    return [_clean_one(raw) for raw in texts]

In [16]:
# Clean the filtered Spanish train and test texts.
sp_train_texts_rm_cleaned = clean_text_sp(
    sp_train_texts_removed
)
sp_test_texts_rm_cleaned = clean_text_sp(
    sp_test_texts_removed
)

In [17]:
# Persist the cleaned Spanish texts as UTF-8, one text per line.
for out_path, cleaned_texts in (
    ("sp_train_texts_removed_cleaned.txt", sp_train_texts_rm_cleaned),
    ("sp_test_texts_removed_cleaned.txt", sp_test_texts_rm_cleaned),
):
    with codecs.open(out_path, 'w', "utf-8") as out_fs:
        for each in cleaned_texts:
            out_fs.write(each + "\n")

# Spanish to English Translation

In [18]:
# Translator object used by sp_to_en (Spanish source, English target).
sp_to_en_translator = Translator(
    from_lang="spanish",
    to_lang="english",
)

# Translation table that deletes every character in string.punctuation.
trans = str.maketrans("", "", punctuation)

In [19]:
def sp_to_en(texts):
    """Translate Spanish texts to English and reduce them to English stems.

    Per text, in order: run `p.clean`, delete punctuation characters, translate
    with `sp_to_en_translator`, lower-case, drop whitespace-separated tokens
    found in the NLTK English stop-word list, tokenize with `word_tokenize`
    and stem each token with `get_stem_en`.

    Args:
        texts: iterable of Spanish text strings.

    Returns:
        A list with one space-joined string of English stems per input text.
    """
    # Build the stop-word set once. The original evaluated
    # stopwords.words('english') for every single token and searched the
    # resulting list linearly.
    english_stop_words = set(stopwords.words('english'))

    result = []
    for txt in texts:
        # remove URL links, Smiley, and @user
        txt = p.clean(txt)
        # remove punctuation
        txt = txt.translate(trans)

        # Spanish to English translation
        txt = sp_to_en_translator.translate(txt)
        # make everything lower case
        txt = txt.lower()
        # remove stopwords
        txt = ' '.join([i for i in txt.split() if i not in english_stop_words])
        # change every word to stem word
        stems = [get_stem_en.stem(i) for i in word_tokenize(txt)]
        result.append(' '.join(stems))

    return result

In [20]:
# Reload the cleaned Spanish texts. They are written by this notebook as
# UTF-8, so decode them explicitly as UTF-8 rather than with the platform
# default encoding.
with open("sp_train_texts_removed_cleaned.txt", 'r', encoding='utf-8') as f:
    sp_train_texts_rm_cleaned_read = [l.strip() for l in f]

with open("sp_test_texts_removed_cleaned.txt", 'r', encoding='utf-8') as f:
    sp_test_texts_rm_cleaned_read = [l.strip() for l in f]

In [21]:
# Translate the reloaded, already-cleaned Spanish texts into English stems.
en_train_from_sp = sp_to_en(
    sp_train_texts_rm_cleaned_read
)
en_test_from_sp = sp_to_en(
    sp_test_texts_rm_cleaned_read
)

In [31]:
# Reload the cleaned English texts. They are written by this notebook as
# UTF-8, so decode them explicitly as UTF-8 rather than with the platform
# default encoding.
with open("en_train_texts_removed_cleaned.txt", 'r', encoding='utf-8') as f:
    en_train_texts_rm_cleaned_read = [l.strip() for l in f]

with open("en_test_texts_removed_cleaned.txt", 'r', encoding='utf-8') as f:
    en_test_texts_rm_cleaned_read = [l.strip() for l in f]

In [32]:
# Combined corpus: native English texts followed by the translated Spanish ones.
en_train_text_new = [*en_train_texts_rm_cleaned_read, *en_train_from_sp]
en_test_text_new = [*en_test_texts_rm_cleaned_read, *en_test_from_sp]

In [33]:
# Combined labels, in the same order as the combined texts:
# English labels first, then the remapped Spanish labels.
en_train_labels_new = [*en_train_labels_removed, *sp_train_labels_removed_rematched]
en_test_labels_new = [*en_test_labels_removed, *sp_test_labels_removed_rematched]

In [36]:
# Sizes of the combined train/test texts, then of the combined train/test labels.
print(
    len(en_train_text_new),
    len(en_test_text_new),
    len(en_train_labels_new),
    len(en_test_labels_new),
    sep="\n",
)

84817
8288
84817
8288


In [39]:
# Fit the TF-IDF vectorizer on the combined training texts only, then reuse
# the fitted vectorizer to transform the combined test texts.
tf = TfidfVectorizer()
en_train_tf = tf.fit_transform(en_train_text_new)
en_test_tf = tf.transform(en_test_text_new)

In [40]:
# Train a logistic-regression classifier (C=5) on the TF-IDF training matrix.
LR = LogisticRegression(C=5)
LR.fit(en_train_tf, en_train_labels_new)
    
# Predict a label for every row of the TF-IDF test matrix.
en_pred = LR.predict(en_test_tf)
    
# Write predictions and gold labels as one integer per line, then pass both
# files to scorer_semeval18.py (gold file first, predictions second).
np.savetxt('predicted_labels_file.txt', en_pred, fmt='%d')
np.savetxt('gold_labels_file.txt', np.array(en_test_labels_new), fmt='%d')
%run scorer_semeval18.py gold_labels_file.txt predicted_labels_file.txt
print()



Macro F-Score (official): 18.05
-----
Micro F-Score: 34.809
Precision: 34.809
Recall: 34.809



In [37]:
# Sizes of the predictions, the test matrix, the test texts and the test labels.
# NOTE(review): the output saved with this cell shows 69835 predictions against
# 8288 test texts/labels, and its execution count (37) precedes the TF-IDF and
# training cells (39, 40) -- it looks like stale kernel state; re-check after
# Restart Kernel -> Run All.
print(
    len(en_pred),
    en_test_tf.shape,
    len(en_test_text_new),
    len(en_test_labels_new),
    sep="\n",
)

69835
(69835, 70662)
8288
8288
