In [1]:
import glob
from multiprocessing import Pool
import sys
import nltk
import os
import re
import codecs
import preprocessor as p
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import SpanishStemmer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# !pip install tweet-preprocessor

In [3]:
# Combined English + Spanish stopword set; both cleaning functions filter against it.
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))

In [4]:
# Sanity check: number of distinct entries in the combined stopword set.
len(stop_words)

485

In [5]:
def _read_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    The file is decoded explicitly as UTF-8 so that accented characters and
    emoji are read the same way regardless of the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def _read_labels(path):
    """Return the integer label found on each line of the file at `path`."""
    return [int(line) for line in _read_lines(path)]


# English train / test split
en_train_texts = _read_lines("data/train/english_train.text")
en_train_labels = _read_labels('data/train/english_train.labels')
en_test_texts = _read_lines("data/test/english_test.text")
en_test_labels = _read_labels('data/test/english_test.labels')

# Spanish train / test split
sp_train_texts = _read_lines("data/train/spanish_train.text")
sp_train_labels = _read_labels('data/train/spanish_train.labels')
sp_test_texts = _read_lines("data/test/spanish_test.text")
sp_test_labels = _read_labels('data/test/spanish_test.labels')

In [6]:
# Configure the tweet-preprocessor module (imported as `p`) with the URL,
# SMILEY and MENTION options; the cleaning functions below call p.clean()
# to strip those elements from each tweet.
p.set_options(p.OPT.URL, p.OPT.SMILEY, p.OPT.MENTION)

In [7]:
# Snowball stemmer instance used by clean_text_en
get_stem_en = EnglishStemmer()

# Deletion table for str.translate(): every character in string.punctuation
# maps to None, so translating a string removes those characters.
translator = dict.fromkeys(map(ord, punctuation))

In [8]:
def clean_text_en(texts):
    """Clean and stem a sequence of English tweets.

    Each tweet goes through, in order: p.clean() (using the options set via
    p.set_options), lower-casing, stopword removal on whitespace-separated
    words, punctuation removal, and stemming of each tokenised word.

    Note: stopwords are filtered before punctuation is stripped, so a word with
    punctuation attached (e.g. "the,") is not matched against the stopword set.

    Parameters
    ----------
    texts : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input tweet.
    """
    def _clean_one(tweet):
        # strip tweet-specific elements, then normalise case
        lowered = p.clean(tweet).lower()
        # drop stopwords (matched on whitespace-separated words)
        kept = ' '.join(word for word in lowered.split() if word not in stop_words)
        # delete punctuation characters
        unpunctuated = kept.translate(translator)
        # stem every token
        return ' '.join(get_stem_en.stem(token) for token in word_tokenize(unpunctuated))

    return [_clean_one(tweet) for tweet in texts]

In [9]:
# Apply the English cleaning pipeline to both the training and the test tweets.
en_train_texts_cleaned = clean_text_en(en_train_texts)
en_test_texts_cleaned = clean_text_en(en_test_texts)

In [10]:
# Snowball stemmer instance used by clean_text_sp
get_stem_sp = SpanishStemmer()


def clean_text_sp(texts):
    """Clean and stem a sequence of Spanish tweets.

    Each tweet goes through, in order: p.clean() (using the options set via
    p.set_options), lower-casing, stopword removal on whitespace-separated
    words, punctuation removal, and stemming of each tokenised word with the
    Spanish stemmer.

    Note: stopwords are filtered before punctuation is stripped, so a word with
    punctuation attached (e.g. "de,") is not matched against the stopword set.

    Parameters
    ----------
    texts : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input tweet.
    """
    def _clean_one(tweet):
        # strip tweet-specific elements, then normalise case
        lowered = p.clean(tweet).lower()
        # drop stopwords (matched on whitespace-separated words)
        kept = ' '.join(word for word in lowered.split() if word not in stop_words)
        # delete punctuation characters
        unpunctuated = kept.translate(translator)
        # stem every token
        return ' '.join(get_stem_sp.stem(token) for token in word_tokenize(unpunctuated))

    return [_clean_one(tweet) for tweet in texts]

In [11]:
# Apply the Spanish cleaning pipeline to both the training and the test tweets.
sp_train_texts_cleaned = clean_text_sp(sp_train_texts)
sp_test_texts_cleaned = clean_text_sp(sp_test_texts)

In [12]:
# Persist the cleaned English training tweets as UTF-8, one tweet per line.
with codecs.open("en_train_texts_cleaned.txt", 'w', "utf-8") as out_fs:
    out_fs.writelines(cleaned + "\n" for cleaned in en_train_texts_cleaned)

In [13]:
# Persist the cleaned English test tweets as UTF-8, one tweet per line.
with codecs.open("en_test_texts_cleaned.txt", 'w', "utf-8") as out_fs:
    out_fs.writelines(cleaned + "\n" for cleaned in en_test_texts_cleaned)

In [14]:
# Persist the cleaned Spanish training tweets as UTF-8, one tweet per line.
with codecs.open("sp_train_texts_cleaned.txt", 'w', "utf-8") as out_fs:
    out_fs.writelines(cleaned + "\n" for cleaned in sp_train_texts_cleaned)

In [15]:
# Persist the cleaned Spanish test tweets as UTF-8, one tweet per line.
with codecs.open("sp_test_texts_cleaned.txt", 'w', "utf-8") as out_fs:
    out_fs.writelines(cleaned + "\n" for cleaned in sp_test_texts_cleaned)

# English
## Logistic Regression

In [16]:
# Reload the cleaned English training tweets. The file was written as UTF-8,
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open("en_train_texts_cleaned.txt", 'r', encoding="utf-8") as f:
    en_train_texts_cleaned_read = [l.strip() for l in f]

In [17]:
# Reload the cleaned English test tweets. The file was written as UTF-8,
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open("en_test_texts_cleaned.txt", 'r', encoding="utf-8") as f:
    en_test_texts_cleaned_read = [l.strip() for l in f]

In [18]:
# TF-IDF vectorizer with library defaults; the vocabulary is learned from the
# cleaned English training tweets only.
tf = TfidfVectorizer()
en_train_tf = tf.fit_transform(en_train_texts_cleaned_read)

In [19]:
# Baseline classifier: logistic regression with library-default hyperparameters,
# trained on the English TF-IDF features.
LR = LogisticRegression()
LR.fit(en_train_tf, en_train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Vectorise the test tweets with the vocabulary already fitted on the training set
# (transform only, no re-fitting).
en_test_tf = tf.transform(en_test_texts_cleaned_read)

In [21]:
# Predict a label for every English test tweet.
en_pred = LR.predict(en_test_tf)

In [22]:
# Write the predicted labels as integers to the file passed to the scorer script.
np.savetxt('en_predicted_labels_file.txt', en_pred, fmt='%d')

In [23]:
# Run scorer_semeval18.py with the gold test labels and the predictions file;
# its printed metrics appear as this cell's output.
%run scorer_semeval18.py data/test/english_test.labels en_predicted_labels_file.txt

Macro F-Score (official): 19.946
-----
Micro F-Score: 32.7
Precision: 32.7
Recall: 32.7


### Try a bigram TF-IDF vectorizer

In [24]:
tf2 = TfidfVectorizer(ngram_range=(-2,2))
en_train_tf = tf2.fit_transform(en_train_texts_cleaned_read)
en_test_tf = tf2.transform(en_test_texts_cleaned_read)

LR.fit(en_train_tf, en_train_labels)
en_pred = LR.predict(en_test_tf)
np.savetxt('en_predicted_labels_file.txt', en_pred, fmt='%d')
%run scorer_semeval18.py data/test/english_test.labels en_predicted_labels_file.txt



Macro F-Score (official): 17.631
-----
Micro F-Score: 32.09
Precision: 32.09
Recall: 32.09


### Try different values of the regularisation parameter C for Logistic Regression

In [25]:
# Switch back to the default `tf` vectorizer: re-fit it on the training tweets and
# overwrite en_train_tf / en_test_tf, which the previous cell had set from tf2.
en_train_tf = tf.fit_transform(en_train_texts_cleaned_read)
en_test_tf = tf.transform(en_test_texts_cleaned_read)

In [26]:
for c in range(2,11):
    print("C: " , c )
    LR = LogisticRegression(C=c)
    LR.fit(en_train_tf, en_train_labels)
    en_pred = LR.predict(en_test_tf)
    np.savetxt('predicted_labels_file.txt', en_pred, fmt='%d')
    np.savetxt('gold_labels_file.txt', np.array(en_test_labels), fmt='%s')
    %run scorer_semeval18.py gold_labels_file.txt predicted_labels_file.txt
    print()


C:  2




Macro F-Score (official): 21.004
-----
Micro F-Score: 32.75
Precision: 32.75
Recall: 32.75

C:  3
Macro F-Score (official): 21.374
-----
Micro F-Score: 32.2
Precision: 32.2
Recall: 32.2

C:  4
Macro F-Score (official): 21.728
-----
Micro F-Score: 31.97
Precision: 31.97
Recall: 31.97

C:  5
Macro F-Score (official): 21.748
-----
Micro F-Score: 31.54
Precision: 31.54
Recall: 31.54

C:  6
Macro F-Score (official): 21.708
-----
Micro F-Score: 31.14
Precision: 31.14
Recall: 31.14

C:  7
Macro F-Score (official): 21.663
-----
Micro F-Score: 30.86
Precision: 30.86
Recall: 30.86

C:  8
Macro F-Score (official): 21.519
-----
Micro F-Score: 30.57
Precision: 30.57
Recall: 30.57

C:  9
Macro F-Score (official): 21.456
-----
Micro F-Score: 30.31
Precision: 30.31
Recall: 30.31

C:  10
Macro F-Score (official): 21.328
-----
Micro F-Score: 30.03
Precision: 30.03
Recall: 30.03



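The highest Macro F-Score occurs at C = 5. At C = 2 every score improves over the default C = 1 run; for larger C the micro-averaged scores (Micro F-Score, precision, recall) decline steadily, while the Macro F-Score keeps rising until C = 5 and then starts to drop as well.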

# Spanish Prediction

In [27]:
# Reload the cleaned Spanish training tweets. The file was written as UTF-8,
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open("sp_train_texts_cleaned.txt", 'r', encoding="utf-8") as f:
    sp_train_texts_cleaned_read = [l.strip() for l in f]

In [28]:
# Reload the cleaned Spanish test tweets. The file was written as UTF-8,
# so decode it as UTF-8 explicitly instead of relying on the platform default.
with open("sp_test_texts_cleaned.txt", 'r', encoding="utf-8") as f:
    sp_test_texts_cleaned_read = [l.strip() for l in f]

In [29]:
# Reuses the `tf2` vectorizer object created in the English section, so the Spanish
# features inherit whatever ngram_range it was constructed with. fit_transform
# re-fits it on the Spanish training tweets; the test tweets are transformed only.
sp_train_tf = tf2.fit_transform(sp_train_texts_cleaned_read)
sp_test_tf = tf2.transform(sp_test_texts_cleaned_read)

In [30]:
for c in range(1,11):
    print("C: " , c )
    LR = LogisticRegression(C=c)
    LR.fit(sp_train_tf, sp_train_labels)
    sp_pred = LR.predict(sp_test_tf)
    np.savetxt('predicted_labels_file.txt', sp_pred, fmt='%d')
    np.savetxt('gold_labels_file.txt', np.array(sp_test_labels), fmt='%s')
    %run scorer_semeval18.py gold_labels_file.txt predicted_labels_file.txt
    print()

C:  1




Macro F-Score (official): 7.685
-----
Micro F-Score: 29.2
Precision: 29.2
Recall: 29.2

C:  2
Macro F-Score (official): 9.126
-----
Micro F-Score: 29.5
Precision: 29.5
Recall: 29.5

C:  3
Macro F-Score (official): 11.244
-----
Micro F-Score: 30.1
Precision: 30.1
Recall: 30.1

C:  4
Macro F-Score (official): 11.432
-----
Micro F-Score: 29.8
Precision: 29.8
Recall: 29.8

C:  5
Macro F-Score (official): 12.116
-----
Micro F-Score: 29.7
Precision: 29.7
Recall: 29.7

C:  6
Macro F-Score (official): 12.16
-----
Micro F-Score: 29.4
Precision: 29.4
Recall: 29.4

C:  7
Macro F-Score (official): 12.745
-----
Micro F-Score: 29.7
Precision: 29.7
Recall: 29.7

C:  8
Macro F-Score (official): 12.749
-----
Micro F-Score: 29.4
Precision: 29.4
Recall: 29.4

C:  9
Macro F-Score (official): 12.904
-----
Micro F-Score: 29.5
Precision: 29.5
Recall: 29.5

C:  10
Macro F-Score (official): 12.905
-----
Micro F-Score: 29.6
Precision: 29.6
Recall: 29.6



## Try SVM

In [31]:
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

In [32]:
# Linear-kernel support vector classifier with C=5 and a fixed random_state;
# the same estimator object is fitted first on English, then on Spanish below.
svm = SVC(kernel="linear", C=5,random_state=14)

### English Prediction SVM

In [33]:
# Train the SVM on the English TF-IDF features currently held in en_train_tf
# and predict labels for the English test features.
svm.fit(en_train_tf, en_train_labels)
en_pred_svm = svm.predict(en_test_tf)

In [34]:
# Write the SVM predictions and the English gold labels to the two files the
# scorer script is invoked with, then run the scorer.
np.savetxt('predicted_labels_file.txt', en_pred_svm, fmt='%d')
np.savetxt('gold_labels_file.txt', np.array(en_test_labels), fmt='%s')
%run scorer_semeval18.py gold_labels_file.txt predicted_labels_file.txt

Macro F-Score (official): 20.744
-----
Micro F-Score: 29.46
Precision: 29.46
Recall: 29.46


### Spanish Prediction SVM

In [35]:
# Re-fit the same SVM object on the Spanish TF-IDF features (this replaces the
# English fit) and predict labels for the Spanish test features.
svm.fit(sp_train_tf, sp_train_labels)
sp_pred_svm = svm.predict(sp_test_tf)

In [36]:
# Write the SVM predictions and the Spanish gold labels to the two files the
# scorer script is invoked with, then run the scorer.
np.savetxt('predicted_labels_file.txt', sp_pred_svm, fmt='%d')
np.savetxt('gold_labels_file.txt', np.array(sp_test_labels), fmt='%s')
%run scorer_semeval18.py gold_labels_file.txt predicted_labels_file.txt

Macro F-Score (official): 12.612
-----
Micro F-Score: 28.8
Precision: 28.8
Recall: 28.8
