# Model

### Read data

In [29]:
import numpy as np
import pandas as pd
import re
%matplotlib inline
import matplotlib.pyplot as plt

# Read original csv file
df0 = pd.read_csv('tweets_public2.csv', encoding='utf-16', index_col='tweet_id', sep=',')
#df0.count()

# Keep relevant columns only
df = df0.drop(['retweet_count','is_reply','reply_count','tweet_coord','tweet_location','user_timezone'], axis=1)
df.head()

Unnamed: 0_level_0,airline_sentiment,text,tweet_created,newairline_sentiment
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
942743012337123328,positive,"""Los pilotos de Ryanair desconvocan la huelga ...",Mon Dec 18 13:07:04 +0000 2017,positive
926857871916183553,positive,"@Iberia @lavecinarubia Si ,por favor las decla...",Sat Nov 04 17:05:11 +0000 2017,positive
936979305720090626,neutral,@Iberia Me dirías por favor que costo tiene?,Sat Dec 02 15:24:09 +0000 2017,neutral
943983853802328064,negative,"@SupermanlopezN @Iberia @giroditalia Champion,...",Thu Dec 21 23:17:43 +0000 2017,negative
938207464457211904,negative,@SrtaFarrellDM @KLM @Iberia Eso de avianca es ...,Wed Dec 06 00:44:25 +0000 2017,negative


### Extract time information

In [30]:
def get_time_info(df):
    # Time information transformed into hour of the day + day of the week one-hot encoding matrices
    tweet_created = pd.to_datetime(df['tweet_created'])
    hoursday = tweet_created.dt.hour
    weekday = tweet_created.dt.dayofweek

    # One-hot encoding for 'hoursday' and 'weekday'
    hour1hot = pd.get_dummies(hoursday,prefix='h',columns=list(range(24)))
    weekday1hot = pd.get_dummies(weekday,prefix='wd',columns=list(range(7)))
    return hour1hot

### Extract Emoji

In [31]:
# Extract emojis from text
from emoji import UNICODE_EMOJI #pip install emoji

def emoji_shortname(patterns,thelist):
    elist = []
    for pattern in patterns:
        for emoji in thelist:
            match = re.search(pattern,emoji)
            if match: elist.append(emoji)
    return elist

def get_emoji_img(mydict,emojiShortname):
    ## Extract unicode emoji images based on the selected shortnames
    return list(mydict.keys())[list(mydict.values()).index(emojiShortname)]

def in_emoji(tweet,emojilist):
    emo = 0
    inemoji = []
    for emoji in emojilist:
        match = tweet.find(emoji)
        if match != -1: inemoji.append(emoji)
    if(len(inemoji) >=1): emo = 1
    return emo 

def get_emoji(df):
    # List of UNICODE emojis
    unicodeemojilist = list(UNICODE_EMOJI.values())

    # Regular expressions to cast negative emoji in the list of Unicode Emoji
    negative_patterns = [re.compile(r'.*frown.*(face)'),re.compile(r'confounded'),re.compile(r'disappoint'),
                     re.compile(r'worried'),re.compile(r'crying'),re.compile(r'\bang[er][ry]'),re.compile(r'fear'),
                     re.compile(r'weary'),re.compile(r'exploding_face'),re.compile(r'grimacing'),
                     re.compile(r'face_with_steam_from_nose'),re.compile(r'pouting_face'),re.compile(r'sleepy_face'),
                     re.compile(r'downcast_face_with_sweat'),re.compile(r'unamused_face'),
                     re.compile(r'see-no-evil_monkey'),re.compile(r'pensive_face'),re.compile(r'persevering_face'),
                     re.compile(r'anxi'),re.compile(r'scream'),re.compile(r'hot_face'),re.compile(r'flushed'),
                     re.compile(r'zany_face'),re.compile(r'dizzy.*(face)'),re.compile(r'face_with_symbols_on_mouth'),
                     re.compile(r'thumbs_down:'),re.compile(r'middle_finger:'),re.compile(r'broken_heart')]
    # Regular expressions to cast positive emoji in the list of Unicode Emoji
    positive_patterns = [re.compile(r'grin'),re.compile(r'joy'),re.compile(r'smil'),re.compile(r'kiss:'),re.compile(r'wink'),
           re.compile(r'savoring_food'),re.compile(r'[^broken|couple_with]\wheart'),re.compile(r'thumbs_up:'),
           re.compile(r'OK_hand:'),re.compile(r'clapping_hands:'),re.compile(r'waving_hand:'),
           re.compile(r'raised_hand:'),re.compile(r':relieved_face')]    
                
    # List of negative/positive emojis shortnames
    negative_emoji_shortname = emoji_shortname(negative_patterns,unicodeemojilist)
    positive_emoji_shortname = emoji_shortname(positive_patterns,unicodeemojilist)

    # List of negative/positive emojis
    nlist = []
    for emoji in negative_emoji_shortname:
        nlist.append(get_emoji_img(UNICODE_EMOJI,emoji)) 
    plist = []
    for emoji in positive_emoji_shortname:
        plist.append(get_emoji_img(UNICODE_EMOJI,emoji)) 
    
    # Save the number of positive and negative emojis in two new columns
    emopos = df['text'].apply(in_emoji,emojilist=plist)
    emoneg = df['text'].apply(in_emoji,emojilist=nlist)
    
    emoji1hot = pd.DataFrame({'emopos':emopos,'emoneg':emoneg})
    #emoji1hot.emopos.sum()
    return emoji1hot

### Clean Text and Extract Airlines

In [32]:
import unidecode #pip install unidecode
import string

def basicCleaning(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Delete URLs www.* or https?://*
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    # Remove accents
    tweet = unidecode.unidecode(tweet)
    #Delete via, rt and by
    tweet = re.sub(r'\b(rt|via|by)\b','',tweet)
    # Remove numbers
    tweet = re.sub(r'\d+','',tweet)
    # Remove single characters
    tweet = re.sub(r'\b\w\b','',tweet)
    # Remove email addresses
    tweet = re.sub(r'\w*@\w*','',tweet)
    #Remove additional white spaces
    tweet = re.sub('\s+', ' ', tweet)
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    return tweet

def moreCleaning(tweet):
    #Delete @usernames
    #tweet = re.sub('@[^\s]+','',tweet)
    #Delete #hashtags
    tweet = re.sub('#[^\s]+','',tweet)
    # Remove punctuation (includes @, \ and #)
    regex = re.compile('[%s]' % re.escape(string.punctuation+u"¡"+u"¿"+u"€"))
    tweet = re.sub(regex,'',tweet)
    #Remove additional white spaces
    tweet = re.sub('\s+', ' ', tweet)
    return tweet

def extract_airline(tweet):
    airlinesList = ['aena','aeromar','aeromexico','aireuropa','airmadrid','airnostrum','americanairlines',
         'avianca','blueair','britishairways','easyjet','emirates','iberia','klm',
         'lufthansa','niki','norwegian','qatar','ryanair','spanair','spiritairlines',
         'tame','vasp','vueling','westjet','wizzair']
    patterns = [re.compile(r'aena'),re.compile(r'aeromar?'),
            re.compile(r'aeromexi?c?o?'),re.compile(r'air\s?europ?a?'),
            re.compile(r'airmadr?i?d?'),re.compile(r'airnostru?m?'),
            re.compile(r'american\s?air?l?i?n?e?s?'),re.compile(r'avianca'),re.compile(r'blueai?r?'),
            re.compile(r'british\s?a?i?r?w?a?y?s?'),re.compile(r'easyjet'),
            re.compile(r'emitares'),
            re.compile(r'ibe?r?i?a?'),re.compile(r'klm'),
            re.compile(r'lufthansa'),re.compile(r'niki'),re.compile(r'norwegian'),
            re.compile(r'quatara?i?r?'),re.compile(r'ryanai?r?'),
            re.compile(r'spanai?r?'),re.compile(r'spiritairl?i?n?e?s?'),
            re.compile(r'tame'),re.compile(r'vasp'),
            re.compile(r'vueling'),re.compile(r'westjet'),re.compile(r'wizza?i?r?')]
      
    noms = str()
    i = 0
    for airline in patterns:
        match = re.search(airline, tweet)
        if match : noms = noms+'|'+airlinesList[i]
        i = i + 1
    if (len(noms) == 0): 
        noms = 'noairline'
    else:
        noms = noms[1:]
    return noms

def cleaning_and_airlines(df):
    airlinesList = ['aena','aeromar','aeromexico','aireuropa','airmadrid','airnostrum','americanairlines',
         'avianca','blueair','britishairways','easyjet','emirates','iberia','klm',
         'lufthansa','niki','noairline','norwegian','qatar','ryanair','spanair','spiritairlines',
         'tame','vasp','vueling','westjet','wizzair']
    textClean = df['text'].apply(basicCleaning)
    airline = textClean.apply(extract_airline)

    # From 'airline' column create a one-hot encoding matrix for airline name
    tweet_airline = airline.str.split(r'|', expand=True).stack().reset_index(level='tweet_id')
    tweet_airline.columns = ['tweet_id','airline']
    tweet_airline = tweet_airline.set_index('tweet_id')
    # One-hot encoding for airline name
    onehot = pd.get_dummies(tweet_airline['airline'],columns=airlinesList)
    #print(onehot.columns)
    airlines1hot = onehot.groupby('tweet_id').sum()
    #airlines1hot.sum() #Check the list of airlines extracted
    return airlines1hot

### Merge the three 1-hot-encoding dataframes : hour + airline + emoji¶

In [33]:
def get_1hot_hourAirlineEmoji(df):
    hour1hot = get_time_info(df)
    emoji1hot = get_emoji(df)
    airlines1hot = cleaning_and_airlines(df)
    #print(hour1hot.shape,emoji1hot.shape,airlines1hot.shape)

    # Concatenate the three 1-hot-encoding dataframes : hour + airline + emoji
    merged = pd.concat([hour1hot,airlines1hot,emoji1hot], axis=1, join_axes=[df.index])
    #merged = pd.concat([hour1hot,emoji1hot], axis=1, join_axes=[df.index])
    return merged

# Create corpus

### Remove stopwords and Lematization

In [34]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

def removeStopWords(tweet,stopWords):
    wordsFiltered = []    
    for word in tweet:
        if word not in stopWords:
            if re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None:
                wordsFiltered.append(word.strip())
    textFiltered = ' '.join(wordsFiltered)
    return textFiltered

def stemlem(tweet):
    stemmer = SnowballStemmer('spanish')
    lemmer = WordNetLemmatizer()

    wordsstemlem = []
    for word in tweet.split(' '):
        root = lemmer.lemmatize(stemmer.stem(word))
        wordsstemlem.append(root)  
        textstemlem = ' '.join(wordsstemlem)
    return textstemlem

def get_corpus(df):
    textClean1 = df['text'].apply(basicCleaning)
    textClean2 = textClean1.apply(moreCleaning)
    tokens = textClean2.apply(word_tokenize)

    # List of Spanish STOPWORDS
    stopWords = set(stopwords.words('spanish'))
    # Remove accents
    stopWordsNoAccents = [unidecode.unidecode(w) for w in stopWords]
    # Add extra airline vocabulary
    airlinesList = ['aena','aeromar','aeromexico','aireuropa','airmadrid','airnostrum','americanairlines',
     'avianca','blueair','britishairways','easyjet','emirates','iberia','klm',
     'lufthansa','niki','norwegian','qatar','ryanair','spanair','spiritairlines',
     'tame','vasp','vueling','westjet','wizzair']
    newstopWords = stopWordsNoAccents+airlinesList+['volar','aerolinea','destino','destinos','hacer','si','puede',
                                                    'favor','plus','click']

    textFiltered = tokens.apply(removeStopWords,stopWords=newstopWords)
    textStemLem = textFiltered.apply(stemlem)
    #df.tail()
    return textStemLem

### Model

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

extra_tomerge = get_1hot_hourAirlineEmoji(df)
textStemLem = get_corpus(df)
cv = CountVectorizer(min_df=1, ngram_range=(1,2), analyzer='word', token_pattern=r'\b\w+\b')
tf_matrix = cv.fit_transform(textStemLem)
voc = cv.vocabulary_
# Add to term frequency matrix the [hour, airlines and emoji] frequency matrix
all_matrix = np.column_stack((tf_matrix.toarray(),extra_tomerge.values))

x_train_counts,x_test_counts,y_train,y_test = train_test_split(all_matrix,df['newairline_sentiment'].values, test_size=0.15)


In [36]:
tfidf_transformer = TfidfTransformer()
x_train = tfidf_transformer.fit_transform(x_train_counts)
x_test = tfidf_transformer.transform(x_test_counts)

In [37]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

sgc = SGDClassifier(penalty = 'elasticnet',max_iter = 10)
sgc.fit(x_train,y_train)

# Predict new values for test
y_pred = sgc.predict(x_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.641868512111


In [38]:
x_alltrain = tfidf_transformer.fit_transform(all_matrix)
sgc.fit(x_alltrain,df['newairline_sentiment'].values)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

### For Kaggle

In [39]:
import datetime

def create_submit_file(df_submission, ypred):
    date = datetime.datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
    filename = 'submission_' + date + '.csv'

    df_submission['airline_sentiment'] = ypred
    df_submission[['airline_sentiment']].to_csv(filename)

    print('Submission file created: {}'.format(filename))
    print('Upload it to Kaggle InClass')

In [40]:
df3 = pd.read_csv('tweets_submission.csv', index_col='tweet_id', sep=',')
# Keep relevant columns only
df4 = df3.drop(['retweet_count','is_reply','reply_count','tweet_coord','tweet_location','user_timezone'], axis=1)
df4.head()

Unnamed: 0_level_0,text,tweet_created
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
942273491440631808,@Iberia Está clara vuestra política de rascar ...,Sun Dec 17 06:01:21 +0000 2017
943008475608682502,Iberia Plus cumple 25 años. Queremos celebrarl...,Tue Dec 19 06:41:55 +0000 2017
931226872729530368,"A ver, @Iberia, de verdad. Lo vuestro con el ...",Thu Nov 16 18:26:02 +0000 2017
933782711600283650,".@JavierJover, head of partnership and SMBs en...",Thu Nov 23 19:42:01 +0000 2017
941987812534038529,"Faro, Portugal 🇵🇹🤓✈️ https://t.co/Obzj8YlXTL",Sat Dec 16 11:06:10 +0000 2017


In [41]:
extra_tomerge = get_1hot_hourAirlineEmoji(df4)
textStemLem = get_corpus(df4)

cv = CountVectorizer(vocabulary=voc,min_df=1, ngram_range=(1,2), analyzer='word', token_pattern=r'\b\w+\b')
tf_matrix = cv.fit_transform(textStemLem)
x_test_counts = np.column_stack((tf_matrix.toarray(),extra_tomerge.values))
x_test = tfidf_transformer.transform(x_test_counts)

prediction_sgc = sgc.predict(x_test)

create_submit_file(df3, prediction_sgc)

Submission file created: submission_06_14_2018-10_48_35.csv
Upload it to Kaggle InClass


In [42]:
print(voc)


{'pilot': 25180, 'desconvoc': 9415, 'huelg': 16303, 'tras': 33051, 'ver': 34481, 'reconoc': 27994, 'sindicat': 30863, 'pais': 23813, 'ultim': 33478, 'notici': 22433, 'pilot desconvoc': 25197, 'desconvoc huelg': 9416, 'huelg tras': 16355, 'tras ver': 33074, 'ver reconoc': 34542, 'reconoc sindicat': 28003, 'sindicat pais': 30876, 'pais ultim': 23835, 'ultim notici': 33498, 'declar': 8932, 'amor': 1627, 'pued': 26832, 'qued': 27271, 'ma': 19314, 'declar amor': 8933, 'amor pued': 1632, 'pued qued': 26927, 'qued ma': 27295, 'diri': 10427, 'cost': 7878, 'diri cost': 10428, 'champion': 5782, 'vuel': 35649, 'descar': 9362, 'champion vuel': 5784, 'vuel descar': 35771, 'verd': 34578, 'mam': 19994, 'pa': 23474, 'do': 10620, 'vec': 34255, 'terribl': 32370, 'verd mam': 34599, 'mam pa': 20002, 'pa do': 23514, 'do vec': 10676, 'vec terribl': 34290, 'hol': 15805, 'conozc': 7347, 'numer': 22645, 'ayud': 3416, 'mail': 19630, 'codig': 6445, 'pin': 25251, 'graci': 14733, 'hol conozc': 15840, 'conozc numer