In [355]:
import numpy as np
import pandas as pd

import re
import unidecode
import nltk
from datetime import datetime
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer

In [356]:
df = pd.read_csv('tweets_public.csv', encoding='utf-16', index_col='tweet_id', sep=',')
df_sub = pd.read_csv('tweets_submission.csv')

In [357]:
print(df.columns)
print(df_sub.columns)

Index(['airline_sentiment', 'is_reply', 'reply_count', 'retweet_count', 'text',
       'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')
Index(['is_reply', 'reply_count', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')


In [358]:
df.text.values

array(['"Los pilotos de Ryanair desconvocan la huelga tras ver reconocidos sus sindicatos" by El País via Últimas noticias… https://t.co/80Fz6dxP9t',
       '@Iberia @lavecinarubia Si ,por favor las declaraciones de amor entre los  #rubijarena no  pueden quedarse en una ma… https://t.co/GWKJGhhubY',
       '@Iberia Me dirías por favor que costo tiene?', ...,
       'Compré vuelos con @British_Airways. El vuelo es operado por @Iberia. Llamo a BA para añadir una maleta y me dicen q… https://t.co/HSUhcKH6Ie',
       '@miguelitoelcon1 @Fjlopezm @Iberia Muchas gracias Miguel Ángel!',
       'Ryanair abrirá en 2018 cuatro nuevas rutas desde Valencia, Alicante, Sevilla y Girona https://t.co/RzsnG7svht'],
      dtype=object)

In [359]:
df.count()

airline_sentiment    7867
is_reply             7867
reply_count          7867
retweet_count        7867
text                 7867
tweet_coord            22
tweet_created        7867
tweet_location        439
user_timezone        5119
dtype: int64

In [360]:
def processTweet2(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@([^\s]+)',r'\1',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #tweet = re.sub(r'\#\w+','',tweet)
    return tweet  

def handle_emojis(tweet):
    tweet = re.sub(r'CC:', ' EMOPOS ', tweet)
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMOPOS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMOPOS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMOPOS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMOPOS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMONEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMONEG ', tweet)
    return tweet

def preprocess_word(word):
    # Remove punctuation
    #word = word.strip('\'"?!,.():;')
    word = word.strip('\'"?!,.')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    # remove numbers
    word = re.sub(r'\d+','',word)
    #remove users 
    word = re.sub(r'AT_USER','',word)
    word = re.sub(r'URL','',word)
    word = re.sub(r'rt','',word)
    word = re.sub(r'via','',word)
    word = re.sub(r'by','',word)
    return word

def remove_accents(word):
    word = unidecode.unidecode(word)
    return word

def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

In [361]:
tweets = [handle_emojis(tweet) for tweet in df.text.values]
tweets2 = [processTweet2(tweet) for tweet in tweets]
tweets3 = [' '.join(preprocess_word(remove_accents(word) )for word in tweet.split(' ')) for tweet in tweets2]
df.text = tweets3
df.text.values

array(['los pilotos de ryanair desconvocan la huelga tras ver reconocidos sus sindicatos  el pais  ultimas noticias ',
       'iberia lavecinarubia si por favor las declaraciones de amor entre los rubijarena no pueden quedarse en una ma ',
       'iberia me dirias por favor que costo tiene', ...,
       'compre vuelos con british_airways el vuelo es operado por iberia llamo a ba para anadir una maleta y me dicen q ',
       'miguelitoelcon fjlopezm iberia muchas gracias miguel angel',
       'ryanair abrira en  cuatro nuevas rutas desde valencia alicante sevilla y girona '],
      dtype=object)

In [362]:
tweets_sub = [handle_emojis(tweet) for tweet in df_sub.text.values]
tweets2_sub = [processTweet2(tweet) for tweet in tweets_sub]
tweets3_sub = [' '.join(preprocess_word(remove_accents(word) )for word in tweet.split(' ')) for tweet in tweets2_sub]
df_sub.text = tweets3_sub
df_sub.text.values

array(['iberia esta clara vuestra politica de rascar por todos lados lo q os digo es q en vuelos de EUR de clientes fie ',
       'iberia plus cumple  anos queremos celebrarlo contigo de una manera muy especial elige tu numero favorito y  ',
       'a ver iberia de verdad lo vuestro con el espacio entre asientos es exagerado mido una mierda de .m y no qu ',
       ...,
       'samsungespana iberia despues de anos con samsung note  sedge tablet gear y ni flores  creo que empezare a ',
       'mundo_ un pasajero de la aerolinea ryanair se hao de esperar y abrio la puea de emergencia para saltar al ala ',
       'ultima hora: ofea ryanair vuelos a  euros   vuelosaeuro'],
      dtype=object)

In [363]:
print(" ".join(nltk.stem.SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [364]:
sno = nltk.stem.SnowballStemmer('spanish')
tweet_1 = [' '.join(sno.stem(word) for word in tweet.split(' ')) for tweet in df.text.values]
lemma = nltk.wordnet.WordNetLemmatizer()
tweet_2 = [' '.join(lemma.lemmatize(word,pos='v') for word in tweet.split(' ')) for tweet in tweet_1]
df.text = [line.strip(' ') for line in tweet_2]
df.text.values

array(['los pilot de ryan desconvoc la huelg tras ver reconoc sus sindicat  el pais  ultim notici',
       'iberi lavecinarubi si por favor las declar de amor entre los rubijaren no pued qued en una ma',
       'iberi me diri por favor que cost tien', ...,
       'compr vuel con british_airways el vuel es oper por iberi llam a ba par anad una malet y me dic q',
       'miguelitoelcon fjlopezm iberi much graci miguel angel',
       'ryan abrir en  cuatr nuev rut desd valenci alic sevill y giron'],
      dtype=object)

In [365]:
sno = nltk.stem.SnowballStemmer('spanish')
tweet_1_sub = [' '.join(sno.stem(word) for word in tweet.split(' ')) for tweet in df_sub.text.values]
lemma = nltk.wordnet.WordNetLemmatizer()
tweet_2_sub = [' '.join(lemma.lemmatize(word,pos='v') for word in tweet.split(' ')) for tweet in tweet_1_sub]
df_sub.text = [line.strip(' ') for line in tweet_2_sub]
df_sub.text.values

array(['iberi esta clar vuestr polit de rasc por tod lad lo q os dig es q en vuel de eur de client fie',
       'iberi plus cumpl  anos quer celebr contig de una maner muy especial elig tu numer favorit y',
       'a ver iberi de verd lo vuestr con el espaci entre asient es exager mid una mierd de .m y no qu',
       ...,
       'samsungespan iberi despu de anos con samsung not  sedg tablet gear y ni flor  cre que empezar a',
       'mundo_ un pasajer de la aeroline ryan se hao de esper y abri la pue de emergent par salt al ala',
       'ultim hora: ofe ryan vuel a  eur   vuelosaeur'], dtype=object)

In [366]:
count_vect = CountVectorizer(ngram_range=(1,2),token_pattern=r'\b\w+\b')
X_train_counts = count_vect.fit_transform(df.text.values)
voc = count_vect.vocabulary_
print(X_train_counts.shape)
X_train_counts = X_train_counts.toarray()

(7867, 56687)


In [367]:
X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [368]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(7867, 56687)


In [369]:
def get_time_info(df):
    # Time information transformed into hour of the day + day of the week one-hot encoding matrices
    tweet_created = pd.to_datetime(df['tweet_created'])
    hoursday = tweet_created.dt.hour
    weekday = tweet_created.dt.dayofweek

    # One-hot encoding for 'hoursday' and 'weekday'
    hour1hot = pd.get_dummies(hoursday,prefix='h',columns=list(range(24)))
    weekday1hot = pd.get_dummies(weekday,prefix='wd',columns=list(range(7)))
    return hour1hot

# Extract emojis from text
from emoji import UNICODE_EMOJI #pip install emoji

def emoji_shortname(patterns,thelist):
    elist = []
    for pattern in patterns:
        for emoji in thelist:
            match = re.search(pattern,emoji)
            if match: elist.append(emoji)
    return elist

def get_emoji_img(mydict,emojiShortname):
    ## Extract unicode emoji images based on the selected shortnames
    return list(mydict.keys())[list(mydict.values()).index(emojiShortname)]

def in_emoji(tweet,emojilist):
    emo = 0
    inemoji = []
    for emoji in emojilist:
        match = tweet.find(emoji)
        if match != -1: inemoji.append(emoji)
    if(len(inemoji) >=1): emo = 1
    return emo 

def get_emoji(df):
    # List of UNICODE emojis
    unicodeemojilist = list(UNICODE_EMOJI.values())

    # Regular expressions to cast negative emoji in the list of Unicode Emoji
    negative_patterns = [re.compile(r'.*frown.*(face)'),re.compile(r'confounded'),re.compile(r'disappoint'),
                     re.compile(r'worried'),re.compile(r'crying'),re.compile(r'\bang[er][ry]'),re.compile(r'fear'),
                     re.compile(r'weary'),re.compile(r'exploding_face'),re.compile(r'grimacing'),
                     re.compile(r'face_with_steam_from_nose'),re.compile(r'pouting_face'),re.compile(r'sleepy_face'),
                     re.compile(r'downcast_face_with_sweat'),re.compile(r'unamused_face'),
                     re.compile(r'see-no-evil_monkey'),re.compile(r'pensive_face'),re.compile(r'persevering_face'),
                     re.compile(r'anxi'),re.compile(r'scream'),re.compile(r'hot_face'),re.compile(r'flushed'),
                     re.compile(r'zany_face'),re.compile(r'dizzy.*(face)'),re.compile(r'face_with_symbols_on_mouth'),
                     re.compile(r'thumbs_down:'),re.compile(r'middle_finger:'),re.compile(r'broken_heart')]
    # Regular expressions to cast positive emoji in the list of Unicode Emoji
    positive_patterns = [re.compile(r'grin'),re.compile(r'joy'),re.compile(r'smil'),re.compile(r'kiss:'),re.compile(r'wink'),
           re.compile(r'savoring_food'),re.compile(r'[^broken|couple_with]\wheart'),re.compile(r'thumbs_up:'),
           re.compile(r'OK_hand:'),re.compile(r'clapping_hands:'),re.compile(r'waving_hand:'),
           re.compile(r'raised_hand:'),re.compile(r':relieved_face')]    
                
    # List of negative/positive emojis shortnames
    negative_emoji_shortname = emoji_shortname(negative_patterns,unicodeemojilist)
    positive_emoji_shortname = emoji_shortname(positive_patterns,unicodeemojilist)

    # List of negative/positive emojis
    nlist = []
    for emoji in negative_emoji_shortname:
        nlist.append(get_emoji_img(UNICODE_EMOJI,emoji)) 
    plist = []
    for emoji in positive_emoji_shortname:
        plist.append(get_emoji_img(UNICODE_EMOJI,emoji)) 
    
    # Save the number of positive and negative emojis in two new columns
    emopos = df['text'].apply(in_emoji,emojilist=plist)
    emoneg = df['text'].apply(in_emoji,emojilist=nlist)
    
    emoji1hot = pd.DataFrame({'emopos':emopos,'emoneg':emoneg})
    #emoji1hot.emopos.sum()
    return emoji1hot

import unidecode #pip install unidecode
import string

def basicCleaning(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Delete URLs www.* or https?://*
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    # Remove accents
    tweet = unidecode.unidecode(tweet)
    #Delete via, rt and by
    tweet = re.sub(r'\b(rt|via|by)\b','',tweet)
    # Remove numbers
    tweet = re.sub(r'\d+','',tweet)
    # Remove single characters
    tweet = re.sub(r'\b\w\b','',tweet)
    # Remove email addresses
    tweet = re.sub(r'\w*@\w*','',tweet)
    #Remove additional white spaces
    tweet = re.sub('\s+', ' ', tweet)
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet)
    return tweet

def moreCleaning(tweet):
    #Delete @usernames
    #tweet = re.sub('@[^\s]+','',tweet)
    #Delete #hashtags
    tweet = re.sub('#[^\s]+','',tweet)
    # Remove punctuation (includes @, \ and #)
    regex = re.compile('[%s]' % re.escape(string.punctuation+u"¡"+u"¿"+u"€"))
    tweet = re.sub(regex,'',tweet)
    #Remove additional white spaces
    tweet = re.sub('\s+', ' ', tweet)
    return tweet

def extract_airline(tweet):
    airlinesList = ['aena','aeromar','aeromexico','aireuropa','airmadrid','airnostrum','americanairlines',
         'avianca','blueair','britishairways','easyjet','emirates','iberia','klm',
         'lufthansa','niki','norwegian','qatar','ryanair','spanair','spiritairlines',
         'tame','vasp','vueling','westjet','wizzair']
    patterns = [re.compile(r'aena'),re.compile(r'aeromar?'),
            re.compile(r'aeromexi?c?o?'),re.compile(r'air\s?europ?a?'),
            re.compile(r'airmadr?i?d?'),re.compile(r'airnostru?m?'),
            re.compile(r'american\s?air?l?i?n?e?s?'),re.compile(r'avianca'),re.compile(r'blueai?r?'),
            re.compile(r'british\s?a?i?r?w?a?y?s?'),re.compile(r'easyjet'),
            re.compile(r'emitares'),
            re.compile(r'ibe?r?i?a?'),re.compile(r'klm'),
            re.compile(r'lufthansa'),re.compile(r'niki'),re.compile(r'norwegian'),
            re.compile(r'quatara?i?r?'),re.compile(r'ryanai?r?'),
            re.compile(r'spanai?r?'),re.compile(r'spiritairl?i?n?e?s?'),
            re.compile(r'tame'),re.compile(r'vasp'),
            re.compile(r'vueling'),re.compile(r'westjet'),re.compile(r'wizza?i?r?')]
      
    noms = str()
    i = 0
    for airline in patterns:
        match = re.search(airline, tweet)
        if match : noms = noms+'|'+airlinesList[i]
        i = i + 1
    if (len(noms) == 0): 
        noms = 'noairline'
    else:
        noms = noms[1:]
    return noms

def cleaning_and_airlines(df):
    airlinesList = ['aena','aeromar','aeromexico','aireuropa','airmadrid','airnostrum','americanairlines',
         'avianca','blueair','britishairways','easyjet','emirates','iberia','klm',
         'lufthansa','niki','noairline','norwegian','qatar','ryanair','spanair','spiritairlines',
         'tame','vasp','vueling','westjet','wizzair']
    textClean = df['text'].apply(basicCleaning)
    airline = textClean.apply(extract_airline)

    # From 'airline' column create a one-hot encoding matrix for airline name
    tweet_airline = airline.str.split(r'|', expand=True).stack().reset_index(level='tweet_id')
    tweet_airline.columns = ['tweet_id','airline']
    tweet_airline = tweet_airline.set_index('tweet_id')
    # One-hot encoding for airline name
    onehot = pd.get_dummies(tweet_airline['airline'],columns=airlinesList)
    #print(onehot.columns)
    airlines1hot = onehot.groupby('tweet_id').sum()
    #airlines1hot.sum() #Check the list of airlines extracted
    return airlines1hot

def get_1hot_hourAirlineEmoji(df):
    hour1hot = get_time_info(df)
    emoji1hot = get_emoji(df)
    #airlines1hot = cleaning_and_airlines(df)
    #print(hour1hot.shape,emoji1hot.shape,airlines1hot.shape)

    # Concatenate the three 1-hot-encoding dataframes : hour + airline + emoji
    merged = pd.concat([hour1hot,emoji1hot], axis=1, join_axes=[df.index])
    #merged = pd.concat([hour1hot,emoji1hot], axis=1, join_axes=[df.index])
    return merged

In [370]:
extra_tomerge = get_1hot_hourAirlineEmoji(df)

In [371]:
# Add to term frequency matrix the [hour, airlines and emoji] frequency matrix
all_matrix = np.column_stack((X_train_tfidf.toarray(),extra_tomerge.values))

In [372]:
all_matrix.shape

(7867, 56713)

In [373]:
X_train_tfidf.toarray().shape

(7867, 56687)

In [374]:
extra_tomerge.shape

(7867, 26)

In [375]:
all_matrix.shape

(7867, 56713)

In [376]:
def is_reply(df):
    if df:
        return 1
    else:
        return 0

all_matrix_2 = np.c_[all_matrix,df['is_reply'].apply(is_reply).values]

In [377]:
all_matrix_2.shape

(7867, 56714)

In [387]:
features_train,features_test,label_train,label_test = \
train_test_split(all_matrix_2, df.airline_sentiment.values,test_size=0.15,random_state = 123)
sgc = SGDClassifier(penalty = 'elasticnet',max_iter = 10)
svm = LinearSVC()
sgc.fit(features_train,label_train)
svm.fit(features_train,label_train)
score = sgc.score(features_test,label_test)
scoresvm = svm.score(features_test,label_test)
print(score)
print(scoresvm)

0.6130397967823878
0.6062658763759525


In [388]:
sgc.fit(all_matrix_2, df.airline_sentiment.values)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [389]:
svm.fit(all_matrix_2,df.airline_sentiment.values)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [390]:
count_vect = CountVectorizer(vocabulary=voc,ngram_range=(1,2),token_pattern=r'\b\w+\b')
X_test_counts = count_vect.fit_transform(df_sub.text.values)

In [391]:
X_test_counts = X_test_counts.toarray()

In [392]:
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
extra_tomerge = get_1hot_hourAirlineEmoji(df_sub)
all_matrix = np.column_stack((X_test_tfidf.toarray(),extra_tomerge.values))
all_matrix_2 = np.c_[all_matrix,df_sub['is_reply'].apply(is_reply).values]
prediction = svm.predict(all_matrix_2)
prediction2 = sgc.predict(all_matrix_2)

In [393]:
def create_submit_file(df_submission, ypred):
    date = datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
    filename = 'submission_' + date + '.csv'

    df_submission['airline_sentiment'] = ypred
    df_submission[['tweet_id','airline_sentiment']].to_csv(filename,index_label = False,index = False)

    print('Submission file created: {}'.format(filename))
    print('Upload it to Kaggle InClass')
#prediction[prediction == 0] = "negative"
#prediction[prediction == 1] = "neutral"
#prediction[prediction == 2] = "positive"
create_submit_file(df_sub,prediction)
time.sleep(5)
create_submit_file(df_sub,prediction2)

Submission file created: submission_06_14_2018-16_24_21.csv
Upload it to Kaggle InClass
Submission file created: submission_06_14_2018-16_24_27.csv
Upload it to Kaggle InClass
