In [1]:
import numpy as np
import pandas as pd

import re
import unidecode
import nltk
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

In [2]:
df = pd.read_csv('tweets_public.csv', encoding='utf-16', index_col='tweet_id', sep=',')
df_sub = pd.read_csv('tweets_submission.csv')

In [3]:
df.text.values

array(['"Los pilotos de Ryanair desconvocan la huelga tras ver reconocidos sus sindicatos" by El País via Últimas noticias… https://t.co/80Fz6dxP9t',
       '@Iberia @lavecinarubia Si ,por favor las declaraciones de amor entre los  #rubijarena no  pueden quedarse en una ma… https://t.co/GWKJGhhubY',
       '@Iberia Me dirías por favor que costo tiene?', ...,
       'Compré vuelos con @British_Airways. El vuelo es operado por @Iberia. Llamo a BA para añadir una maleta y me dicen q… https://t.co/HSUhcKH6Ie',
       '@miguelitoelcon1 @Fjlopezm @Iberia Muchas gracias Miguel Ángel!',
       'Ryanair abrirá en 2018 cuatro nuevas rutas desde Valencia, Alicante, Sevilla y Girona https://t.co/RzsnG7svht'],
      dtype=object)

In [4]:
df.count()

airline_sentiment    7867
is_reply             7867
reply_count          7867
retweet_count        7867
text                 7867
tweet_coord            22
tweet_created        7867
tweet_location        439
user_timezone        5119
dtype: int64

In [5]:
def processTweet2(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    #tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    tweet = re.sub(r'\#\w+','',tweet)
    return tweet  

def handle_emojis(tweet):
    tweet = re.sub(r'CC:', ' EMOPOS ', tweet)
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMOPOS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMOPOS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMOPOS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMOPOS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMONEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMONEG ', tweet)
    return tweet

def preprocess_word(word):
    # Remove punctuation
    #word = word.strip('\'"?!,.():;')
    word = word.strip('\'"?!,.')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    # remove numbers
    word = re.sub(r'\d+','',word)
    #remove users 
    word = re.sub(r'AT_USER','',word)
    word = re.sub(r'URL','',word)
    word = re.sub(r'rt','',word)
    word = re.sub(r'via','',word)
    word = re.sub(r'by','',word)
    return word

def remove_accents(word):
    word = unidecode.unidecode(word)
    return word

def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

In [6]:
tweets = [handle_emojis(tweet) for tweet in df.text.values]
tweets2 = [processTweet2(tweet) for tweet in tweets]
tweets3 = [' '.join(preprocess_word(remove_accents(word) )for word in tweet.split(' ')) for tweet in tweets2]
df.text = tweets3
df.text.values

array(['los pilotos de ryanair desconvocan la huelga tras ver reconocidos sus sindicatos  el pais  ultimas noticias ',
       '  si por favor las declaraciones de amor entre los  no pueden quedarse en una ma ',
       ' me dirias por favor que costo tiene', ...,
       'compre vuelos con  el vuelo es operado por  llamo a ba para anadir una maleta y me dicen q ',
       '   muchas gracias miguel angel',
       'ryanair abrira en  cuatro nuevas rutas desde valencia alicante sevilla y girona '],
      dtype=object)

In [7]:
tweets_sub = [handle_emojis(tweet) for tweet in df_sub.text.values]
tweets2_sub = [processTweet2(tweet) for tweet in tweets_sub]
tweets3_sub = [' '.join(preprocess_word(remove_accents(word) )for word in tweet.split(' ')) for tweet in tweets2_sub]
df_sub.text = tweets3_sub
df_sub.text.values

array([' esta clara vuestra politica de rascar por todos lados lo q os digo es q en vuelos de EUR de clientes fie ',
       'iberia plus cumple  anos queremos celebrarlo contigo de una manera muy especial elige tu numero favorito y  ',
       'a ver  de verdad lo vuestro con el espacio entre asientos es exagerado mido una mierda de .m y no qu ',
       ...,
       '  despues de anos con samsung note  sedge tablet gear y ni flores  creo que empezare a ',
       'mundo_ un pasajero de la aerolinea  se hao de esperar y abrio la puea de emergencia para saltar al ala ',
       'ultima hora: ofea ryanair vuelos a  euros   vuelosaeuro'],
      dtype=object)

In [8]:
print(" ".join(nltk.stem.SnowballStemmer.languages))

arabic danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [9]:
sno = nltk.stem.SnowballStemmer('spanish')
tweet_1 = [' '.join(sno.stem(word) for word in tweet.split(' ')) for tweet in df.text.values]
lemma = nltk.wordnet.WordNetLemmatizer()
tweet_2 = [' '.join(lemma.lemmatize(word,pos='v') for word in tweet.split(' ')) for tweet in tweet_1]
df.text = [line.strip(' ') for line in tweet_2]
df.text.values

array(['los pilot de ryan desconvoc la huelg tras ver reconoc sus sindicat  el pais  ultim notici',
       'si por favor las declar de amor entre los  no pued qued en una ma',
       'me diri por favor que cost tien', ...,
       'compr vuel con  el vuel es oper por  llam a ba par anad una malet y me dic q',
       'much graci miguel angel',
       'ryan abrir en  cuatr nuev rut desd valenci alic sevill y giron'],
      dtype=object)

In [10]:
sno = nltk.stem.SnowballStemmer('spanish')
tweet_1_sub = [' '.join(sno.stem(word) for word in tweet.split(' ')) for tweet in df_sub.text.values]
lemma = nltk.wordnet.WordNetLemmatizer()
tweet_2_sub = [' '.join(lemma.lemmatize(word,pos='v') for word in tweet.split(' ')) for tweet in tweet_1_sub]
df_sub.text = [line.strip(' ') for line in tweet_2_sub]
df_sub.text.values

array(['esta clar vuestr polit de rasc por tod lad lo q os dig es q en vuel de eur de client fie',
       'iberi plus cumpl  anos quer celebr contig de una maner muy especial elig tu numer favorit y',
       'a ver  de verd lo vuestr con el espaci entre asient es exager mid una mierd de .m y no qu',
       ...,
       'despu de anos con samsung not  sedg tablet gear y ni flor  cre que empezar a',
       'mundo_ un pasajer de la aeroline  se hao de esper y abri la pue de emergent par salt al ala',
       'ultim hora: ofe ryan vuel a  eur   vuelosaeur'], dtype=object)

In [11]:
count_vect = CountVectorizer(ngram_range=(1,2),token_pattern=r'\b\w+\b')
X_train_counts = count_vect.fit_transform(df.text.values)
voc = count_vect.vocabulary_
print(X_train_counts.shape)
X_train_counts = X_train_counts.toarray()

(7867, 48418)


In [12]:
X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(7867, 48418)


In [30]:
features_train,features_test,label_train,label_test = \
train_test_split(X_train_tfidf, df.airline_sentiment.values,test_size=0.15,random_state = 123)
sgc = SGDClassifier(penalty = 'elasticnet',max_iter = 10)
svm = LinearSVC()
sgc.fit(features_train,label_train)
svm.fit(features_train,label_train)
score = sgc.score(features_test,label_test)
scoresvm = svm.score(features_test,label_test)
print(score)
print(scoresvm)

0.619813717188823
0.6181202370872142


In [19]:
sgc.fit(X_train_tfidf, df.airline_sentiment.values)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=10, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [20]:
count_vect = CountVectorizer(vocabulary=voc,ngram_range=(1,2),token_pattern=r'\b\w+\b')
X_test_counts = count_vect.fit_transform(df_sub.text.values)

In [21]:
X_test_counts = X_test_counts.toarray()

In [22]:
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
prediction = svm.predict(X_test_tfidf)

In [23]:
def create_submit_file(df_submission, ypred):
    date = datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
    filename = 'submission_' + date + '.csv'

    df_submission['airline_sentiment'] = ypred
    df_submission[['tweet_id','airline_sentiment']].to_csv(filename,index_label = False,index = False)

    print('Submission file created: {}'.format(filename))
    print('Upload it to Kaggle InClass')
#prediction[prediction == 0] = "negative"
#prediction[prediction == 1] = "neutral"
#prediction[prediction == 2] = "positive"
create_submit_file(df_sub,prediction)

Submission file created: submission_06_05_2018-14_30_01.csv
Upload it to Kaggle InClass


In [24]:
pd.read_csv("submission_06_05_2018-14_30_01.csv")

Unnamed: 0,tweet_id,airline_sentiment
0,942273491440631808,negative
1,943008475608682502,neutral
2,931226872729530368,negative
3,933782711600283650,neutral
4,941987812534038529,neutral
5,935837733796368384,negative
6,926428166721343488,negative
7,926361816434905093,negative
8,937762322923380737,negative
9,946020437271695361,neutral
