In [1]:
## Se importan todas las librer√≠as que vamos a necesitar
'''Importing Libraries'''
import pandas as pd
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
import preprocessor as p
from pywsd.utils import lemmatize_sentence
from sklearn import svm

Warming up PyWSD (takes ~10 secs)... took 24.899625062942505 secs.


In [2]:
# Se importa el dataset y lo guardamos en 'data'. Se guardan solamente el texto de los tweets en 'tweets'
##data= pd.read_csv('Downloads/Tweets.csv')
data= pd.read_csv('Tweets.csv')
tweets = data['text']
tweets.head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: text, dtype: object

In [3]:
# Se cambian las variables categ√≥ricas de la columna airline_sentiment a variables num√©ricas: 
## Negative : 0
## Neutral : 1
## Positive : 2

from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
data['target'] = lb.fit_transform(data['airline_sentiment'])
data[['airline_sentiment', 'target']]
target = data['target']
print(data[['airline_sentiment', 'target']])

      airline_sentiment  target
0               neutral       1
1              positive       2
2               neutral       1
3              negative       0
4              negative       0
5              negative       0
6              positive       2
7               neutral       1
8              positive       2
9              positive       2
10              neutral       1
11             positive       2
12             positive       2
13             positive       2
14             positive       2
15             negative       0
16             positive       2
17             negative       0
18             positive       2
19             positive       2
20             negative       0
21             positive       2
22             positive       2
23              neutral       1
24             negative       0
25             negative       0
26             negative       0
27              neutral       1
28             negative       0
29              neutral       1
...     

In [4]:
# Se comprueba la correlaci√≥n entre el tweet (text) y su etiqueta
df = data[['text', 'target']]
df.sample(4)

Unnamed: 0,text,target
6895,"@JetBlue FYI, I'm onboard #616 comfortably tra...",2
559,"@united No, I need you guys to not over book p...",0
769,@united good to know you will open the closed ...,0
10885,@USAirways I've been on hold for 35 mins to so...,0


In [5]:
# Se comprueba que el campo de tweets y el campo de target tiene la misma longitud.
print('N√∫mero de tweets: ' + ' ' + str(len(tweets)),'\nN√∫mero de targets:' + ' ' + str(len(target)))

N√∫mero de tweets:  14640 
N√∫mero de targets: 14640


In [6]:
# Se comprueba que codifica correctamente
# Al final de la tabla se observa la nueva variable (target)
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,target
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),1
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),2
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),1
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),0
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),0


In [7]:
# Se preprocesan los tweets con tweet-preprocessor5.0 y se eliminan solamente las menciones y los hashtags.
# Se guardan los tweets en un lista, la cual contiene todos los tweets sin hashtags ni menciones.
'''Preprocessing : tweet preprocessor 5.0'''
tweets_preprocessed = []
for tweet in tweets:
    p.set_options(p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.URL)
    tweets_preprocessed.append(p.clean(tweet))
    

In [8]:
tweets_preprocessed

['What said.',
 "plus you've added commercials to the experience... tacky.",
 "I didn't today... Must mean I need to take another trip!",
 'it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "and it's a really big bad thing about it",
 "seriously would pay $30 a flight for seats that didn't have this playing. it's really the only bad thing about flying VA",
 'yes, nearly every time I fly VX this ‚Äúear worm‚Äù won‚Äôt go away :)',
 'Really missed a prime opportunity for Men Without Hats parody, there.',
 "Well, I didn't‚Ä¶but NOW I DO! :-D",
 "it was amazing, and arrived an hour early. You're too good to me.",
 'did you know that suicide is the second leading cause of death among teens 10-24',
 'I &lt;3 pretty graphics. so much better than minimal iconography. :D',
 "This is such a great deal! Already thinking about my 2nd trip to &amp; I haven't even gone on my 1st trip yet! ;p",
 "I'm flying your skies again! U take al

In [9]:
# (MARTA !!!! ) Meter m√°s contracciones: https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
'''Preprocessing'''
tweets_cleaned=[]
for s in range(len(tweets_preprocessed)):
    tweet = re.sub(r"that's","that is", str(tweets_preprocessed[s]))
    tweet = re.sub(r"there's","there is",tweet)
    tweet = re.sub(r"you've","you have",tweet)
    tweet = re.sub(r"I've","I have",tweet)
    tweet = re.sub(r"they've","they have",tweet)
    tweet = re.sub(r"we've","we have",tweet)
    tweet = re.sub(r"there're","there are",tweet)
    tweet = re.sub(r"what's","what is",tweet)
    tweet = re.sub(r"where's","where is",tweet)
    tweet = re.sub(r"it's","it is",tweet)
    tweet = re.sub(r"who's","who is",tweet)
    tweet = re.sub(r"i'm","i am",tweet)
    tweet = re.sub(r"they're","they are",tweet)
    tweet = re.sub(r"she's","she is",tweet)
    tweet = re.sub(r"he's","he is",tweet)
    tweet = re.sub(r"it's","it is",tweet)
    tweet = re.sub(r"they're","they are",tweet)
    tweet = re.sub(r"who're","who are",tweet)
    tweet = re.sub(r"ain't","am not",tweet)
    tweet = re.sub(r"wouldn't","would not",tweet)
    tweet = re.sub(r"shouldn't","should not",tweet)
    tweet = re.sub(r"can't","can not",tweet)
    tweet = re.sub(r"couldn't","could not",tweet)
    tweet = re.sub(r"we'll","we will",tweet)
    tweet = re.sub(r"I'll","I will",tweet)
    tweet = re.sub(r"you'll","you will",tweet)
    tweet = re.sub(r"she'll","she will",tweet)
    tweet = re.sub(r"he'll","he will",tweet)
    tweet = re.sub(r"it'll","it will",tweet)
    tweet = re.sub(r"won't","will not",tweet)
    tweet = re.sub(r"could've","could have",tweet)
    tweet = re.sub(r"couldn't've","could not have",tweet)
    tweet = re.sub(r"didn't","did not",tweet)
    tweet = re.sub(r"don't","do not",tweet)
    tweet = re.sub(r"doesn't","does not",tweet)
    tweet = re.sub(r"everyone's","everyone is",tweet)
    tweet = re.sub(r"giv‚Äôn","given",tweet)
    tweet = re.sub(r"hasn't","has not",tweet)
    tweet = re.sub(r"haven't","have not",tweet)
    tweet = re.sub(r"hadn't","had not",tweet)
    tweet = re.sub(r"who're","who are",tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\d', ' ', tweet)
    tweet = tweet.lower()
    tweet = re.sub(r'^br$', ' ', tweet)
    tweet = re.sub(r'^[a-zA-Z]\s', ' ', tweet)
    tweet = re.sub(r'\s+br\s+',' ',tweet)
    tweet = re.sub(r'\s+[a-z]\s+', ' ',tweet)
    tweet = re.sub(r'^b\s+', '', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = tweet.split()
    tweet = ' '.join(tweet)
    tweets_cleaned.append(tweet) 

In [10]:
tweets_cleaned_2=[]
for sentence in tweets_cleaned:
    lem = lemmatize_sentence(sentence)
    lem = ' '.join(lem)
    tweets_cleaned_2.append(lem)

In [11]:
tweets_cleaned

['what said',
 'plus you have added commercials to the experience tacky',
 'did not today must mean need to take another trip',
 'it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse',
 'and it is really big bad thing about it',
 'seriously would pay flight for seats that did not have this playing it is really the only bad thing about flying va',
 'yes nearly every time fly vx this ear worm won go away',
 'really missed prime opportunity for men without hats parody there',
 'well did not but now do d',
 'it was amazing and arrived an hour early you re too good to me',
 'did you know that suicide is the second leading cause of death among teens',
 'lt pretty graphics so much better than minimal iconography d',
 'this is such great deal already thinking about my nd trip to amp have not even gone on my st trip yet p',
 'flying your skies again take all the away from travel',
 'thanks',
 'sfo pdx schedule is still mia',
 'so excited fo

In [12]:
tweets_cleaned_2

['what say',
 'plus you have add commercial to the experience tacky',
 'do not today must mean need to take another trip',
 'it be really aggressive to blast obnoxious entertainment in your guest face amp they have little recourse',
 'and it be really big bad thing about it',
 'seriously would pay flight for seat that do not have this playing it be really the only bad thing about fly va',
 'yes nearly every time fly vx this ear worm win go away',
 'really missed prime opportunity for men without hat parody there',
 'well do not but now do d',
 'it be amaze and arrive an hour early you re too good to me',
 'do you know that suicide be the second leading cause of death among teen',
 'lt pretty graphic so much good than minimal iconography d',
 'this be such great deal already think about my nd trip to amp have not even go on my st trip yet p',
 'fly your sky again take all the away from travel',
 'thanks',
 'sfo pdx schedule be still mia',
 'so excited for my first cross country flight l

In [13]:
d = {'target': target, 'tweet': tweets_cleaned_2}
target_tweet = pd.DataFrame(data=d)

In [14]:
target_tweet

Unnamed: 0,target,tweet
0,1,what say
1,2,plus you have add commercial to the experience...
2,1,do not today must mean need to take another trip
3,0,it be really aggressive to blast obnoxious ent...
4,0,and it be really big bad thing about it
5,0,seriously would pay flight for seat that do no...
6,2,yes nearly every time fly vx this ear worm win...
7,1,really missed prime opportunity for men withou...
8,2,well do not but now do d
9,2,it be amaze and arrive an hour early you re to...
