In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [25]:
data=pd.read_csv('/content/Tweets.csv')
print(data.shape)
data.head(3)

(14640, 12)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_created,latitude,longitude
0,5.703061e+17,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,35.888455,-119.273781
1,5.703011e+17,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,37.770971,-119.941025
2,5.703011e+17,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,35.684863,-119.709299


In [26]:
data['airline'].unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

In [27]:
data['airline_sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [28]:
data.isnull().sum()

tweet_id                           0
airline_sentiment                  0
airline_sentiment_confidence       0
negativereason                  5462
negativereason_confidence       4118
airline                            0
name                               0
retweet_count                      0
text                               0
tweet_created                      0
latitude                           0
longitude                          0
dtype: int64

In [29]:
df=data[['text','airline_sentiment']]
df.head(5)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [30]:
df=df[df.airline_sentiment!='neutral']
df.head()

Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [31]:
df.airline_sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [32]:
df['text'][4]

"@VirginAmerica and it's a really big bad thing about it"

In [33]:
import nltk
from nltk.corpus import stopwords

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [36]:
import re
data['text']=data['text'].apply(lambda x:re.sub('[^a-zA-z0-9\s]','',x))

In [41]:
print(data['text'].iloc[4])

VirginAmerica and its a really big bad thing about it


In [42]:
stop_word=set(stopwords.words('english'))

In [43]:
data['text']=data['text'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_word)]))

In [44]:
print(data['text'].iloc[3])

VirginAmerica really aggressive blast obnoxious entertainment guests faces amp little recourse


In [46]:
max_features=2000
tokenizer=Tokenizer(num_words=max_features,split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [47]:
data['text'].iloc[3]

'VirginAmerica really aggressive blast obnoxious entertainment guests faces amp little recourse'

In [48]:
seq=tokenizer.texts_to_sequences(data['text'].values)

In [52]:
print(seq[3])
print('length of sequence is {}'.format(len(seq[3])))

[31, 69, 932, 20, 425]
length of sequence is 5


In [56]:
seq=pad_sequences(seq)

In [57]:
seq[3]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,  31,  69, 932,  20,
       425], dtype=int32)

In [58]:
seq.shape

(14640, 27)

In [80]:
y=pd.get_dummies(data['airline_sentiment'],drop_first=True).values

In [81]:
x_train,x_test,y_train,y_test = train_test_split(seq,y,test_size=0.2)

In [82]:
print('x_train:',x_train.shape,'x_test:',x_test.shape)
print('y_train:',y_train.shape,'y_test:',y_test.shape)

x_train: (11712, 27) x_test: (2928, 27)
y_train: (11712, 2) y_test: (2928, 2)


In [62]:
embed_dimentions=128

In [76]:
model=Sequential()
model.add(Embedding(max_features,embed_dimentions,input_length=seq.shape[1]))
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='sigmoid'))



In [77]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 27, 128)           256000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 387,842
Trainable params: 387,842
Non-trainable params: 0
_________________________________________________________________


In [78]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [83]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=100,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fda85aac610>

In [122]:
tweets = ["virginamerica has the most incredible customer service i've ever experienced! so refreshing!"]


In [123]:
tokenizer.fit_on_texts(tweets)

In [124]:
filtered_words=[]
for word in tweets:
  if word not in stop_word:
    filtered_words.append(word)

print(filtered_words)

["virginamerica has the most incredible customer service i've ever experienced! so refreshing!"]


In [125]:
seq1=tokenizer.texts_to_sequences(filtered_words)

print(seq1)

[[31, 1201, 86, 1426, 1417, 15, 11, 90, 1014, 208]]


In [126]:
seq1=pad_sequences(seq1,maxlen=27)

In [127]:
y_pred=model.predict_classes(seq1)



In [128]:
y_pred

array([1])