In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
#data = pd.read_csv('input/Sentiment.csv')
data = pd.read_csv('input/Tweets.csv')
# Keeping only the neccessary columns
#data = data[['text','sentiment']]
data = data[['text','airline_sentiment']]

In [12]:
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [13]:
# data_one.head()

In [27]:
data = data[data.airline_sentiment != "neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
# removing special chars
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
#
data.head()

Unnamed: 0,text,airline_sentiment
1,virginamerica plus youve added commercials to ...,positive
3,virginamerica its really aggressive to blast o...,negative
4,virginamerica and its a really big bad thing a...,negative
5,virginamerica seriously would pay 30 a flight ...,negative
6,virginamerica yes nearly every time i fly vx t...,positive


In [28]:
print(data[ data['airline_sentiment'] == 'positive'].size)
print(data[ data['airline_sentiment'] == 'negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt','')
data.head()

4726
18356


Unnamed: 0,text,airline_sentiment
1,virginamerica plus youve added commercials to ...,positive
3,virginamerica its really aggressive to blast o...,negative
4,virginamerica and its a really big bad thing a...,negative
5,virginamerica seriously would pay 30 a flight ...,negative
6,virginamerica yes nearly every time i fly vx t...,positive


In [29]:
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
X[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   99,  554,  490, 1244,    1,    2,  170],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
          65,  120,    1,  928,   15,   20,   59,   53,   25,  469]])

In [30]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 32, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
Y = pd.get_dummies(data['airline_sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(9232, 32) (9232, 2)
(2309, 32) (2309, 2)


In [32]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x209464027c8>

In [33]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)

In [34]:
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[1783   79]
 [ 105  342]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1862
           1       0.81      0.77      0.79       447

    accuracy                           0.92      2309
   macro avg       0.88      0.86      0.87      2309
weighted avg       0.92      0.92      0.92      2309



In [36]:
# Separate majority and minority classes
data_majority = data[data['airline_sentiment'] == 'negative']
data_minority = data[data['airline_sentiment'] == 'positive']

bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then 
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [38]:
print('positive data in training:',(train.airline_sentiment == 'positive').sum())
print('negative data in training:',(train.airline_sentiment == 'negative').sum())
print('positive data in test:',(test.airline_sentiment == 'positive').sum())
print('negative data in test:',(test.airline_sentiment == 'negative').sum())


positive data in training: 1890
negative data in training: 7342
positive data in test: 473
negative data in test: 1836


In [41]:
# Separate majority and minority classes in training data for upsampling 
data_majority = train[train['airline_sentiment'] == 'negative']
data_minority = train[train['airline_sentiment'] == 'positive']

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
 
# Display new class counts
print("After upsampling\n",data_upsampled.airline_sentiment.value_counts(),sep = "")

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['text'].values)
X_train = pad_sequences(X_train,maxlen=29)
Y_train = pd.get_dummies(data_upsampled['airline_sentiment']).values
print('x_train shape:',X_train.shape)

X_test = tokenizer.texts_to_sequences(test['text'].values)
X_test = pad_sequences(X_test,maxlen=29)
Y_test = pd.get_dummies(test['airline_sentiment']).values
print("x_test shape", X_test.shape)

majority class before upsample: (7342, 2)
minority class before upsample: (1890, 2)
After upsampling
positive    7342
negative    7342
Name: airline_sentiment, dtype: int64
x_train shape: (14684, 29)
x_test shape (2309, 29)


In [42]:
# model
embed_dim = 128
lstm_out = 192

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 29, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 192)               246528    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 386       
Total params: 502,914
Trainable params: 502,914
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
batch_size = 128
# also adding weights
class_weights = {0: 1 ,
                1: 1.6/bias }
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1,
          class_weight=class_weights)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x209496d1d88>

In [44]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[1659  177]
 [  78  395]]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93      1836
           1       0.69      0.84      0.76       473

    accuracy                           0.89      2309
   macro avg       0.82      0.87      0.84      2309
weighted avg       0.90      0.89      0.89      2309



In [45]:
# running model to few more epochs
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1,
          class_weight=class_weights)
Y_pred = model.predict_classes(X_test,batch_size = batch_size)
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
confusion matrix [[1688  148]
 [  87  386]]
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      1836
           1       0.72      0.82      0.77       473

    accuracy                           0.90      2309
   macro avg       0.84      0.87      0.85      2309
weighted avg       0.90      0.90      0.90      2309



In [52]:
import tweepy
import csv

# Consumer keys and access tokens, used for OAuth
consumer_key = '****************'
consumer_secret = '*********************'
access_token = '***********************'
access_token_secret = '*************************'

# OAuth process, using the keys and tokens
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Creation of the actual interface, using authentication
api = tweepy.API(auth, wait_on_rate_limit=True)

all_twt = []
hashtag_phrase = '@IndiGo6E'
#get the name of the spreadsheet we will write to
fname = '_'.join(re.findall(r"#(\w+)", hashtag_phrase))

#open the spreadsheet we will write to
with open('%s.csv' % (fname), 'w', encoding="utf-8") as file:

    w = csv.writer(file)

    #write header row to spreadsheet
    w.writerow(['timestamp', 'tweet_text', 'username', 'all_hashtags', 'followers_count'])

    #for each tweet matching our hashtags, write relevant info to the spreadsheet
    for tweet in tweepy.Cursor(api.search, q=hashtag_phrase+' -filter:retweets', \
                                   lang="en", tweet_mode='extended').items():
        all_twt.append(tweet.full_text)
        w.writerow([tweet.created_at, tweet.full_text.replace('\n',' ').encode('utf-8'), tweet.user.screen_name.encode('utf-8'), [e['text'] for e in tweet._json['entities']['hashtags']], tweet.user.followers_count])

# all_twt = []
# for tweet in tweepy.Cursor(api.search, q='@IndiGo6E'+' -filter:retweets',lang="en", tweet_mode='extended').items():
#     #print(tweet.full_text)
#     all_twt.append(tweet.full_text)

In [53]:
print(len(all_twt))

193


In [61]:

#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(all_twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
print(len(twt))
cls = model.predict_classes(twt,batch_size = batch_size)
# for tt in twt:
#     print(tt)
#     sentiment = model.predict(tt,batch_size=1,verbose = 2)[0]
#     if(np.argmax(sentiment) == 0):
#         print("negative")
#         neg += 1
#     elif (np.argmax(sentiment) == 1):
#         print("positive")
#         pos += 1

193


In [64]:
cls

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [63]:
pos = 0
neg = 0
for i in cls:
    if i == 0:
        neg += 1
    else:
        pos += 1

print('total no of positive reviews are ', pos)

print('total no of negative reviews are ', neg)


total no of positive reviews are  38
total no of negative reviews are  155


In [68]:
twt = all_twt[0]
print(twt)
#vectorizing the tweet by the pre-fitted tokenizer instance
# twt = tokenizer.texts_to_sequences(twt)
# #padding the tweet to have exactly the same shape as `embedding_2` input
# twt = pad_sequences(twt, maxlen=29, dtype='int32', value=0)
# print(twt)
# sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
# if(np.argmax(sentiment) == 0):
#     print("negative")
# elif (np.argmax(sentiment) == 1):
#     print("positive")

@submariner_RSG @IndiGo6E I think indico is one of the uncultured air company. They are doing miss behaving with his passenger on routine basis. All indian media has reported these incidents. We should avoid .....to save our self respect and money also.
