In [1]:
import numpy as np
import pandas as pd

#!pip install wordcloud
#from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('AirlineTweets.csv')
data = data.sample(frac=1).reset_index(drop=True)
data = data[['airline_sentiment','text']]
print(data.shape)

data.head()

(14640, 2)


Unnamed: 0,airline_sentiment,text
0,negative,"@united sitting on the runway in Newark, they ..."
1,negative,@SouthwestAir the last 4 times I've arrived @L...
2,positive,@united Thanks for the reminder. It's been a f...
3,negative,@united when an airline causes the missed conn...
4,negative,@USAirways offloading the plane?!?!?! This is ...


In [3]:
# Data Pre-Processing

punctuation = '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

def remove_handle(text, pattern):
    r = re.findall(pattern,text)
    for word in r:
        text = re.sub(word, '', text)
    return text


data['clean_text'] = data['text'].apply(lambda x: x.lower())
data['clean_text'] = np.vectorize(remove_handle)(data['clean_text'],'@[\w]*')
data['clean_text'] = data['clean_text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))
#data.head()
  

In [12]:
# Exploratory Data Analysis: Visualization of frequent words in tweets
all_words = ' '.join([tweet for tweet in data['clean_text']])
wordcloud = WordCloud(width=750, height=450, random_state=33, max_font_size=100).generate(all_words)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

NameError: name 'WordCloud' is not defined

In [None]:
# Exploratory Data Analysis: visualization of frequent words in tweets labeled positive
all_words = ' '.join([tweet for tweet in data['clean_text'][data['airline_sentiment']=='positive']])
wordcloud = WordCloud(width=750, height=450, random_state=42, max_font_size=100).generate(all_words)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Exploratory Data Analysis: visualization of frequent words in tweets labeled neutral
all_words = ' '.join([tweet for tweet in data['clean_text'][data['airline_sentiment']=='neutral']])
wordcloud = WordCloud(width=750, height=450, random_state=42, max_font_size=100).generate(all_words)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Exploratory Data Analysis: visualization of frequent words in tweets labeled negative
all_words = ' '.join([tweet for tweet in data['clean_text'][data['airline_sentiment']=='negative']])
wordcloud = WordCloud(width=750, height=450, random_state=42, max_font_size=100).generate(all_words)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [4]:
# Tokenization, Padding sequences
max_words = 4500
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(data['clean_text'].values) # creates an internal dictionary based on the tweets

x = tokenizer.texts_to_sequences(data['clean_text'].values) # replaces words in the tweets with corresponding int values
x = pad_sequences(x) # padding our text vector so they all have the same length
#len(x)
#print(x.shape)
#print(x.shape[1])


In [96]:
model = Sequential()
output_space_dim = 256
model.add(Embedding(input_dim=max_words, output_dim=output_space_dim, input_length=x.shape[1])) # input_lenth=32=number of words in a tweet
model.add(Dropout(rate=0.3)) # helps prevent overfitting, 30% input units are dropped (set to 0) during training time while the rest are scaled up so that the sum is unchanged
model.add(LSTM(units=output_space_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
model.add(LSTM(units=output_space_dim, dropout=0.3, recurrent_dropout=0.2))
model.add(Dense(units=3, activation='softmax')) # densely connected NN layer
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])
print(model.summary())

Y = pd.get_dummies(data['airline_sentiment']).values #converts categorical data .i.e. labels to dummy variables
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

RNNmodel = model.fit(x_train, y_train, epochs=8, verbose=2) # default batch size = 32

model.save('sentiment_analysis.h5')

predictions = model.predict(x_test)

#[print(data['text'][i], predictions[i], y_test[i]) for i in range(0, 10)]

# pos_count, neu_count, neg_count = 0, 0, 0
# real_pos, real_neu, real_neg = 0, 0, 0
# for i, prediction in enumerate(predictions):
#     if np.argmax(prediction)==2:
#         pos_count += 1
#     elif np.argmax(prediction)==1:
#         neu_count += 1
#     else:
#         neg_count += 1
    
#     if np.argmax(y_test[i])==2:
#         real_pos += 1
#     elif np.argmax(y_test[i])==1:    
#         real_neu += 1
#     else:
#         real_neg +=1

# print('Positive predictions:', pos_count)
# print('Neutral predictions:', neu_count)
# print('Negative predictions:', neg_count)
# print('Real positive:', real_pos)
# print('Real neutral:', real_neu)
# print('Real negative:', real_neg)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 32, 256)           1152000   
                                                                 
 dropout_11 (Dropout)        (None, 32, 256)           0         
                                                                 
 lstm_22 (LSTM)              (None, 32, 256)           525312    
                                                                 
 lstm_23 (LSTM)              (None, 256)               525312    
                                                                 
 dense_11 (Dense)            (None, 3)                 771       
                                                                 
Total params: 2,203,395
Trainable params: 2,203,395
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/8
321/321 - 168s - loss: 0.671

In [17]:
plt.plot(RNNmodel.history['accuracy'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

NameError: name 'RNNmodel' is not defined

In [10]:
# Naive-Bayes Classifier

index = range(len(data['clean_text']))
#data.drop('text', axis=1, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(data,index,test_size=0.20,random_state=0)

train_tweets=[]
true_sentiments=[]        
word_features=[]
#stopwords = stopwords.words("english")

print("stopwords done")

def get_feature_vector(tweet):
    feature_vector = []
    words = tweet.split()
    for word in words:
        #replace two or more occurences of a word with two occurrences
        repetition = re.compile(r"(.)\1{1,}", re.DOTALL)
        word = repetition.sub(r"\1\1", word)
        #check if the word begins with a letter or number
        val = re.search(r"^[a-z][a-z0-9]*$", word)
        #ignore if it is a stop word
        #if(word in stopwords or val is None):
        if(val is None):
            continue
        else:
            feature_vector.append(word)
    return feature_vector

print("ah")

for i in range(len(x_train)):
    sentiment = x_train['airline_sentiment'][y_train[i]]
    tweet = x_train['clean_text'][y_train[i]]
    feature_vector = get_feature_vector(tweet)
    word_features.extend(feature_vector)
    train_tweets.append((feature_vector, sentiment))

    
print("feature vectors!")    

for i in range(len(x_test)):
    sentiment = x_test['airline_sentiment'][y_test[i]]
    true_sentiments.append((sentiment))
    
print("true sentiments!")
                             
# Extracts each feature from the tweet and stores it in a dictionary            
def extract_features(tweet):
    words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in words)
    return features
word_features = list(set(word_features))

print("extraction done")
                             
training = nltk.classify.util.apply_features(extract_features, train_tweets)
classifier = nltk.NaiveBayesClassifier.train(training)
classifier.show_most_informative_features()
test = x_test['clean_text'].apply(lambda tweet:extract_features(get_feature_vector(tweet)))
test_sentiment = test.apply(lambda x: classifier.classify(x))

print("classified!")

# Finding overall accuracy and accuracy for positive, neutral, negative tweets each
accuracy = 0,
true_pos, true_neu, true_neg = 0,0,0
pos_accuracy, neu_accuracy, neg_accuracy = 0,0,0

for i in range(len(x_test)):
    
    if (test_sentiment[y_test[i]]==true_sentiments[i]):
        accuracy += 1
        if(test_sentiment[y_test[i]]=='positive'):
            true_pos += 1
        elif(test_sentiment[y_test[i]]=='neutral'):
            true_neu += 1
        else:
            true_neg = true_neg + 1
    
    if (true_sentiments[i]=='positive'):
        pos_accuracy += 1
    elif (true_sentiments[i]=='neutral'):
        neu_accuracy += 1
    else:
        neg_accuracy += 1         

accuracy = accuracy/float(len(x_test))
pos_precision = true_pos/float(pos_pre)
neu_precision = true_neu/float(neu_pre)
neg_precision = true_neg/float(neg_pre)

print ("Accuracy is " + str(accuracy))
print ("Positive precision is " + str(pos_precision))
print ("Neutral precision is " + str(neu_precision))
print ("Negative precision is " + str(neg_precision))


stopwords done
ah
feature vectors!
true sentiments!
extraction done
Most Informative Features
         contains(kudos) = True           positi : negati =     63.2 : 1.0
     contains(wonderful) = True           positi : negati =     50.3 : 1.0
      contains(favorite) = True           positi : negati =     32.3 : 1.0
      contains(passbook) = True           positi : negati =     32.3 : 1.0
       contains(helpful) = True           positi : neutra =     30.0 : 1.0
     contains(beautiful) = True           positi : negati =     29.7 : 1.0
   contains(outstanding) = True           positi : negati =     29.7 : 1.0
     contains(fantastic) = True           positi : negati =     27.1 : 1.0
           contains(lt3) = True           positi : negati =     27.1 : 1.0
        contains(prompt) = True           positi : negati =     27.1 : 1.0
classified!


TypeError: can only concatenate tuple (not "int") to tuple