### Twitter Sentiment Analysis


In [1]:
# importing the libraries
# data manipulation
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
# importing the dataset
data = pd.read_csv('dataset.csv')

In [3]:
# viewing the first 5 rows
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/2015 11:15,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/2015 11:14,,Pacific Time (US & Canada)


### Cleaning the texts

In [4]:
data.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
import re
# importing nltk libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


In [6]:
lemm = WordNetLemmatizer()

In [7]:
corpus = []
n = len(data['text'])

In [8]:
# Clean every tweet: keep letters only, lowercase, drop stopwords, lemmatize.
# The stopword set is built once instead of re-reading the NLTK list for every
# tweet, and a set gives constant-time membership tests. 'not' is kept because
# it carries sentiment ("not good").
stopword = set(stopwords.words('english')) - {'not'}

# Start from an empty corpus so that re-running this cell does not append a
# second copy of every tweet.
corpus = []
# Iterate over the values directly; this does not depend on the frame having a
# default 0..n-1 index.
for text in data['text']:
    review = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    review = [lemm.lemmatize(word) for word in review if word not in stopword]
    review = ' '.join(review)
    corpus.append(review)

### Bag of Words Model

In [9]:
# Bag of words: learn the vocabulary from the cleaned tweets and count term
# occurrences per tweet; .toarray() converts the sparse result to a dense matrix.
# NOTE(review): the vocabulary is fitted on all tweets before the train/test
# split, so words that only occur in test tweets are part of the feature space.
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()
# target labels (the airline_sentiment column) as a NumPy array
y = data["airline_sentiment"].values

### Splitting the data


In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

In [11]:
len(x_train)

11712

In [12]:
len(x_test)

2928

### Model

In [13]:
from sklearn.naive_bayes import GaussianNB
# NOTE(review): GaussianNB fits a per-class normal distribution to each feature;
# the features here are word counts, for which MultinomialNB is the usual
# choice - worth comparing before drawing conclusions from this baseline.
model = GaussianNB()

# fitting the data
model.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
# predicting the results on the held-out test split
y_pred = model.predict(x_test)
print(y_pred,y_test)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'negative'] ['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


In [15]:
# confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order (negative, neutral, positive)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fraction of test tweets whose predicted label equals the true label
accuracy_score(y_test, y_pred)

[[873 372 625]
 [120 210 284]
 [ 81  58 305]]


0.47404371584699456

In [22]:
# predicting if the review is positive, negative or neutral
def reviews(new_review):
    """Predict the sentiment of one raw review with the Naive Bayes model.

    The text gets the same cleaning as the training tweets (letters only,
    lowercase, English stopwords except 'not' removed, lemmatized), is
    vectorized with the already-fitted ``cv`` and passed to ``model``.

    Parameters
    ----------
    new_review : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        One-element array holding the label returned by ``model.predict``.
    """
    # A set gives constant-time membership tests instead of scanning the whole
    # stopword list for every word; 'not' is kept because it flips sentiment.
    stopword = set(stopwords.words('english')) - {'not'}
    words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
    cleaned = ' '.join(lemm.lemmatize(word) for word in words if word not in stopword)
    # cv.transform expects an iterable of documents, hence the one-item list
    new_x_test = cv.transform([cleaned]).toarray()
    return model.predict(new_x_test)

In [23]:
reviews('American airlines is the best airline in the US')

array(['positive'], dtype='<U8')

The review was predicted correctly

In [24]:
reviews('the southwest airlines fly to dallas everyday')

array(['positive'], dtype='<U8')

The review was predicted incorrectly: it is neutral but was predicted as positive.

In [26]:
reviews('I never fly in american airlines since its not good')

array(['positive'], dtype='<U8')

The review was predicted incorrectly as positive instead of negative.

In [27]:
# Implementing using Deep Learning model
from sklearn.preprocessing import LabelEncoder

# encode class values as integers; LabelEncoder numbers the labels in sorted
# order, so negative -> 0, neutral -> 1, positive -> 2
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)

# convert integers to dummy variables (i.e. one hot encoded).
# tf.keras.utils is used so the whole notebook relies on the same Keras
# implementation as the model cells, instead of the standalone
# keras.utils.np_utils module, which newer Keras releases no longer ship.
dummy_y = tf.keras.utils.to_categorical(encoded_y)
print(dummy_y)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


Using TensorFlow backend.


In [29]:
# splitting the data for the neural network (features x, one-hot targets dummy_y)
# NOTE(review): this reassigns y_train and y_test, which the Naive Bayes cells
# defined earlier as string labels; re-running those cells after this one would
# pick up the one-hot arrays instead. The sample count, test_size and
# random_state match the earlier split, so X_train/X_test contain the same rows
# as x_train/x_test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, dummy_y, test_size = 0.20, random_state = 0)

In [34]:
# initialize the ANN model
model1 = tf.keras.models.Sequential()

In [35]:
# Build the network in a single expression so that re-running this cell starts
# from a fresh model instead of stacking three more layers onto the existing one.
model1 = tf.keras.models.Sequential([
    # first hidden layer; the input width is read from the training matrix
    # rather than hard-coded as 12573, so it follows the vocabulary size
    tf.keras.layers.Dense(units=200, input_dim=X_train.shape[1], activation='relu'),
    # second hidden layer
    tf.keras.layers.Dense(units=200, activation='relu'),
    # output layer: one softmax unit per sentiment class
    tf.keras.layers.Dense(units=3, activation='softmax'),
])
# compiling the model; categorical cross-entropy matches the one-hot targets
model1.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 200)               2514800   
_________________________________________________________________
dense_4 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 603       
Total params: 2,555,603
Trainable params: 2,555,603
Non-trainable params: 0
_________________________________________________________________


In [36]:
# training the model
model1.fit(X_train, y_train, batch_size = 32, epochs = 50)

Train on 11712 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f17d014db10>

In [39]:
# predicting if the review is positive, negative or neutral using deep learning model
def reviews1(new_review):
    """Print the sentiment the neural network assigns to one raw review.

    The text gets the same cleaning as the training tweets (letters only,
    lowercase, English stopwords except 'not' removed, lemmatized), is
    vectorized with the already-fitted ``cv`` and scored by ``model1``.

    Parameters
    ----------
    new_review : str
        Raw review text.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # A set gives constant-time membership tests; 'not' is kept because it
    # flips sentiment.
    stopword = set(stopwords.words('english')) - {'not'}
    words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
    cleaned = ' '.join(lemm.lemmatize(word) for word in words if word not in stopword)
    new_x_test = cv.transform([cleaned]).toarray()
    # model1.predict returns one row of class probabilities per input document
    probabilities = model1.predict(new_x_test)[0]
    # Column order of the one-hot targets: 0 = negative, 1 = neutral, 2 = positive.
    labels = ("Negative", "Neutral", "Positive")
    # Pick the most probable class. Rounding the probabilities instead would
    # turn every entry into 0 whenever no class exceeds 0.5, and the review
    # would then be reported as Neutral regardless of the scores.
    best = int(probabilities.argmax())
    print("The sentiment of the review is " + labels[best])

In [40]:
reviews1('American airlines is the best airline in the US')

The sentiment of the review is Positive


In [41]:
reviews1('the southwest airlines fly to dallas everyday')

The sentiment of the review is Neutral


In [42]:
reviews1('I never fly in american airlines since its not good')

The sentiment of the review is Negative


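The model classified all three example reviews correctly. Note that these are only three hand-written examples: the network is not evaluated on the held-out test set (`X_test`, `y_test`) in the cells above, so this does not measure its overall accuracy.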