# Twitter sentiment analysis with Neural Networks and Keras

## Building the network architecture

In [0]:
from keras.models import Sequential
from keras.layers import Dense

In [0]:
clf_model = Sequential()

In [0]:
# Feed-forward classifier: 3000 input features -> 300 -> 20 -> 3 class scores.
clf_model.add(Dense(units=300, activation='relu', input_dim=3000))
clf_model.add(Dense(units=20, activation='relu'))
# The output layer uses softmax so the 3 outputs form a probability
# distribution over the classes, which is what the categorical_crossentropy
# loss expects. A relu output can emit all zeros / unbounded values and
# makes that loss ill-defined.
clf_model.add(Dense(units=3, activation='softmax'))

In [0]:
from keras import optimizers

In [0]:
# Adamax optimizer with a small step size (1e-4).
# NOTE(review): `lr` is the argument name used here; some Keras releases spell
# it `learning_rate` instead -- confirm against the installed Keras version.
adm = optimizers.Adamax(lr = 0.0001)
# Multi-class setup: categorical cross-entropy loss on one-hot targets,
# reporting accuracy during training.
clf_model.compile(optimizer = adm , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

## Loading the data from Google Drive

In [0]:
from google.colab import drive

In [314]:
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
path_test = '/content/gdrive/My Drive/Project Twitter sentiments/test_twitter_x_test.csv'
path_train = '/content/gdrive/My Drive/Project Twitter sentiments/training_twitter_x_y_train.csv'

In [0]:
import pandas as pd

In [317]:
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...)
# wrapper is needed.
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)
# Show the column names of both frames (the training set additionally
# carries the 'airline_sentiment' label column).
df_train.columns, df_test.columns

(Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
        'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
        'tweet_created', 'tweet_location', 'user_timezone'],
       dtype='object'),
 Index(['tweet_id', 'airline', 'airline_sentiment_gold', 'name',
        'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
        'tweet_created', 'tweet_location', 'user_timezone'],
       dtype='object'))

In [0]:
train_tweets = df_train['text']
y_train = df_train['airline_sentiment']
test_tweets = df_test['text']

In [319]:
y_train.unique()

array(['negative', 'positive', 'neutral'], dtype=object)

The target has three classes (negative, positive, neutral), so the output layer of the network needs 3 units.

In [320]:
from nltk import tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
import string
from nltk import pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def tokenized_data(data):
  """Split every text in `data` into word tokens.

  Each item is passed to `tokenize.word_tokenize`; the result is a list
  holding one token list per input item, in the same order.
  """
  return [tokenize.word_tokenize(text) for text in data]

In [0]:
train_data , test_data = tokenized_data(train_tweets) , tokenized_data(test_tweets)

In [0]:
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation))

lemmatizer = WordNetLemmatizer()

In [0]:
def getsimplepos(tag):
  """Map a POS tag string to one of the four WordNet POS constants.

  Tags starting with 'J', 'R' or 'V' map to wordnet.ADJ, wordnet.ADV and
  wordnet.VERB respectively; every other tag falls back to wordnet.NOUN.
  """
  prefix_to_attr = (('J', 'ADJ'), ('R', 'ADV'), ('V', 'VERB'))
  for prefix, attr_name in prefix_to_attr:
    if tag.startswith(prefix):
      return getattr(wordnet, attr_name)
  # Anything that is not an adjective, adverb or verb is treated as a noun.
  return wordnet.NOUN

In [0]:
def cleaning(words):
  """Drop stop words / punctuation and lemmatize the remaining tokens.

  A token is skipped when its lowercase form is in `stop_words`. Each kept
  token is POS-tagged on its own (a one-element list is passed to
  `pos_tag`), lemmatized with the matching WordNet POS, and lowercased
  afterwards. Returns the list of cleaned tokens in their original order.
  """
  kept = []
  for token in words:
    if token.lower() in stop_words:
      continue
    tagged = pos_tag([token])
    wn_pos = getsimplepos(tagged[0][1])
    # Lemmatize the token as written, then lowercase the lemma.
    lemma = lemmatizer.lemmatize(token, pos = wn_pos)
    kept.append(lemma.lower())
  return kept

In [0]:
xtrain_words = [cleaning(words) for words in train_data]
xtest_words = [cleaning(words) for words in test_data]

In [0]:
x_train = [" ".join(line) for line in xtrain_words]
x_test = [" ".join(line) for line in xtest_words]

In [328]:
x_train[0],x_test[0]

('southwestair schedule morning 2 day fact yes..not sure even flight one cancelled flightled',
 "americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2")

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(max_features = 3000 , ngram_range= (1,2))

In [0]:
x_train_final = count_vec.fit_transform(x_train)
x_test_final = count_vec.transform(x_test)

In [0]:
import numpy as np
# Keras needs dense arrays; .toarray() converts the sparse count matrices
# straight to ndarrays without the intermediate np.matrix that .todense()
# produces.
X_train_for_neural = x_train_final.toarray()
X_test_for_neural = x_test_final.toarray()

In [0]:
def onehotencode(y_data):
  """One-hot encode sentiment labels.

  Column order is [positive, negative, neutral], matching the index
  mapping used by `reversehotencode`. Any label that is neither
  'positive' nor 'negative' is encoded as neutral (index 2).

  Returns a list with one 3-element 0/1 list per input label.
  """
  y_output_data = []
  for y in y_data :
    y_output = [0 for i in range(3)]
    # Fixed: the label was previously misspelled as 'poistive', so positive
    # samples never matched and were all encoded as neutral.
    if y == 'positive':
      y_output[0] = 1
    elif y == 'negative' :
      y_output[1] = 1
    else:
      y_output[2] = 1
    y_output_data.append(y_output)
  return y_output_data

In [333]:
np.array(onehotencode(y_train)).shape

(10980, 3)

In [334]:
clf_model.fit(X_train_for_neural,np.array(onehotencode(y_train)) , epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f980d6edc88>

In [0]:
def reversehotencode(encoded_y):
  """Turn rows of per-class scores back into sentiment label strings.

  For each row the index of the largest value is taken with np.argmax:
  index 0 -> 'positive', index 1 -> 'negative', any other index ->
  'neutral'. Returns the labels as a list, one per input row.
  """
  first_two = ('positive', 'negative')
  labels = []
  for scores in encoded_y:
    best = np.argmax(scores)
    labels.append(first_two[best] if best < 2 else 'neutral')
  return labels

In [0]:
hot_encoded_Ypred = clf_model.predict(X_test_for_neural)
Ypred = reversehotencode(hot_encoded_Ypred)

In [0]:
np.savetxt('predictions_from_neural_keras.csv', Ypred , fmt = '%s')