In [48]:
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D, Embedding, SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np

In [49]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Iterating (rather than indexing [0]) keeps the cell working on CPU-only machines
# and applies the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [50]:
# Fetch the NLTK corpora used below: the stop-word list and WordNet (for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/arnav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
# Path (relative to the notebook) of the raw CSV that the next cell reads.
file="./archive/training.1600000.processed.noemoticon.csv"

In [52]:
# The CSV has no header row and is read as ISO-8859-1; column names are assigned afterwards.
df = pd.read_csv(
    file,
    header=None,
    encoding='ISO-8859-1',
)

In [53]:
# Name the six positional columns of the header-less CSV.
df.columns = ['sentiment','id','date','query','username','text']

In [54]:
# Preview the first rows to check that the column names line up with the data.
df.head()

Unnamed: 0,sentiment,id,date,query,username,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [55]:
# Keep only the label and the tweet text.
# Rebinding (instead of inplace=True) plus errors='ignore' makes the cell safe to
# re-run: a second execution is a no-op rather than a KeyError on missing columns.
df = df.drop(columns=['id','date','query','username'], errors='ignore')

In [56]:
# Confirm that only the 'sentiment' and 'text' columns remain.
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [57]:
# Count missing values per column.
df.isna().sum()

sentiment    0
text         0
dtype: int64

In [58]:
# List the distinct label values before remapping them.
df['sentiment'].unique()

array([0, 4])

In [59]:
# Remap label 4 to 1 so the two classes are encoded as 0 and 1.
df['sentiment'] = df['sentiment'].replace(to_replace=4, value=1)

In [60]:
# Check the class balance after remapping the labels.
df.sentiment.value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [61]:
# Stop words are only used for membership tests, so store them in a set
# (constant-time lookup instead of scanning a list for every token).
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
# Raw string so that `\S` reaches the regex engine without relying on Python's
# handling of unrecognised escape sequences; the pattern itself is unchanged.
# Alternatives, in order: @mentions, http/https URLs, and any run of characters
# that are not ASCII letters or digits.
# NOTE(review): the third alternative (`http?:\S`) only consumes one character
# after the colon and looks redundant with the second — confirm before removing.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [62]:
def preprocess(text, stem=False, lemmatize=False):
    text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    cleaned_text = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                if lemmatize:
                    cleaned_text.append(lemmatizer.lemmatize(stemmer.stem(token)))
                else:
                    cleaned_text.append(stemmer.stem(token))
            elif lemmatize:
                    cleaned_text.append(lemmatizer.lemmatize(stemmer.stem(token)))
            else:
                cleaned_text.append(token)
    return " ".join(cleaned_text)

In [63]:
# Clean every tweet; stemming and lemmatization are both switched off.
df['text'] = df['text'].apply(preprocess, stem=False, lemmatize=False)

In [64]:
# Inspect the cleaned text column.
df['text']

0               awww bummer shoulda got david carr third day
1          upset update facebook texting might cry result...
2          dived many times ball managed save 50 rest go ...
3                           whole body feels itchy like fire
4                                           behaving mad see
                                 ...                        
1599995                        woke school best feeling ever
1599996             thewdb com cool hear old walt interviews
1599997                      ready mojo makeover ask details
1599998    happy 38th birthday boo alll time tupac amaru ...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: text, Length: 1600000, dtype: object

In [65]:
# Fixed token-sequence length: used as `maxlen` when padding and as the model input length.
MAX_SEQUENCE_LENGTH = 30

In [66]:
# 80/20 train/test partition of the cleaned frame; the fixed seed makes it repeatable.
train_data, test_data = train_test_split(df, train_size=0.8, random_state=7)
for label, subset in (("Train Data size:", train_data), ("Test Data size", test_data)):
    print(label, len(subset))

Train Data size: 1280000
Test Data size 320000


In [67]:
# Fit the vocabulary on the training split only.
# `oov_token` is the token substituted for out-of-vocabulary words, so pass an
# explicit string marker rather than the boolean True (which ended up as a
# non-string key in `word_index`).
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['text'])

word_index = tokenizer.word_index
# +1 leaves room for index 0, the value pad_sequences uses for padding.
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 290576


In [91]:
# Serialise the fitted tokenizer so it can be reloaded later without refitting.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# To restore it: open 'tokenizer.pickle' in 'rb' mode and call pickle.load() on the handle.

In [68]:
# Convert each split to integer sequences and pad/truncate them to MAX_SEQUENCE_LENGTH.
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_data['text']),
    maxlen=MAX_SEQUENCE_LENGTH,
)
x_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_data['text']),
    maxlen=MAX_SEQUENCE_LENGTH,
)

for label, matrix in (("Training X Shape:", x_train), ("Testing X Shape:", x_test)):
    print(label, matrix.shape)

Training X Shape: (1280000, 30)
Testing X Shape: (320000, 30)


In [69]:
# Label vectors for the two splits, taken from the 'sentiment' column.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [70]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

In [71]:
# Map each word in the GloVe file to its float32 vector.
embeddings_index = {}

# Each line is parsed as: the word first, then its vector components, whitespace-separated.
# `with` guarantees the file is closed even if parsing fails, and the explicit
# encoding avoids depending on the platform's default locale.
with open('/home/arnav/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' %len(embeddings_index))

Found 400000 word vectors.


In [72]:
# Row i holds the pretrained vector for the word with tokenizer index i;
# words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
hit = 0
miss = 0
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is None:
        miss += 1
        continue
    hit += 1
    embedding_matrix[i] = vector
print("hit:",hit,"\nmiss:",miss)

hit: 91398 
miss: 199177


In [73]:
# Non-trainable embedding layer initialised from `embedding_matrix`.
# Note: the model cells in this notebook have the line that adds this layer
# commented out and build their own 50-dimensional Embedding instead.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [127]:
# Binary classifier: 50-d Embedding -> SpatialDropout -> Conv1D -> BiLSTM -> Dense -> sigmoid.
model=tf.keras.models.Sequential()
# `shape` must be a tuple; `(MAX_SEQUENCE_LENGTH)` without the trailing comma is a bare int.
model.add(Input(shape=(MAX_SEQUENCE_LENGTH,)))
# Embedding learned from scratch; the pretrained `embedding_layer` is not used here.
model.add(Embedding(vocab_size, 50, name="embedding"))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(512,activation='relu'))
# Single sigmoid unit, paired with binary_crossentropy in the compile cell.
model.add(Dense(1,activation='sigmoid'))



In [128]:
# Compile for binary classification and set up the training callbacks.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
# NOTE(review): no `patience` is passed to ReduceLROnPlateau, while training runs for
# 10 epochs with early stopping after 3 stale epochs — confirm the library default
# actually allows a learning-rate reduction to trigger.
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    verbose=1,
)
valLoss = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
# Checkpoint the full model whenever val_accuracy improves; the value is embedded in the filename.
save = tf.keras.callbacks.ModelCheckpoint(
    './model/model_{val_accuracy:.3f}.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
)

In [129]:
# Train the sigmoid model. The test split doubles as validation data here, so
# checkpoint selection and early stopping are driven by the test set.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=2048,
    validation_data=(x_test, y_test),
    callbacks=[ReduceLROnPlateau, valLoss, save],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [130]:
# Reload a saved checkpoint (this rebinds `model`) and re-save it under a fixed name.
# The checkpoint filename embeds a validation accuracy, so this hard-coded path only
# exists if a training run produced a checkpoint with exactly that value.
model=tf.keras.models.load_model("./model/model_0.786.h5")
model.save("my_h5_model.h5")



In [122]:
# Two-class softmax variant of the same Embedding -> Conv1D -> BiLSTM -> Dense stack.
model1=tf.keras.models.Sequential()
# `shape` must be a tuple; `(MAX_SEQUENCE_LENGTH)` without the trailing comma is a bare int.
model1.add(Input(shape=(MAX_SEQUENCE_LENGTH,)))
# Embedding learned from scratch; the pretrained `embedding_layer` is not used here.
model1.add(Embedding(vocab_size, 50, name="embedding"))
model1.add(SpatialDropout1D(0.2))
model1.add(Conv1D(64, 5, activation='relu'))
model1.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model1.add(Dense(512,activation='relu'))
# Two softmax units, paired with sparse_categorical_crossentropy in the compile cell.
model1.add(Dense(2,activation='softmax'))



In [123]:
# Compile the softmax model (integer labels -> sparse categorical loss) and set up callbacks.
model1.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
# NOTE(review): no `patience` is passed to ReduceLROnPlateau, while training runs for
# 10 epochs with early stopping after 3 stale epochs — confirm the library default
# actually allows a learning-rate reduction to trigger.
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    verbose=1,
)
valLoss = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
# Checkpoint the full model whenever val_accuracy improves; the value is embedded in the filename.
save = tf.keras.callbacks.ModelCheckpoint(
    './model_new_embed/model_{val_accuracy:.3f}.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
)

In [124]:
# Train the softmax model. The test split doubles as validation data here, so
# checkpoint selection and early stopping are driven by the test set.
history = model1.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=2048,
    validation_data=(x_test, y_test),
    callbacks=[ReduceLROnPlateau, valLoss, save],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [125]:
# Reload a saved checkpoint (this rebinds `model1`) and re-save it under a fixed name.
# The checkpoint filename embeds a validation accuracy, so this hard-coded path only
# exists if a training run produced a checkpoint with exactly that value.
model1=tf.keras.models.load_model("./model_new_embed/model_0.786.h5")
model1.save("my_h5_model1.h5")



In [126]:
# Score the reloaded checkpoint on the test split; the model was compiled with
# metrics=['accuracy'], so the result pairs the loss with accuracy.
model1.evaluate(x_test, y_test)



[0.45744165778160095, 0.7859343886375427]

In [93]:
# Softmax variant that uses ELU activations in the Conv1D and hidden Dense layers.
model2=tf.keras.models.Sequential()
# `shape` must be a tuple; `(MAX_SEQUENCE_LENGTH)` without the trailing comma is a bare int.
model2.add(Input(shape=(MAX_SEQUENCE_LENGTH,)))
# Embedding learned from scratch; the pretrained `embedding_layer` is not used here.
model2.add(Embedding(vocab_size, 50, name="embedding"))
model2.add(SpatialDropout1D(0.2))
model2.add(Conv1D(64, 5, activation='elu'))
model2.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model2.add(Dense(512,activation='elu'))
# Two softmax units, paired with sparse_categorical_crossentropy in the compile cell.
model2.add(Dense(2,activation='softmax'))



In [94]:
# Compile the ELU model (integer labels -> sparse categorical loss) and set up callbacks.
model2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)
# NOTE(review): no `patience` is passed to ReduceLROnPlateau, while training runs for
# 10 epochs with early stopping after 3 stale epochs — confirm the library default
# actually allows a learning-rate reduction to trigger.
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    verbose=1,
)
valLoss = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
# Checkpoint the full model whenever val_accuracy improves; the value is embedded in the filename.
save = tf.keras.callbacks.ModelCheckpoint(
    './model_new_embed_elu/model_{val_accuracy:.3f}.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
)

In [95]:
# Train the ELU model. The test split doubles as validation data here, so
# checkpoint selection and early stopping are driven by the test set.
history = model2.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=2048,
    validation_data=(x_test, y_test),
    callbacks=[ReduceLROnPlateau, valLoss, save],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [96]:
# Reload a saved checkpoint (this rebinds `model2`) and re-save it under a fixed name.
# The checkpoint filename embeds a validation accuracy, so this hard-coded path only
# exists if a training run produced a checkpoint with exactly that value.
model2=tf.keras.models.load_model("./model_new_embed_elu/model_0.784.h5")
model2.save("my_h5_model2.h5")



In [97]:
# Load the re-saved ELU model from disk for the inference example below.
modelelu=tf.keras.models.load_model("my_h5_model2.h5")



In [98]:
# Reload the pickled tokenizer; `with` closes the file handle, which the
# bare open() call previously left open.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer1 = pickle.load(handle)

In [99]:
# Make sure the stop-word corpus is available for the inference-time preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/arnav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [100]:
# Stop words are only used for membership tests, so store them in a set
# (constant-time lookup instead of scanning a list for every token).
stop_words_test = set(stopwords.words('english'))
# Raw string so that `\S` reaches the regex engine without relying on Python's
# handling of unrecognised escape sequences; the pattern itself is unchanged.
# Alternatives, in order: @mentions, http/https URLs, and any run of characters
# that are not ASCII letters or digits.
text_cleaning_re_test = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [102]:
# Sample sentence used to exercise the inference path end to end.
test_text="This is a nice car"

In [103]:
def preprocess_test(text):
    """Clean a piece of text for inference and return the kept tokens as one string.

    The input is converted with ``str()``, lower-cased, and every match of
    ``text_cleaning_re_test`` is replaced by a space; the result is split on
    whitespace and tokens found in ``stop_words_test`` are dropped.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The kept tokens joined by single spaces (empty string if none remain).
    """
    # Use the inference-section pattern defined alongside `stop_words_test`,
    # not the training-section `text_cleaning_re`, so this function only
    # depends on the globals created for it.
    text = re.sub(text_cleaning_re_test, ' ', str(text).lower()).strip()
    return " ".join(token for token in text.split() if token not in stop_words_test)

In [116]:
# Clean the sample sentence with the inference-time preprocessing function.
text_test=preprocess_test(test_text)

In [117]:
# Show the cleaned sample text.
text_test

'nice car'

In [118]:
# Tokenize and pad the cleaned sample with the same length used for training
# (MAX_SEQUENCE_LENGTH instead of a repeated literal 30, so the two cannot drift apart).
# NOTE: this rebinds `text_test` from the cleaned string to the padded array, so the
# cell is not safe to re-run without first re-running the preprocessing cell.
text_test=tf.keras.preprocessing.sequence.pad_sequences(tokenizer1.texts_to_sequences([text_test],),
                        maxlen = MAX_SEQUENCE_LENGTH)

In [119]:
# Show the padded integer sequence that will be fed to the model.
text_test

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,  61, 209]], dtype=int32)

In [120]:
# Run the loaded ELU model on the padded sample; it was built with a 2-unit
# softmax output, so the result has one value per class.
modelelu.predict(text_test)

array([[0.2666841 , 0.73331594]], dtype=float32)