In [0]:
import numpy as np 
import pandas as pd 
import nltk
from google.colab import drive
drive.mount('/content/gdrive')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
from keras.utils import to_categorical
import random
from tensorflow import set_random_seed
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
lemmatizer = WordNetLemmatizer()

#set random seed for the session and also for tensorflow that runs in background for keras
set_random_seed(123)
random.seed(123)
train= pd.read_csv("gdrive/My Drive/train.csv")
test = pd.read_csv("gdrive/My Drive/test_data.csv")



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Using TensorFlow backend.
  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
train.head()
train.shape
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['text']):
        
        #remove html content
        review_text = BeautifulSoup(sent).get_text()
        
        #remove non-alphabetic characters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
        #tokenize the sentences
        words = word_tokenize(review_text.lower())
    
        #lemmatize each word to its lemma
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
    
        reviews.append(lemma_words)

    return(reviews)

#cleaned reviews for both train and test set retrieved
train_sentences = clean_sentences(train)
test_sentences = clean_sentences(test)
print(len(train_sentences))
print(len(test_sentences))

100%|██████████| 400000/400000 [08:15<00:00, 807.04it/s]
100%|██████████| 100000/100000 [02:05<00:00, 799.13it/s]

400000
100000





In [0]:
train= pd.read_csv("gdrive/My Drive/train.csv")
train = train.drop(273514,axis=0)
train['label'] = pd.to_numeric(train.label, errors='coerce')
test = pd.read_csv("gdrive/My Drive/test_data.csv")
train.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,label,text
0,1,Great mobile app with nice reward program. Mak...
1,2,Really fast and polite. Definitely recommend. ...
2,2,"This place is always amazing, friendly staff a..."
3,1,We did a Wine 101 class on a Friday night. Coo...
4,1,I am rounding up because I think this place ma...


In [0]:
target=train.label.values
y_target=to_categorical(target)
num_classes=y_target.shape[1]
X_train,X_val,y_train,y_val=train_test_split(train_sentences,y_target,test_size=0.2,stratify=y_target)
 #It is needed for initializing tokenizer of keras and subsequent padding

unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    
    unique_words.update(sent)
    
    if(len_max<len(sent)):
        len_max = len(sent)
        
#length of the list of unique_words gives the no of unique words
print(len(list(unique_words)))
print(len_max)

100%|██████████| 320000/320000 [00:02<00:00, 115808.71it/s]

139494
1029





In [0]:
tokenizer = Tokenizer(num_words=len(list(unique_words)))
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test_sentences)

#padding done to equalize the lengths of all input reviews. LSTM networks needs all inputs to be same length.
#Therefore reviews lesser than max length will be made equal using extra zeros at end. This is padding.
X_train = sequence.pad_sequences(X_train, maxlen=len_max)
X_val = sequence.pad_sequences(X_val, maxlen=len_max)
X_test = sequence.pad_sequences(X_test, maxlen=len_max)
print(X_train.shape,X_val.shape,X_test.shape)

(320000, 1029) (80000, 1029) (100000, 1029)


In [0]:
early_stopping = EarlyStopping(min_delta = 0.001, mode = 'max', monitor='val_acc', patience = 2)
callback = [early_stopping]

#Model using Keras LSTM
model=Sequential()
model.add(Embedding(len(list(unique_words)),300,input_length=len_max))
model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model.add(Dense(100,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.005),metrics=['accuracy'])
model.summary()

W0619 08:43:09.904083 139628863997824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0619 08:43:09.928479 139628863997824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0619 08:43:09.931909 139628863997824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0619 08:43:10.037517 139628863997824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0619 08:43:10.047717 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1029, 300)         41848200  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1029, 128)         219648    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 42,124,059
Trainable params: 42,124,059
Non-trainable params: 0
________________________________________________________________

In [0]:
#This is done for learning purpose only. One can play around with different hyper parameters combinations
#and try increase the accuracy even more. For example, a different learning rate, an extra dense layer 
# before output layer, etc. Cross validation could be used to evaluate the model and grid search 
# further to find unique combination of parameters that give maximum accuracy. This model has a validation
#accuracy of around 66.5%
history=model.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=1, batch_size=180, verbose=1, callbacks=callback)

W0619 08:43:10.834344 139628863997824 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 320000 samples, validate on 80000 samples
Epoch 1/1


In [0]:
train_downloaded = drive.CreateFile({'id': '1Qtn7zg3nJyHBxK9W4_UQ4n3YE8wZCanM'})
train_downloaded.GetContentFile('train.csv')
test_downloaded = drive.CreateFile({'id': '1Qtn7zg3nJyHBxK9W4_UQ4n3YE8wZCanM'})
test_downloaded.GetContentFile('test.csv')
sample_downloaded = drive.CreateFile({'id': '11qHE-xKXeeG1otqjm7nS25N2iGAzSUmy'})
sample_downloaded.GetContentFile('sampleTrain.csv')

In [0]:
tick = time.time()
df = pd.read_csv("train.csv", dtype={"label": object, "text": object})
y = df.label 
text = df.text

print(df.head())
#Convert all text to lower case
text = text.apply(lambda x: " ".join(x.lower() for x in str(x).split()))

#Remove all punctuation
for i in range(0, len(text)):
    text[i] = re.sub(r'[^\w\s]', "", text[i])
    
#for index,row in df.iterrows():
#    if df.iloc[index]['text'] == 'My husband and I had not purchased a home before and we definitely needed some hand holding. They were patient and professional. We got our dream home and the entire experience was awesome! Thank you so much ladies for a job well done!':
#        df.drop(index, inplace=True)

#Remove stop words (removes important words! needs modification)
stop = stopwords.words("english")
stop = stop[:143]
stop.remove("not")
stop.remove("against")
stop.remove("no")
stop.append("My husband and I had not purchased a home before and we definitely needed some hand holding. They were patient and professional. We got our dream home and the entire experience was awesome! Thank you so much ladies for a job well done! ")
text = text.apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))

#Lemmatize (not working)
#text = text.apply(lambda x: " ".join(Word(x).lemmatize() for x in x.split()))

#Stemming

#text.drop("My husband and I had not purchased a home before and we definitely needed some hand holding. They were patient and professional. We got our dream home and the entire experience was awesome! Thank you so much ladies for a job well done! ")
#seperating words into lists
for i, line in enumerate(text):
    text[i] = line.split()
#print(text)
#df.text = text
#text = list(text)
print(time.time() - tick)

In [0]:
df = df.apply(pd.to_numeric, args=('coerce',))

In [0]:
X = text

Y = df['label']
X

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [0]:
max_features = 20000
#tokenizer = Tokenizer(num_words=max_features)
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ', lower=True,split=' ')
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)


In [0]:
X = tokenizer.texts_to_sequences(text)
X = pad_sequences(X)


Y = pd.get_dummies(y.values)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(y.values)
print(Y_train.values)


In [0]:




maxlen = 100
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [0]:
X.reshape(-1,1)

In [0]:
X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size = 0.2)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
print(y.values)

In [0]:
Y_train=Y_train.drop('My husband and I had not purchased a home before and we definitely needed some hand holding. They were patient and professional. We got our dream home and the entire experience was awesome! Thank you so much ladies for a job well done!',axis = 1)


In [0]:

Y_test.reshape(-1,1)
X_test.reshape(-1,1)

In [0]:

maxlen=571
inp = Input(shape=(maxlen, ))
embed_size = 300
#x = Sequential()
x = Embedding(max_features, embed_size)(inp)
x = LSTM(300, return_sequences=True,name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(256, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(64, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 571)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 571, 300)          6000000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 571, 300)          721200    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 300)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               77056     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
__________

In [0]:
print("start fitting...")
model.fit(X_train, Y_train ,validation_data=(X_test,Y_test), epochs=2,verbose=1, batch_size=1000)

In [0]:
scores = model.evaluate(X_te, y_test)
print("\n%s: %.4f%%" % (model.metrics_names[1], scores[1]*100))
y_pred = model.predict(X_te, batch_size=8000)
y_classes = y_pred.argmax(axis=-1)

In [0]:
z=np.array(y_test)
d=y_test.astype(int)
d
z=np.array(d)
z[0]
Y_TEST=z

In [0]:

accScore = metrics.accuracy_score(Y_TEST,y_classes)

lbl = [0,1,2]
precision = metrics.precision_score(Y_TEST,y_classes,average=None,labels=lbl)
recall = metrics.recall_score(Y_TEST,y_classes,average=None,labels=lbl)
f1Score = metrics.f1_score(Y_TEST,y_classes,average=None,labels=lbl)

print("\nOverall Acurracy: ",accScore,"\n")

for i in range(len(lbl)):
    print("Precision of %s class: %f" %(lbl[i],precision[i]))
    print("Recall of %s class: %f" %(lbl[i],recall[i]))
    print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")


In [0]:
import utils

In [0]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
kernel_size=3
embedding_matrix = np.random.randn(20000, 128) * 0.01

In [0]:
inp = Input(shape=(100, ))
embed_size = 128
x = Embedding(20000,128,weights=[embedding_matrix])(inp)#input_length=maxlen)(inp)
x=(Dropout(0.4))(x)
x=(Conv1D(600, kernel_size, padding='valid', activation='relu', strides=1))(x)
x=(Conv1D(300, kernel_size, padding='valid', activation='relu', strides=1))(x)
x=(Conv1D(150, kernel_size, padding='valid', activation='relu', strides=1))(x)
x=(Conv1D(75, kernel_size, padding='valid', activation='relu', strides=1))(x)
x=(Flatten())(x)
x=(Dense(600))(x)
x=(Dropout(0.5))(x)
x=(Activation('relu'))(x)
x=(Dense(1))(x)
x=(Activation('softmax'))(x)
model=Model(inputs=inp, outputs=x)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
model.summary()

In [0]:
kaggle competitions submit -c sent -f submission.csv -m "Message"