In [25]:
# System
import os

# Time
import time
import datetime

# Numerical
import numpy as np
import pandas as pd

# Tools
import itertools
from collections import Counter

# NLP
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from pywsd.utils import lemmatize_sentence

# Preprocessing
from sklearn import preprocessing
from sklearn.utils import class_weight as cw
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup

# Model Selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metrics
from sklearn import metrics 
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report

# Deep Learing Preprocessing - Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical

# Deep Learning Model - Keras
from keras.models import Model
from keras.models import Sequential

# Deep Learning Model - Keras - CNN
from keras.layers import Conv1D, Conv2D, Convolution1D, MaxPooling1D, SeparableConv1D, SpatialDropout1D, \
    GlobalAvgPool1D, GlobalMaxPool1D, GlobalMaxPooling1D 
from keras.layers.pooling import _GlobalPooling1D
from keras.layers import MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling2D

from keras.layers import MaxPooling3D, GlobalMaxPooling3D, GlobalAveragePooling3D



# Deep Learning Model - Keras - RNN
from keras.layers import Embedding, LSTM, Bidirectional

# Deep Learning Model - Keras - General
from keras.layers import Input, Add, concatenate, Dense, Activation, BatchNormalization, Dropout, Flatten
from keras.layers import LeakyReLU, PReLU, Lambda, Multiply



# Deep Learning Parameters - Keras
from keras.optimizers import RMSprop, Adam

# Deep Learning Callbacs - Keras
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
print(os.listdir("input"))

['train.tsv', 'test_samples.csv', 'train_preprocessed.csv']


In [3]:
# Loading Data
rev_frame = pd.read_csv("input/train_preprocessed.csv")
t_id = pd.read_csv("input/test_samples.csv")

In [4]:
df=rev_frame.copy()

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,clean_tweet,sentiment
0,0,gas by my house hit am going to chapel hill on...,positive
1,1,theo walcott is still shit watch rafa and john...,negative
2,2,its not that am gsp fan just hate nick diaz ca...,negative
3,3,iranian general says israel iron dome cannot d...,negative
4,4,tehran mon amour obama tried to establish ties...,neutral


In [6]:
df=df[['clean_tweet','sentiment']]

In [7]:
df = df[:21629]

In [8]:
print(df.shape)
df.head()

(21629, 2)


Unnamed: 0,clean_tweet,sentiment
0,gas by my house hit am going to chapel hill on...,positive
1,theo walcott is still shit watch rafa and john...,negative
2,its not that am gsp fan just hate nick diaz ca...,negative
3,iranian general says israel iron dome cannot d...,negative
4,tehran mon amour obama tried to establish ties...,neutral


In [9]:
# check for null values
df.isnull().sum()

clean_tweet    0
sentiment      0
dtype: int64

In [10]:
df.head()

Unnamed: 0,clean_tweet,sentiment
0,gas by my house hit am going to chapel hill on...,positive
1,theo walcott is still shit watch rafa and john...,negative
2,its not that am gsp fan just hate nick diaz ca...,negative
3,iranian general says israel iron dome cannot d...,negative
4,tehran mon amour obama tried to establish ties...,neutral


In [11]:
df['sentiment'].value_counts()

positive    9155
neutral     9074
negative    3400
Name: sentiment, dtype: int64

In [12]:
# function to lematize the words
def clean_tweet(tweet):
    word_tokens= tweet.lower().split()
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_tweet=" ".join(word_tokens)
    return cleaned_tweet

In [13]:
print(df.shape)
df.head()

(21629, 2)


Unnamed: 0,clean_tweet,sentiment
0,gas by my house hit am going to chapel hill on...,positive
1,theo walcott is still shit watch rafa and john...,negative
2,its not that am gsp fan just hate nick diaz ca...,negative
3,iranian general says israel iron dome cannot d...,negative
4,tehran mon amour obama tried to establish ties...,neutral


In [14]:
# shuffling rows
df = df.sample(frac=1).reset_index(drop=True)
print(df.shape)  # perfectly fine.
df.head()

(21629, 2)


Unnamed: 0,clean_tweet,sentiment
0,it will be some scene at the nyse and nasdaq s...,neutral
1,snoop dogg son cordell broadus quits ucla foot...,neutral
2,so everyone was so happy bc michael jackson ca...,positive
3,bulls season tips off wednesday night vs sacra...,negative
4,at no is jurassic world th installment of the ...,neutral


In [16]:
df['clean_tweet']=df['clean_tweet'].apply(clean_tweet)

In [18]:
input_directory = r"../input/"
output_directory = r"../output/"

if not os.path.exists(output_directory):
    os.mkdir(output_directory)
    
figure_directory = "../output/figures"
if not os.path.exists(figure_directory):
    os.mkdir(figure_directory)

In [26]:
X = df['clean_tweet']
Y = df['sentiment']

label_encoder = LabelEncoder()

Y = label_encoder.fit_transform(Y)

Y = to_categorical(Y)

# Y = Y.reshape(-1, 1)
Y

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

max_words = len(set(" ".join(X_train).split()))
max_len = X_train.apply(lambda x: len(x)).max()

# max_words = 1000
# max_len = 150
max_words, max_len

(28777, 150)

In [28]:
tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_seq = sequence.pad_sequences(X_train_seq, maxlen=max_len)

In [29]:
# Calculate Class Weights
def get_weight(y):
    class_weight_current =  cw.compute_class_weight('balanced', np.unique(y), y)
    return class_weight_current

In [30]:
class_weight = get_weight(Y_train.flatten())

In [31]:
def get_rnn_model(num_class=2):
    model = Sequential()
    
    model.add(Embedding(max_words, 100, input_length=max_len))
    model.add(LSTM(256))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(512, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    if num_class>2:
        model.add(Dense(num_class, activation='softmax'))
    else:
        model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    
    return model


def get_cnn_model(num_class=2):   
    model = Sequential()
    
    model.add(Embedding(max_words, 100, input_length=max_len))
    
    model.add(Conv1D(1024, 3, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(2048, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    if num_class>2:
        model.add(Dense(num_class, activation='softmax'))
    else:
        model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    return model

In [39]:
def plot_performance(history=None, figure_directory=None, ylim_pad=[0, 0]):
    xlabel = 'Epoch'
    legends = ['Training', 'Validation']

    plt.figure(figsize=(20, 5))

    y1 = history.history['acc']
    y2 = history.history['val_acc']

    min_y = min(min(y1), min(y2))-ylim_pad[0]
    max_y = max(max(y1), max(y2))+ylim_pad[0]


    plt.subplot(121)

    plt.plot(y1)
    plt.plot(y2)

    plt.title('Model Accuracy\n', fontsize=17)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel('Accuracy', fontsize=15)
    plt.ylim(min_y, max_y)
    plt.legend(legends, loc='upper left')
    plt.grid()

    y1 = history.history['loss']
    y2 = history.history['val_loss']

    min_y = min(min(y1), min(y2))-ylim_pad[1]
    max_y = max(max(y1), max(y2))+ylim_pad[1]


    plt.subplot(122)

    plt.plot(y1)
    plt.plot(y2)

    plt.title('Model Loss\n', fontsize=17)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel('Loss', fontsize=15)
    plt.ylim(min_y, max_y)
    plt.legend(legends, loc='upper left')
    plt.grid()
    if figure_directory:
        plt.savefig(figure_directory+"/history")

    plt.show()

In [43]:
num_class = 3
model = get_rnn_model(num_class=num_class)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 100)          2877700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               131584    
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)              

In [44]:
loss = 'categorical_crossentropy'
# loss = 'binary_crossentropy'
metrics = ['accuracy']

In [45]:
print("Starting...\n")

print("\n\nCompliling Model ...\n")
learning_rate = 0.001
optimizer = Adam(learning_rate)
# optimizer = Adam()

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

verbose = 1
epochs = 20
batch_size = 16
validation_split = 0.2

print("Trainning Model ...\n")

history1 = model.fit(
    X_train_seq,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_split=validation_split,
    class_weight =class_weight
    )

Starting...



Compliling Model ...

Trainning Model ...

Train on 14707 samples, validate on 3677 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

KeyboardInterrupt: 

In [40]:
#X_train_seq = tokenizer.texts_to_sequences(X_train)
#X_train_seq = sequence.pad_sequences(X_train_seq, maxlen=max_len)

KeyError: 'acc'

<Figure size 1440x360 with 0 Axes>

In [46]:
test = rev_frame.copy()
test = test[['clean_tweet']]

In [47]:
test = test[21630:]

In [48]:
test = test[['clean_tweet']]
test = test.reset_index(drop=True)
test

Unnamed: 0,clean_tweet
0,down in the atlantic city ventnor margate ocea...
1,musical awareness great big beautiful tomorrow...
2,on radio fm fri oct labour analyst shawn hatti...
3,kapan sih lo ngebuktiin jan ngomong doang susa...
4,excuse the connectivity of this live stream fr...
...,...
5393,it has it is wednesday girls night out as band...
5394,night college course sorted just have to enrol...
5395,for the st time in years for your splendiferou...
5396,nurses day may nursing the heart beat of the h...


In [49]:
t_id = t_id[['tweet_id']]
t_id = t_id.reset_index(drop=True)
test_data = pd.concat([t_id, test], axis = 1)

In [50]:
test_data['clean_tweet']=test_data['clean_tweet'].apply(clean_tweet)
test_data

Unnamed: 0,tweet_id,clean_tweet
0,264238274963451904,atlantic city ventnor margate ocean city area ...
1,218775148495515649,musical awareness great big beautiful tomorrow...
2,258965201766998017,radio fm fri oct labour analyst shawn hattingh...
3,262926411352903682,kapan sih lo ngebuktiin jan ngomong doang susa...
4,171874368908050432,excuse connectivity live stream baba amr many ...
...,...,...
5393,210378118865756160,wednesday girl night band wilson phillips fill...
5394,245177521304399872,night college course sorted enrole tomorrow fi...
5395,259280987089932288,st time year splendiferous entertainment art l...
5396,201113950211940352,nurse day may nursing heart beat health


In [51]:
train_seq = tokenizer.texts_to_sequences(test_data['clean_tweet'])
train_seq = sequence.pad_sequences(train_seq, maxlen=max_len)

In [55]:
prediction = model.predict_classes(train_seq)

In [56]:
prediction

array([2, 2, 1, ..., 1, 2, 2])

In [60]:
def class_sentiment(prediction):
    n = len(prediction)
    sentiment = []
    for i in range(n):
        if(prediction[i] == 2):
            sentiment.append('positive')
        elif(prediction[i]==1):
            sentiment.append('neutral')
        else:
            sentiment.append('negative')
    return sentiment
sentiment = class_sentiment(prediction)
test_data['sentiment'] = sentiment
test_data['sentiment'].value_counts()

neutral     3198
positive    2029
negative     171
Name: sentiment, dtype: int64

In [61]:
test_data = test_data[['tweet_id','sentiment']]
test_list = []
heading = ['tweet_id', 'sentiment']
test_list.append(heading)
for i in range(len(test_data['tweet_id'])):
    sub = []
    sub.append(test_data['tweet_id'][i])
    sub.append(test_data['sentiment'][i])
    test_list.append(sub)

In [62]:
len(test_list)

5399

In [64]:
import csv
with open('rnn.csv', 'w', newline='') as fp:
    a = csv.writer(fp, delimiter = ",")
    data = test_list
    a.writerows(data)
check = pd.read_csv("test_result.csv")
check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 2 columns):
tweet_id     5398 non-null int64
sentiment    5398 non-null object
dtypes: int64(1), object(1)
memory usage: 84.5+ KB
