In [0]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import numpy as np
import re
import gensim
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

In [0]:
nltk.download('stopwords')

In [0]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z]+"
DATASET_ENCODING = "ISO-8859-1"

In [0]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [0]:
df = pd.read_csv('/content/drive/My Drive/tweets_sentiment.csv',encoding=DATASET_ENCODING, header=None)

In [0]:
abbr={"ain't": "is not", "isn't": 'is not',"cannot": "can not", "aren't": "are not", "can't": "can not", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
                "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would",
                "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "wont": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color',
                'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', '2F4U':	'Too Fast For You',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
                'theBest': 'the best', 'howdoes': 'how does', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
                'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'AKA':	'also known as', 'ACK':	'Acknowledgment', 'AFAIK':	'As far as I know', 'AFAIR':	'As far as I remember'}

abbr = {k.lower(): v.lower() for k, v in abbr.items()}

In [0]:
def normalize_word(word):
    temp = word
    while True:
        w = re.sub(r"([a-zA-Z])\1\1", r"\1\1", temp)
        if (w == temp):
            break
        else:
            temp = w
    return w

In [0]:
def preprocess(text,abbreviation_dict=abbr, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()

    filtered_text=[]
    for t in text.split(' '):
        if (t in abbreviation_dict):
            tok = abbreviation_dict.get(t).split(' ')
            filtered_text.extend(tok)
        else:
            filtered_text.append(t)

    tokee=[normalize_word(word) for word in filtered_text]
    tokee=[t for t in tokee if t not in stop_words]

    tokens = []
    for token in tokee:
        if stem:
            tokens.append(stemmer.stem(token))
        else:
            tokens.append(token)
    ll=['b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
    tokens=[g for g in tokens if g not in ll]
    if len(tokens)>=3:
        return " ".join(tokens)
    else:
        return float('NaN')
df[5] = df[5].apply(lambda x: preprocess(x))

In [0]:
df=df.dropna(axis=0,how='any')

In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df[5])

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)


Total words 290795


In [11]:
max([len(s.split(' ')) for s in list(df[5]) ])

50

In [0]:
X = pad_sequences(tokenizer.texts_to_sequences(df[5]), maxlen=200)

In [0]:
labels = df[0].unique().tolist()
labels.append('NEUTRAL')
encoder = LabelEncoder()
encoder.fit(df[0].tolist())
y = encoder.transform(df[0].tolist())
y = y.reshape(-1,1)

In [14]:
embeddings_index = dict()
f = open('/content/drive/My Drive/glove.6B.300d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, Embedding
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import tensorflow

In [0]:
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=200, trainable=False)

In [18]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
#model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
#model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 300)          87238500  
_________________________________________________________________
conv1d (Conv1D)              (None, 198, 128)          115328    
_________________________________________________________________
dropout (Dropout)            (None, 198, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 196, 256)          98560     
_________________________________________________________________
flatten (Flatten)            (None, 50176)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50176)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               6

In [0]:
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [20]:
history = model.fit(X,y,
                    batch_size=2048,
                    epochs=10,
                    validation_split=0.2,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# PLOT MODEL
tf.keras.utils.plot_model(
    model,
    to_file="model.png",
    show_shapes=True,
    show_layer_names=False,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
)

In [0]:
model.save('/content/drive/My Drive/CNNsentiment.h5') 

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/CNNsentiment.h5')

In [0]:
df1 = pd.read_csv('/content/drive/My Drive/sarcastic_comments_final.csv')

In [0]:
def preprocess(text,abbreviation_dict=abbr, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()

    filtered_text=[]
    for t in text.split(' '):
        if (t in abbreviation_dict):
            tok = abbreviation_dict.get(t).split(' ')
            filtered_text.extend(tok)
        else:
            filtered_text.append(t)

    tokee=[normalize_word(word) for word in filtered_text]
    tokee=[t for t in tokee if t not in stop_words]

    tokens = []
    for token in tokee:
        if stem:
            tokens.append(stemmer.stem(token))
        else:
            tokens.append(token)
    ll=['b','c','d','e','f','g','h','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
    tokens=[g for g in tokens if g not in ll]
    if len(tokens)>=1:
        return " ".join(tokens)
    else:
        return float('NaN')
df1.comment = df1.comment.apply(lambda x: preprocess(x))

In [0]:
df1=df1.dropna(axis=0,how='any')

In [0]:
X1 = pad_sequences(tokenizer.texts_to_sequences(df1.comment), maxlen=200)

In [0]:
import tensorflow
new_input = model.input
hidden_layer = model.layers[-2].output

features_extract_model = tensorflow.keras.Model(new_input, hidden_layer)
features_extract_model.trainable=False

In [0]:
tf.executing_eagerly()

True

In [0]:
I=[i for i in range(0,X1.shape[0],1000)]
I.append(X1.shape[0])

In [0]:
NDF=pd.DataFrame(columns=[str(t) for t in range(128)])
for i in range(len(I[0:-1])):
    x=tf.convert_to_tensor(X1[I[i]:I[i+1]])
    z=features_extract_model(x)
    ndf=pd.DataFrame(z.numpy(),columns=[str(t) for t in range(128)])
    lst = [NDF, ndf]
    NDF = pd.concat(lst)

In [0]:
NDF.to_csv('/content/drive/My Drive/sentiment_extracted.csv')