In [3]:
import pandas as pd
import numpy as np
import nltk
import contractions
from nltk import word_tokenize
from nltk.corpus import wordnet
# nltk.download('punkt')
import re
from bs4 import BeautifulSoup
from sklearn.decomposition import TruncatedSVD
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [4]:
df = pd.read_csv("Twitter.csv")
df

Unnamed: 0,clean_text,category,category_sentiment
0,i am not happy,-1,negative
1,i am not sad,1,positive
2,i'm fine,0,neutral
3,when modi promised “minimum government maximum...,-1,negative
4,talk all the nonsense and continue all the dra...,0,neutral
...,...,...,...
177971,'I'm not satisfied with The Hills finale. gon...,-1,negative
177972,this sucks,-1,negative
177973,this is bad,-1,negative
177974,I am not okay with this,-1,negative


In [5]:
df["category"].value_counts()

 1    72250
 0    62712
-1    43014
Name: category, dtype: int64

In [6]:
def text_transformation(text):
    text = " ".join(x.lower() for x in str(text).split())                             # Converting Text to Lowercase
    text = contractions.fix(text)                                                     # Fixes Contractions such as ("you're" to "you are" etc.)
    text = " ".join([re.sub("[^A-Za-z]+", "", x) for x in word_tokenize(text)])       # Removal of Punctuation, Numbers, and Special Characters                                                                  
    return text

In [7]:
df["processed_text"] = df["clean_text"].apply(text_transformation)
df

Unnamed: 0,clean_text,category,category_sentiment,processed_text
0,i am not happy,-1,negative,i am not happy
1,i am not sad,1,positive,i am not sad
2,i'm fine,0,neutral,i am fine
3,when modi promised “minimum government maximum...,-1,negative,when modi promised minimum government maximum...
4,talk all the nonsense and continue all the dra...,0,neutral,talk all the nonsense and continue all the dra...
...,...,...,...,...
177971,'I'm not satisfied with The Hills finale. gon...,-1,negative,i am not satisfied with the hills finale goin...
177972,this sucks,-1,negative,this sucks
177973,this is bad,-1,negative,this is bad
177974,I am not okay with this,-1,negative,i am not okay with this


In [8]:
x = df["processed_text"]
y = df["category_sentiment"]
y_strat = df["category"]

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y_strat)

print("Train:", x_train.shape, y_train.shape)
print("Test: ", x_test.shape, y_test.shape)

Train: (142380,) (142380,)
Test:  (35596,) (35596,)


In [10]:
vectorizer= TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [11]:
# # Reduce the dimensionality of the input data
svd = TruncatedSVD(n_components=2000)
x_train = svd.fit_transform(x_train)
x_test = svd.transform(x_test)

In [12]:
# Encode the target variable
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [13]:
# One-hot encode the target variable
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [14]:
# Define the deep learning model
model = Sequential()
model.add(Dense(2000, input_shape=(2000,), activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
unique_classes = np.unique(y_train)
n_classes = len(unique_classes)
print("Number of classes in the target variable: ", n_classes)

Number of classes in the target variable:  2


In [16]:
last_layer = model.layers[-1]
n_neurons = last_layer.output_shape[-1]
print("Number of neurons in the last dense layer: ", n_neurons)

Number of neurons in the last dense layer:  3


In [17]:
# Train the model
model.fit(x_train, y_train, epochs=20, batch_size=128, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x21ab7707cd0>

In [18]:
# Make predictions on the test set
y_pred = model.predict(x_test)



In [19]:
accuracy = accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 87.49%


In [20]:
y_test

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [21]:
sentence = "i am anyone"

test_feature = vectorizer.transform([sentence])
test_feature = svd.transform(test_feature)
model.predict(test_feature)

sentiment = model.predict(test_feature)


def rev_one_hot(x):
    l = []
    m = [l]
    for i in x:
        for j in i:
            if j < 0.5:
                j = 0
                l.append(j)
            else:
                j = 1
                l.append(j)
    return m

sentiment = rev_one_hot(sentiment)
print(sentiment)

def to_categ(pred):

    if pred == [[1, 0, 0]]:
        return "negative"
    
    elif pred == [[0, 0, 1]]:
        return "positive"

    else:
        return "neutral"

to_categ(sentiment)


[[0, 1, 0]]


'neutral'

In [22]:
pickl = {
        "vectorizer": vectorizer,
         "svd": svd,
         "model": model
         }
pickle.dump(pickl, open('dl_model'+".p", "wb"))

INFO:tensorflow:Assets written to: ram://c70b93de-151d-41b0-a3c5-9d5c5eabe969/assets
