In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
import re
from sklearn.decomposition import TruncatedSVD
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df = pd.read_csv("../Twitter.csv")
df.head()

Unnamed: 0,clean_text,category,category_sentiment
0,when modi promised “minimum government maximum...,-1,negative
1,talk all the nonsense and continue all the dra...,0,neutral
2,what did just say vote for modi welcome bjp t...,1,positive
3,asking his supporters prefix chowkidar their n...,1,positive
4,answer who among these the most powerful world...,1,positive


In [5]:
df["category"].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [6]:
df = df.sample(frac=1).reset_index(drop=True) #shuffling of tweets
data = df[df["category"] == -1][:40000]
data = data.append(df[df["category"] == 0][:40000])
data=data.append(df[df["category"] == 1][:40000])
data = data.reset_index(drop=True)
display(data["category"].value_counts())
data

  data = data.append(df[df["category"] == 0][:40000])
  data=data.append(df[df["category"] == 1][:40000])


-1    40000
 0    40000
 1    40000
Name: category, dtype: int64

Unnamed: 0,clean_text,category,category_sentiment
0,'@Cyhuntastic I wish I could have gone ',-1,negative
1,you are afraid indian tiger mrmodi didi,-1,negative
2,that would devastating the economy was that st...,-1,negative
3,incospar was established 1962 under dae which ...,-1,negative
4,what shame election commission clearly working...,-1,negative
...,...,...,...
119995,chowkidar kesavan humble request the prime min...,1,positive
119996,fact iam toh saying that release the movie aft...,1,positive
119997,dhruv please make some video chowkidars they a...,1,positive
119998,now congress may say supporting bjp modi gover...,1,positive


In [7]:
data["pre_process"] = data["clean_text"].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
data["pre_process"] = data["pre_process"].apply(lambda x: re.sub(r"http\S+", "", x))

In [8]:
def contractions(s):
    s = re.sub(r"won't", "will not",s)
    s = re.sub(r"would't", "would not",s)
    s = re.sub(r"could't", "could not",s)
    s = re.sub(r"\'d", " would",s)
    s = re.sub(r"can\'t", "can not",s)
    s = re.sub(r"n\'t", " not", s)
    s= re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    return s

data["pre_process"] = data["pre_process"].apply(lambda x:contractions(x))

In [9]:
data["pre_process"] = data["pre_process"].apply(lambda x: " ".join([re.sub("[^A-Za-z]+","", x) for x in nltk.word_tokenize(x)]))

In [10]:
data.head()

Unnamed: 0,clean_text,category,category_sentiment,pre_process
0,'@Cyhuntastic I wish I could have gone ',-1,negative,cyhuntastic i wish i could have gone
1,you are afraid indian tiger mrmodi didi,-1,negative,you are afraid indian tiger mrmodi didi
2,that would devastating the economy was that st...,-1,negative,that would devastating the economy was that st...
3,incospar was established 1962 under dae which ...,-1,negative,incospar was established under dae which late...
4,what shame election commission clearly working...,-1,negative,what shame election commission clearly working...


In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(data["pre_process"], data["category"], test_size=0.25, random_state=30, stratify=data["category_sentiment"])
print("Train: ", (X_train.shape, Y_train.shape), 
      "Test: ", (X_test.shape, Y_test.shape))

Train:  ((90000,), (90000,)) Test:  ((30000,), (30000,))


In [24]:
print("TFIDF Vectorizer . . .")

vectorizer= TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

TFIDF Vectorizer . . .


In [25]:
# Reduce the dimensionality of the input data
svd = TruncatedSVD(n_components=2000)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)

In [26]:
# Encode the target variable
encoder = LabelEncoder()
y_train = encoder.fit_transform(Y_train)
y_test = encoder.fit_transform(Y_test)

In [27]:
# One-hot encode the target variable
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [28]:
# Define the deep learning model
model = Sequential()
model.add(Dense(2000, input_shape=(2000,), activation="relu"))
model.add(Dense(1024, activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(3, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [29]:
unique_classes = np.unique(y_train)
n_classes = len(unique_classes)
print("Number of classes in the target variable: ", n_classes)

Number of classes in the target variable:  2


In [30]:
last_layer = model.layers[-1]
n_neurons = last_layer.output_shape[-1]
print("Number of neurons in the last dense layer: ", n_neurons)

Number of neurons in the last dense layer:  3


In [31]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=128, validation_split=0.1, callbacks=[EarlyStopping(monitor="val_loss", patience=3, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x1d0805c8d60>

In [32]:
# Make predictions on the test set
y_pred = model.predict(X_test)



In [33]:
accuracy = accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 86.31%


In [34]:
sentence = "i am anyone"

test_feature = vectorizer.transform([sentence])
test_feature = svd.transform(test_feature)
model.predict(test_feature)

sentiment = model.predict(test_feature)


def rev_one_hot(x):
    l = []
    m = [l]
    for i in x:
        for j in i:
            if j < 0.5:
                j = 0
                l.append(j)
            else:
                j = 1
                l.append(j)
    return m

sentiment = rev_one_hot(sentiment)
print(sentiment)

def to_categ(pred):

    if pred == [[1, 0, 0]]:
        return "negative"
    
    elif pred == [[0, 0, 1]]:
        return "positive"

    else:
        return "neutral"

to_categ(sentiment)

[[0, 1, 0]]


'neutral'