In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
# nltk.download('punkt')
import re
from bs4 import BeautifulSoup
from sklearn.decomposition import TruncatedSVD
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Twitter.csv')
df.head()

Unnamed: 0,clean_text,category,category_sentiment
0,when modi promised “minimum government maximum...,-1,negative
1,talk all the nonsense and continue all the dra...,0,neutral
2,what did just say vote for modi welcome bjp t...,1,positive
3,asking his supporters prefix chowkidar their n...,1,positive
4,answer who among these the most powerful world...,1,positive


In [3]:
df=df.dropna()
df['category'] = df['category'].astype(int)
df = df.reset_index(drop=True)
df['category'].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [4]:
# df=df[df['category']!=0]
df['category'].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [5]:
df = df.sample(frac=1).reset_index(drop=True) #shuffling of tweets
data=df[df['category']==-1][:43019]
data=data.append(df[df['category']==0][:43019])
data=data.append(df[df['category']==1][:43019])
data = data.reset_index(drop=True)
display(data['category'].value_counts())
data

  data=data.append(df[df['category']==0][:43019])
  data=data.append(df[df['category']==1][:43019])


-1    43019
 0    43019
 1    43019
Name: category, dtype: int64

Unnamed: 0,clean_text,category,category_sentiment
0,all the political parties are cheats you call ...,-1,negative
1,the technology shoot down satellite orbit some...,-1,negative
2,will ensure family friends relatives vote bein...,-1,negative
3,unfortunate that names mahatma gandhi nehru an...,-1,negative
4,earlier people asked who modi now they know wh...,-1,negative
...,...,...,...
129052,khan itself enough guarantee below average bra...,1,positive
129053,well done drdo happy theatre day modi rahul ga...,1,positive
129054,corruption reduced govt opened bank accounts o...,1,positive
129055,modi breathes bhupendra chaubhe sir this brill...,1,positive


In [6]:
data['pre_process'] = data['clean_text'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
data['pre_process']=data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
data['pre_process']=data['pre_process'].apply(lambda x: re.sub(r"http\S+", "", x))



In [7]:
def contractions(s):
    s = re.sub(r"won't", "will not",s)
    s = re.sub(r"would't", "would not",s)
    s = re.sub(r"could't", "could not",s)
    s = re.sub(r"\'d", " would",s)
    s = re.sub(r"can\'t", "can not",s)
    s = re.sub(r"n\'t", " not", s)
    s= re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    return s
data['pre_process']=data['pre_process'].apply(lambda x:contractions(x))

In [8]:
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

In [9]:
# from nltk.corpus import stopwords
# stop = stopwords.words('english')
# data['pre_process']=data['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

In [10]:
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# data['pre_process']=data['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))

In [11]:
data.head()

Unnamed: 0,clean_text,category,category_sentiment,pre_process
0,all the political parties are cheats you call ...,-1,negative,all the political parties are cheats you call ...
1,the technology shoot down satellite orbit some...,-1,negative,the technology shoot down satellite orbit some...
2,will ensure family friends relatives vote bein...,-1,negative,will ensure family friends relatives vote bein...
3,unfortunate that names mahatma gandhi nehru an...,-1,negative,unfortunate that names mahatma gandhi nehru an...
4,earlier people asked who modi now they know wh...,-1,negative,earlier people asked who modi now they know wh...


In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train, Y_test = train_test_split(data['pre_process'], data['category'], test_size=0.25, random_state=30)
print("Train: ",X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

Train:  (96792,) (96792,) Test:  ((32265,), (32265,))


In [13]:
print("TFIDF Vectorizer……")
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

TFIDF Vectorizer……


In [14]:
# Reduce the dimensionality of the input data
svd = TruncatedSVD(n_components=2000)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)

In [15]:
# Encode the target variable
encoder = LabelEncoder()
y_train = encoder.fit_transform(Y_train)
y_test = encoder.fit_transform(Y_test)

In [16]:
# One-hot encode the target variable
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [17]:
# Define the deep learning model
model = Sequential()
model.add(Dense(2000, input_shape=(2000,), activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
unique_classes = np.unique(y_train)
n_classes = len(unique_classes)
print("Number of classes in the target variable: ", n_classes)

Number of classes in the target variable:  2


In [19]:
last_layer = model.layers[-1]
n_neurons = last_layer.output_shape[-1]
print("Number of neurons in the last dense layer: ", n_neurons)

Number of neurons in the last dense layer:  3


In [20]:
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=128, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x1fd2d34cdc0>

In [21]:
# Make predictions on the test set
y_pred = model.predict(X_test)



In [22]:
# Evaluate the model
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 86.20%


In [23]:
def Negation(sentence):	
    '''
    Input: Tokenized sentence (List of words)
    Output: Tokenized sentence with negation handled (List of words)
    '''
    temp = int(0)
    for i in range(len(sentence)):
        if sentence[i-1] in ['not',"n't"]:
            antonyms = []
            for syn in wordnet.synsets(sentence[i]):
                syns = wordnet.synsets(sentence[i])
                w1 = syns[0].name()
                temp = 0
                for l in syn.lemmas():
                    if l.antonyms():
                        antonyms.append(l.antonyms()[0].name())
                max_dissimilarity = 0
                for ant in antonyms:
                    syns = wordnet.synsets(ant)
                    w2 = syns[0].name()
                    syns = wordnet.synsets(sentence[i])
                    w1 = syns[0].name()
                    word1 = wordnet.synset(w1)
                    word2 = wordnet.synset(w2)
                    if isinstance(word1.wup_similarity(word2), float) or isinstance(word1.wup_similarity(word2), int):
                        temp = 1 - word1.wup_similarity(word2)
                    if temp>max_dissimilarity:
                        max_dissimilarity = temp
                        antonym_max = ant
                        sentence[i] = antonym_max
                        sentence[i-1] = ''
    while '' in sentence:
        sentence.remove('')
    sentence = ' '.join(sentence)
    return sentence

In [24]:
# Input your own tweet
tweet = "I like apples."

# Pre-process the tweet
tweet = BeautifulSoup(tweet).get_text()
tweet = contractions(tweet)
tweet = Negation(nltk.word_tokenize(tweet))

print(tweet)
# Vectorize the tweet using Tf-idf
tweet_vector = vectorizer.transform([tweet])
tweet_vector = tf.sparse.from_dense(tweet_vector.todense())
tweet_vector = tf.sparse.reorder(tweet_vector)
# print(tweet_vector.get_shape())
tweet_vector = svd.fit_transform(tweet_vector)

# Predict the sentiment of the tweet
sentiment = model.predict(tweet_vector)

# Print the sentiment
if sentiment == -1:
    print("Negative")
elif sentiment == 0:
    print("Neutral")
else:
    print("Positive")

I like apples .


ValueError: Expected 2D array, got scalar array instead:
array=SparseTensor(indices=tf.Tensor(
[[    0  3778]
 [    0 40035]], shape=(2, 2), dtype=int64), values=tf.Tensor([0.93426902 0.35656892], shape=(2,), dtype=float64), dense_shape=tf.Tensor([    1 78207], shape=(2,), dtype=int64)).
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.