In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
data = pd.read_csv(r"Sentiment.csv",usecols=['text', 'sentiment'])

In [3]:
data.head()

Unnamed: 0,sentiment,text
0,Neutral,RT @NancyLeeGrahn: How did everyone feel about...
1,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
2,Neutral,RT @TJMShow: No mention of Tamir Rice and the ...
3,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
4,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [4]:
data = data[data['sentiment'].isin(['Positive','Negative'])]

In [5]:
sentiment_count = data['sentiment'].value_counts()
print(sentiment_count)

Negative    8493
Positive    2236
Name: sentiment, dtype: int64


In [6]:
# preprocess text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['text'])

In [7]:
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X, maxlen=100)

In [8]:
# convert sentiment to numeric labels
y = data['sentiment'].map({'Positive': 1, 'Negative': 0}).values

In [9]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# build LSTM model
model = Sequential([
    Embedding(5000, 32, input_length=100),
    LSTM(64, dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# train model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10,batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
pred = model.predict(X)
pred



array([[9.9365479e-01],
       [3.2152265e-01],
       [9.9968946e-01],
       ...,
       [9.9466336e-01],
       [7.2232023e-04],
       [9.9974054e-01]], dtype=float32)

In [13]:
pred_mess = np.argmax(pred , axis=1)
pred_mess

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
from sklearn.metrics import confusion_matrix  ,accuracy_score

In [15]:
confusion_matrix(y , pred_mess)

array([[8493,    0],
       [2236,    0]], dtype=int64)

In [16]:
accuracy_score(y , pred_mess) *100

79.15928791126852

In [17]:
sentences = ['He is a great leader.', 'He is a terrible leader.']

# preprocess sentences
X_new = tokenizer.texts_to_sequences(sentences)
X_new = pad_sequences(X_new, maxlen=100)

# predict sentiment
y_new = model.predict(X_new)

# print predicted sentiment
for i in range(len(sentences)):
    if y_new[i] > 0.5:
        print(f"'{sentences[i]}' = is predicted to be positive.")
    else:
        print(f"'{sentences[i]}' = is predicted to be negative.")

'He is a great leader.' = is predicted to be positive.
'He is a terrible leader.' = is predicted to be negative.
