# <center style="font-size: 25pt; color: green;">Tweets Sentiment analysis Dataset</center>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

# <span style="font-size: 15pt; color: blue;">  Loading and describing dataset </span>

In [2]:
df = pd.read_csv('Tweets.csv')

In [3]:
df.shape[0]

27481

In [4]:
df['sentiment'].nunique()

3

In [5]:
print(df.head())

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [6]:
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset has 27481 rows and 4 columns.


In [7]:
print(df['sentiment'].value_counts())

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64


In [8]:
#df['num_words'] = df['text'].apply(lambda x: len(x.split()))
#print(f'Average number of words per text entry: {df["num_words"].mean()}')

In [9]:
print(f"Number of missing or NA values for each column:\n{df.isnull().sum()}")

Number of missing or NA values for each column:
textID           0
text             1
selected_text    1
sentiment        0
dtype: int64


# <span style="font-size: 15pt; color: blue;">  Building model </span>

In [10]:
df = df.dropna()
df = df[['text', 'sentiment']]
df = df[df.sentiment != "neutral"]
df['b_sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [11]:
#Augmentation
import nlpaug.augmenter.word as naw
substitute_augmenter = naw.RandomWordAug(action="substitute")

augmented_texts = []
for text in df['text']:
    augmented_text = substitute_augmenter.augment(text)
    augmented_texts.append(augmented_text)

augmented_df = pd.DataFrame({'text': augmented_texts, 'sentiment': df['sentiment']})
df = pd.concat([df, augmented_df], ignore_index=True)

In [12]:
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['text'])
list_tokenized_train = tokenizer.texts_to_sequences(df['text'])

In [13]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = df['b_sentiment']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size=0.2, random_state=0)

In [15]:
embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# <span style="font-size: 15pt; color: blue;">  Training </span>

In [17]:
batch_size = 100
epochs = 5
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/5


  return t[start:end]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ca36901e50>

In [18]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print("Test Accuracy: ", accuracy)

Test Accuracy:  0.8707607984542847


# <span style="font-size: 15pt; color: blue;">  Testing </span>

In [19]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    text = text.lower()
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    text = ' '.join(tokens)
    
    return text

In [20]:
def predict_sentiment(model, sentence):
    sentence = preprocess_text(sentence)
    
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    
    prediction = model.predict(padded_sequence)
    
    sentiment = 'neutral'
    if prediction < 0.33:
        sentiment = 'negative'
    elif prediction > 0.66:
        sentiment = 'positive'

    return sentiment

In [21]:
sentence = "dvj is the best asso"
sentiment = predict_sentiment(model, sentence)
print(f"The sentiment of the sentence '{sentence}' is {sentiment}.")

The sentiment of the sentence 'dvj is the best asso' is positive.
