# **Case Study 4 - Machine learning to predict public sentiment from text data**


Examine Twitter text data to predict whether a given tweet expresses positive or negative sentiment
towards a particular brand. The dataset contains tweets about Apple and Google
products, each labelled with one of four user sentiments: ‘positive’, ‘negative’, ‘neutral’ or ‘no_idea’.
Create a SimpleRNN- or LSTM-based classifier to classify tweets into the four
classes. You can ignore the ‘emotion_in_tweet_is_directed_at’ column.

In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Read the raw tweet/sentiment CSV and preview the first rows.
data = pd.read_csv(
    '/content/judge-1377884607_tweet_product_company.csv',
    encoding='unicode_escape',
)
data.head(n=5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# The brand/product the emotion is directed at is not used by the classifier.
dt = data.drop(columns=['emotion_in_tweet_is_directed_at'])

In [4]:
# Preview the frame once the brand/product column has been removed.
dt.head(n=5)

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [5]:
# Give the two remaining columns short names: the tweet text and its sentiment label.
dt.columns = pd.Index(["Text", "Label"])
dt.head(n=5)

Unnamed: 0,Text,Label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [6]:
# Class balance of the raw sentiment labels.
dt["Label"].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: Label, dtype: int64

In [7]:
# Collapse the verbose survey answers into short sentiment names.
dt['Label'] = dt['Label'].map({
    'No emotion toward brand or product': 'neutral',
    'Positive emotion': 'positive',
    'Negative emotion': 'negative',
    "I can't tell": 'no idea',
})

In [8]:
# Encode the sentiment names as the integer class ids the classifier is trained on.
dt['Label'] = dt['Label'].map({
    'no idea': 0,
    'negative': 1,
    'neutral': 2,
    'positive': 3,
})

In [9]:
# Preview the frame with integer-encoded labels.
dt.head(n=5)

Unnamed: 0,Text,Label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,3
2,@swonderlin Can not wait for #iPad 2 also. The...,3
3,@sxsw I hope this year's festival isn't as cra...,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,3


In [10]:
# Inspect dtypes and non-null counts to spot missing values before cleaning.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9092 non-null   object
 1   Label   9093 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 142.2+ KB


In [11]:
# Discard every row that has a missing value in any column.
dt = dt.dropna(axis=0, how='any')

In [12]:
# Re-check non-null counts to confirm that no missing values remain.
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9092 non-null   object
 1   Label   9092 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 213.1+ KB


In [13]:
import gensim

# Turn each tweet into a list of cleaned tokens with gensim's simple_preprocess.
dt['text_clean_gensim'] = dt['Text'].apply(gensim.utils.simple_preprocess)
dt.head(n=5)

Unnamed: 0,Text,Label,text_clean_gensim
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,1,"[wesley, have, iphone, after, hrs, tweeting, a..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,3,"[jessedee, know, about, fludapp, awesome, ipad..."
2,@swonderlin Can not wait for #iPad 2 also. The...,3,"[swonderlin, can, not, wait, for, ipad, also, ..."
3,@sxsw I hope this year's festival isn't as cra...,1,"[sxsw, hope, this, year, festival, isn, as, cr..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,3,"[sxtxstate, great, stuff, on, fri, sxsw, maris..."


**Fitting RNN model**


In [14]:
from keras.preprocessing import text

# Build the vocabulary from the gensim token lists, then convert every tweet
# into its sequence of integer word indices.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dt['text_clean_gensim'].tolist())
tokenized_texts = tokenizer.texts_to_sequences(dt['text_clean_gensim'])

In [15]:
from keras.utils import pad_sequences

# Bring every index sequence to a fixed length of 100 so the tweets form one 2-D array.
X = pad_sequences(sequences=tokenized_texts, maxlen=100)

In [16]:
# Number of distinct tokens in the fitted vocabulary (the embedding layer uses this + 1).
len(tokenizer.word_index)

9283

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing.
# - random_state fixes the shuffle so the split (and the reported results) can
#   be reproduced on a fresh kernel.
# - stratify keeps the class proportions equal in both partitions; the label
#   distribution is heavily imbalanced, so an unstratified split can leave the
#   rare classes under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    dt['Label'].values,
    test_size=0.2,
    random_state=42,
    stratify=dt['Label'].values,
)

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout

In [19]:
# Embedding -> SimpleRNN -> dense head ending in a 4-way softmax (one unit per sentiment class).
# Dropout is applied after the recurrent layer and after the hidden dense layer.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,  # +1: index 0 is reserved for padding
        output_dim=128,
        input_length=100,
    ),
    SimpleRNN(10),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax'),
])

In [20]:
# Sparse categorical cross-entropy is used because the labels are integer class ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1188352   
                                                                 
 simple_rnn (SimpleRNN)      (None, 10)                1390      
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 50)                550       
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 4)                 204       
                                                                 
Total params: 1,190,496
Trainable params: 1,190,496
Non-

In [22]:
# Train for 15 epochs, holding back 10% of the training data for validation.
model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.1,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fa79fb91730>

In [29]:
# Class probabilities for the held-out tweets: one row per tweet, one column per class id.
y_pred = model.predict(X_test)

# Score the predictions: take the most probable class for each tweet and
# compare it with the true integer label to get the test-set accuracy.
y_pred_labels = np.argmax(y_pred, axis=1)
test_accuracy = float(np.mean(y_pred_labels == y_test))
test_accuracy



**Testing**


In [34]:
# Expected sentiment: positive
test1 = "Its a good news"
# Apply the same gensim preprocessing used on the training tweets, so the
# tokens handed to the tokenizer match the vocabulary it was fitted on.
test1 = tokenizer.texts_to_sequences([gensim.utils.simple_preprocess(test1)])
test2 = pad_sequences(test1, maxlen=100)
# Columns of `out` follow the integer label encoding:
# 0 = no idea, 1 = negative, 2 = neutral, 3 = positive.
out = model.predict(test2)
out



array([[4.4656990e-06, 3.1699965e-06, 9.2005813e-01, 7.9934202e-02]],
      dtype=float32)

In [35]:
# Expected sentiment: negative
test1 = "Its a bad news"
# Apply the same gensim preprocessing used on the training tweets, so the
# tokens handed to the tokenizer match the vocabulary it was fitted on.
test1 = tokenizer.texts_to_sequences([gensim.utils.simple_preprocess(test1)])
test2 = pad_sequences(test1, maxlen=100)
# Columns of `out` follow the integer label encoding:
# 0 = no idea, 1 = negative, 2 = neutral, 3 = positive.
out = model.predict(test2)
out



array([[1.02930635e-05, 3.64335028e-06, 9.72695410e-01, 2.72906628e-02]],
      dtype=float32)

In [36]:
# Expected sentiment: neutral
test1 = "The product is ok I guess"
# Apply the same gensim preprocessing used on the training tweets, so the
# tokens handed to the tokenizer match the vocabulary it was fitted on.
test1 = tokenizer.texts_to_sequences([gensim.utils.simple_preprocess(test1)])
test2 = pad_sequences(test1, maxlen=100)
# Columns of `out` follow the integer label encoding:
# 0 = no idea, 1 = negative, 2 = neutral, 3 = positive.
out = model.predict(test2)
out



array([[2.12059429e-04, 4.22001263e-04, 1.05703235e-01, 8.93662691e-01]],
      dtype=float32)