In [128]:
import pandas as pd
import re
import random
import pickle
import contractions
import nltk
from nltk.corpus import wordnet
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [129]:
df = pd.read_csv("Twitter.csv")
df

Unnamed: 0,clean_text,category,category_sentiment
0,when modi promised “minimum government maximum...,-1,negative
1,talk all the nonsense and continue all the dra...,0,neutral
2,what did just say vote for modi welcome bjp t...,1,positive
3,asking his supporters prefix chowkidar their n...,1,positive
4,answer who among these the most powerful world...,1,positive
...,...,...,...
177981,'I'm not satisfied with The Hills finale. gon...,-1,negative
177982,this sucks,-1,negative
177983,this is bad,-1,negative
177984,I am not okay with this,-1,negative


In [130]:
df["category"].value_counts()

 1    72254
 0    62713
-1    43019
Name: category, dtype: int64

In [131]:
def text_transformation(text):
    text = " ".join(x.lower() for x in str(text).split())                             # Converting Text to Lowercase
    text = contractions.fix(text)                                                     # Fixes Contractions such as ("you're" to "you are" etc.)
    text = " ".join([re.sub("[^A-Za-z]+", "", x) for x in word_tokenize(text)])       # Removal of Punctuation, Numbers, and Special Characters                                                               
    return text

In [132]:
df["processed_text"] = df["clean_text"].apply(text_transformation)
df

Unnamed: 0,clean_text,category,category_sentiment,processed_text
0,when modi promised “minimum government maximum...,-1,negative,when modi promised minimum government maximum...
1,talk all the nonsense and continue all the dra...,0,neutral,talk all the nonsense and continue all the dra...
2,what did just say vote for modi welcome bjp t...,1,positive,what did just say vote for modi welcome bjp to...
3,asking his supporters prefix chowkidar their n...,1,positive,asking his supporters prefix chowkidar their n...
4,answer who among these the most powerful world...,1,positive,answer who among these the most powerful world...
...,...,...,...,...
177981,'I'm not satisfied with The Hills finale. gon...,-1,negative,i am not satisfied with the hills finale goi...
177982,this sucks,-1,negative,this sucks
177983,this is bad,-1,negative,this is bad
177984,I am not okay with this,-1,negative,i am not okay with this


In [133]:
df = df[["processed_text","category_sentiment"]]
df

Unnamed: 0,processed_text,category_sentiment
0,when modi promised minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp to...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive
...,...,...
177981,i am not satisfied with the hills finale goi...,negative
177982,this sucks,negative
177983,this is bad,negative
177984,i am not okay with this,negative


In [219]:
# x_train, x_test = train_test_split(df, test_size=.2, random_state=42)

x_train, x_test, y_train, y_test = train_test_split(df["processed_text"], df["category_sentiment"], test_size=0.2, random_state=42, stratify=y_strat)

x_train

148246                                jai hind modi bjp bjp
30346     were not you guys also waiting for this politi...
145628    admiration for modi conditional there good opt...
15688                   modi  three foreign policy wins via
15986     industry sources tell the films badly made tha...
                                ...                        
137311    not modis someone elses indian india and the g...
34703     were any interview conducted when modi governm...
28184     how can not see what they have turned intodeci...
114312    argument save india for the next years from bj...
24550           that too tide changed after goa speech modi
Name: processed_text, Length: 142388, dtype: object

In [220]:
# Keras
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train

148246                                jai hind modi bjp bjp
30346     were not you guys also waiting for this politi...
145628    admiration for modi conditional there good opt...
15688                   modi  three foreign policy wins via
15986     industry sources tell the films badly made tha...
                                ...                        
137311    not modis someone elses indian india and the g...
34703     were any interview conducted when modi governm...
28184     how can not see what they have turned intodeci...
114312    argument save india for the next years from bj...
24550           that too tide changed after goa speech modi
Name: processed_text, Length: 142388, dtype: object

In [221]:
vectorizer = TfidfVectorizer(sublinear_tf=True)

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

type(x_train)


scipy.sparse._csr.csr_matrix

In [223]:
x_train

<142388x97885 sparse matrix of type '<class 'numpy.float64'>'
	with 2531435 stored elements in Compressed Sparse Row format>

In [224]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df.category_sentiment.tolist())

y_train = encoder.transform(df.category_sentiment.tolist())
y_test = encoder.transform(df.category_sentiment.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)

y_train (177986, 1)
y_test (177986, 1)


In [227]:
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM


vocab_size = len(tokenizer.word_index) + 1
embedding_layer = Embedding(vocab_size, 50)

In [228]:
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(embedding_layer)
model.add(layers.SimpleRNN(50,return_sequences=True))
model.add(layers.SimpleRNN(50))
model.add(layers.Dense(1, activation='softmax'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 50)          4895600   
                                                                 
 simple_rnn_14 (SimpleRNN)   (None, None, 50)          5050      
                                                                 
 simple_rnn_15 (SimpleRNN)   (None, 50)                5050      
                                                                 
 dense_7 (Dense)             (None, 1)                 51        
                                                                 
Total params: 4,905,751
Trainable params: 4,905,751
Non-trainable params: 0
_________________________________________________________________


In [229]:
model.compile(loss='binary_crossentropy',
              optimizer="rmsprop",
              metrics=['accuracy'])

In [230]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

In [231]:
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=5,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

ValueError: `validation_split` is only supported for Tensors or NumPy arrays, found following types in the input: [<class 'scipy.sparse._csr.csr_matrix'>]