In [50]:
import numpy as np
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split

In [51]:
dataset = pd.read_csv('Sentiment.csv')

In [52]:
dataset.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [53]:
dataset = dataset[['text','sentiment']]
dataset = dataset[dataset.sentiment != "Neutral"]

In [54]:
dataset.head()

Unnamed: 0,text,sentiment
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative


In [58]:
dataset.shape

(10729, 2)

In [61]:
dataset['text'] = dataset['text'].apply(lambda x: x.lower())
dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[\W_]+', ' ', x))
dataset['text'] = dataset['text'].apply(lambda x: re.sub('rt','',x))
dataset['text'] = dataset['text'].apply(lambda x: x.lstrip())

In [62]:
dataset.head()

Unnamed: 0,text,sentiment
1,scottwalker didn t catch the full gopdebate la...,Positive
3,robgeorge that carly fiorina is tnding hours a...,Positive
4,danscavino gopdebate w aldonaldtrump delived t...,Positive
5,ggabbott tx tedcruz on my first day i will sci...,Positive
6,warriorwoman91 i liked her and was happy when ...,Negative


In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [93]:
max_words = 2000

In [94]:
tfidf = TfidfVectorizer(max_features = max_words ,analyzer = 'word', ngram_range=(1, 1))

In [95]:
tfidf.fit(dataset['text'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [96]:
tfidf.vocabulary_

{'scottwalker': 1527,
 'didn': 515,
 'the': 1742,
 'full': 738,
 'gopdebate': 786,
 'last': 1021,
 'night': 1240,
 'he': 833,
 'some': 1607,
 'of': 1276,
 'scott': 1526,
 'best': 201,
 'lines': 1066,
 'in': 921,
 '90': 25,
 'seconds': 1532,
 'walker16': 1891,
 'http': 887,
 'co': 385,
 'that': 1740,
 'carly': 314,
 'fiorina': 700,
 'is': 948,
 'hours': 880,
 'after': 55,
 'her': 846,
 'debate': 476,
 'any': 113,
 'men': 1151,
 'just': 985,
 'says': 1519,
 'she': 1565,
 'on': 1292,
 'danscavino': 464,
 'aldonaldtrump': 71,
 'delived': 494,
 'highest': 852,
 'ratings': 1473,
 'history': 863,
 'psidential': 1432,
 'debates': 478,
 'trump2016': 1807,
 'tedcruz': 1721,
 'my': 1214,
 'first': 701,
 'day': 467,
 'will': 1935,
 'every': 620,
 'illegal': 908,
 'executive': 633,
 'action': 39,
 'taken': 1701,
 'by': 283,
 'obama': 1270,
 'foxnews': 724,
 'liked': 1060,
 'and': 100,
 'was': 1902,
 'happy': 823,
 'when': 1921,
 'heard': 839,
 'going': 770,
 'to': 1777,
 'be': 172,
 'moderator': 11

In [97]:
tfidf.idf_

array([7.44758549, 5.71645064, 7.57274863, ..., 8.08357426, 8.20135729,
       7.06192301])

In [100]:
x = tfidf.transform(dataset['text'].values)

In [102]:
x

<10729x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 148014 stored elements in Compressed Sparse Row format>

In [105]:
y = np.array(dataset['sentiment'])

In [106]:
y

array(['Positive', 'Positive', 'Positive', ..., 'Positive', 'Negative',
       'Positive'], dtype=object)

# Data preprocessing is complete

In [107]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

# Dataset has been split in training set and test set

# Time to build a NN using RNN and LSTM