<a href="https://colab.research.google.com/github/adsamardeep/Twitter-Sentiment-Analysis/blob/master/Full_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
from termcolor import colored
from sklearn.model_selection import train_test_split

# Define variables
COLUMNS = ['id', 'original_text',	'lang',	'retweet_count',	'original_author',	'sentiment_class']

# Read dataset
dataset = pd.read_csv('data/dataset.csv', encoding = 'latin-1')
print(colored("Columns: {}".format(', '.join(COLUMNS)), "yellow"))

# Remove extra columns
print(colored("Useful columns: sentiment_class and original_text", "yellow"))
print(colored("Removing other columns", "red"))
dataset.drop(['id',	'lang',	'retweet_count',	'original_author'], axis = 1, inplace = True)
print(colored("Columns removed", "red"))

# Train test split
print(colored("Splitting train and test dataset into 80:20", "yellow"))
X_train, X_test, y_train, y_test = train_test_split(dataset['original_text'], dataset['sentiment_class'], test_size = 0.20, random_state = 100)
train_dataset = pd.DataFrame({
	'Tweet': X_train,
	'Sentiment': y_train
	})
print(colored("Train data distribution:", "yellow"))
print(train_dataset['Sentiment'].value_counts())
test_dataset = pd.DataFrame({
	'Tweet': X_test,
	'Sentiment': y_test
	})
print(colored("Test data distribution:", "yellow"))
print(test_dataset['Sentiment'].value_counts())
print(colored("Split complete", "yellow"))

# Save train data
print(colored("Saving train data", "yellow"))

train_dataset.to_csv('data/train.csv', index = False)
print(colored("Train data saved to data/train.csv", "green"))

# Save test data
print(colored("Saving test data", "yellow"))
test_dataset.to_csv('data/test.csv', index = False)
print(colored("Test data saved to data/test.csv", "green"))

[33mColumns: id, original_text, lang, retweet_count, original_author, sentiment_class[0m
[33mUseful columns: sentiment_class and original_text[0m
[31mRemoving other columns[0m
[31mColumns removed[0m
[33mSplitting train and test dataset into 80:20[0m
[33mTrain data distribution:[0m
 0    1371
 1     618
-1     599
Name: Sentiment, dtype: int64
[33mTest data distribution:[0m
 0    330
-1    170
 1    147
Name: Sentiment, dtype: int64
[33mSplit complete[0m
[33mSaving train data[0m
[32mTrain data saved to data/train.csv[0m
[33mSaving test data[0m
[32mTest data saved to data/test.csv[0m


In [26]:
import re
import nltk
import numpy as np
import pandas as pd
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from termcolor import colored
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Import datasets
print("Loading data")
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Setting stopwords
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove("not")

# Function to expand tweet
def expand_tweet(tweet):
	expanded_tweet = []
	for word in tweet:
		if re.search("n't", word):
			expanded_tweet.append(word.split("n't")[0])
			expanded_tweet.append("not")
		else:
			expanded_tweet.append(word)
	return expanded_tweet

# Function to process tweets
def clean_tweet(data, wordNetLemmatizer, porterStemmer):
	data['Clean_tweet'] = data['Tweet']
	print(colored("Removing user handles starting with @", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].str.replace("@[\w]*","")
	print(colored("Removing numbers and special characters", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].str.replace("[^a-zA-Z' ]","")
	print(colored("Removing urls", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].replace(re.compile(r"((www\.[^\s]+)|(https?://[^\s]+))"), "")
	print(colored("Removing single characters", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].replace(re.compile(r"(^| ).( |$)"), " ")
	print(colored("Tokenizing", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].str.split()
	print(colored("Removing stopwords", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: [word for word in tweet if word not in STOPWORDS])
	print(colored("Expanding not words", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: expand_tweet(tweet))
	print(colored("Lemmatizing the words", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: [wordNetLemmatizer.lemmatize(word) for word in tweet])
	print(colored("Stemming the words", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: [porterStemmer.stem(word) for word in tweet])
	print(colored("Combining words back to tweets", "yellow"))
	data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: ' '.join(tweet))
	return data

# Define processing methods
wordNetLemmatizer = WordNetLemmatizer()
porterStemmer = PorterStemmer()

# Pre-processing the tweets
print(colored("Processing train data", "green"))
train_data = clean_tweet(train_data, wordNetLemmatizer, porterStemmer)
train_data.to_csv('data/clean_train.csv', index = False)
print(colored("Train data processed and saved to data/clean_train.csv", "green"))
print(colored("Processing test data", "green"))
test_data = clean_tweet(test_data, wordNetLemmatizer, porterStemmer)
test_data.to_csv('data/clean_test.csv', index = False)
print(colored("Test data processed and saved to data/clean_test.csv", "green"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Loading data
[32mProcessing train data[0m
[33mRemoving user handles starting with @[0m
[33mRemoving numbers and special characters[0m
[33mRemoving urls[0m
[33mRemoving single characters[0m
[33mTokenizing[0m
[33mRemoving stopwords[0m
[33mExpanding not words[0m
[33mLemmatizing the words[0m
[33mStemming the words[0m
[33mCombining words back to tweets[0m
[32mTrain data processed and saved to data/clean_train.csv[0m
[32mProcessing test data[0m
[33mRemoving user handles starting with @[0m
[33mRemoving numbers and special characters[0m
[33mRemoving urls[0m
[33mRemoving single characters[0m
[33mTokenizing[0m
[33mRemoving stopwords[0m
[33mExpanding not words[0m
[33mLemmatizing the words[0m
[33mStemming the words[0m
[33mC

In [27]:
import os
import tensorflow
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
from termcolor import colored

# Load data
print(colored("Loading train and test data", "yellow"))
train_data = pd.read_csv('data/clean_train.csv')
test_data = pd.read_csv('data/clean_test.csv')
print(colored("Data loaded", "yellow"))

# Tokenization
print(colored("Tokenizing and padding data", "yellow"))
tokenizer = Tokenizer(num_words = 2000, split = ' ')
tokenizer.fit_on_texts(train_data['Clean_tweet'].astype(str).values)
train_tweets = tokenizer.texts_to_sequences(train_data['Clean_tweet'].astype(str).values)
max_len = max([len(i) for i in train_tweets])
train_tweets = pad_sequences(train_tweets, maxlen = max_len)
test_tweets = tokenizer.texts_to_sequences(test_data['Clean_tweet'].astype(str).values)
test_tweets = pad_sequences(test_tweets, maxlen = max_len)
print(colored("Tokenizing and padding complete", "yellow"))

# Building the model
print(colored("Creating the LSTM model", "yellow"))
model = Sequential()
model.add(Embedding(2000, 128, input_length = train_tweets.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout = 0.2))
model.add(Dense(3, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

# Training the model
print(colored("Training the LSTM model", "green"))
history = model.fit(train_tweets, pd.get_dummies(train_data['Sentiment']).values, epochs = 10, batch_size = 128, validation_split = 0.2)
print(colored(history, "green"))

# Testing the model
print(colored("Testing the LSTM model", "green"))
score, accuracy = model.evaluate(test_tweets, pd.get_dummies(test_data['Sentiment']).values, batch_size = 128)
print("Test accuracy: {}".format(accuracy))

[33mLoading train and test data[0m
[33mData loaded[0m
[33mTokenizing and padding data[0m
[33mTokenizing and padding complete[0m
[33mCreating the LSTM model[0m
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 128)           256000    
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 50, 128)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 771       
Total params: 651,011
Trainable params: 651,011
Non-trainable params: 0
_________________________________________________________________
[32mTraining the LSTM model[0m


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2070 samples, validate on 518 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[32m<keras.callbacks.callbacks.History object at 0x7faeaa484ef0>[0m
[32mTesting the LSTM model[0m
Test accuracy: 0.3956723213195801


In [28]:
import numpy as np
import pandas as pd
from termcolor import colored
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
print(colored("Loading train and test data", "yellow"))
train_data = pd.read_csv('data/clean_train.csv')
test_data = pd.read_csv('data/clean_test.csv')
print(colored("Data loaded", "yellow"))

# Tf-IDF
print(colored("Applying TF-IDF transformation", "yellow"))
tfidfVectorizer = TfidfVectorizer(min_df = 5, max_features = 1000)
tfidfVectorizer.fit(train_data['Clean_tweet'].apply(lambda x: np.str_(x)))
train_tweet_vector = tfidfVectorizer.transform(train_data['Clean_tweet'].apply(lambda x: np.str_(x)))
test_tweet_vector = tfidfVectorizer.transform(test_data['Clean_tweet'].apply(lambda x: np.str_(x)))

# Training
print(colored("Training Random Forest Classifier", "yellow"))
randomForestClassifier = RandomForestClassifier()
randomForestClassifier.fit(train_tweet_vector, train_data['Sentiment'])

# Prediction
print(colored("Predicting on train data", "yellow"))
prediction = randomForestClassifier.predict(train_tweet_vector)
print(colored("Training accuracy: {}%".format(accuracy_score(train_data['Sentiment'], prediction)*100), "green"))

print(colored("Predicting on test data", "yellow"))
prediction = randomForestClassifier.predict(test_tweet_vector)
print(colored("Testing accuracy: {}%".format(accuracy_score(test_data['Sentiment'], prediction)*100), "green"))

[33mLoading train and test data[0m
[33mData loaded[0m
[33mApplying TF-IDF transformation[0m
[33mTraining Random Forest Classifier[0m
[33mPredicting on train data[0m
[32mTraining accuracy: 99.34312210200927%[0m
[33mPredicting on test data[0m
[32mTesting accuracy: 49.92272024729521%[0m
