In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#import nltk
#from nltk.corpus import stopwords

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras import optimizers

Using TensorFlow backend.


In [3]:
"""Pass the dataset with data in the format: class:content"""
"""Text pre-processing: Removing links, special characters, and digits. df_column[1] is also converted into lower case"""
def preprocess_dataset(df, df_column):
    df[df_column[1]] = df[df_column[1]].str.replace('https?:\/\/.*[\r\n]*', ' ')

    df[df_column[1]] = df[df_column[1]].str.replace('[^a-zA-Z0-9 \n]', ' ')

    df[df_column[1]] = df[df_column[1]].str.replace('\d+', ' ')

    df[df_column[1]] = df[df_column[1]].str.lower()
    return df

In [4]:
"""To label the sentiment classes using integers. Not to be used for the neural network"""
def to_categorical(df):
    df.sentiment = pd.Categorical(df.sentiment)
    df['class'] = df.sentiment.cat.codes
    return df['class']

In [5]:
"""Function returns the one-hot representation of the sentiment classes"""
def to_OneHot(df, df_columns):
    b = pd.get_dummies(df[df_column[0]], prefix="")
    list1 = list(b)
    OneHot = b[list1[0]]
    OneHot = np.column_stack(b[list1[i]] for i in range(len(list1)))
    print(len(list1))
    print(OneHot)
    return OneHot

In [6]:
  """Labels can be either to_OneHot function return value or to_categorical function return value"""
def split_train_test(df, df_column, labels, test_split = 0.2):
    X_train, X_test, y_train, y_test = train_test_split(df[df_column[1]], labels, test_size = test_split, random_state = 10)
    return X_train, X_test, y_train, y_test

# CountVectorizer(BOW)

In [7]:
"""Representing the pre-processed dataset as a bag of words"""
def to_bow(X_train, X_test, ngram_lower, ngram_upper, to_lower):
    vect = CountVectorizer(ngram_range=(ngram_lower,ngram_upper), stop_words='english', lowercase=to_lower)
    vect.fit_transform(X_train)
    return vect.transform(X_train), vect.transform(X_test)  #"""X_train_cv, etc"""

# LDA

In [8]:
"""y_train must be in categorically labelled form"""
def eval_LDA(X_train, X_test, y_train, n_comp=2):
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit_transform(X_train.toarray(), y_train)
    X_train_df = lda.transform(X_train)
    X_test_df = lda.transform(X_test)

# TruncatedSVD

###### Maxm value of reduced feature set ~ 700 

In [9]:
"""PCA won't work here"""
def trunSVD(X_train_transform, X_test_transform, nf = 784, n_iterations = 5 ):
    svd = TruncatedSVD(n_components=nf, n_iter = n_iterations)
    svd.fit_transform(X_train_transform)
    return svd.transform(X_train_transform), svd.transform(X_test_transform)

# TFIDF

In [10]:
def tfidf(X_train, X_test, ngram_lower = 1, ngram_upper = 1):
    tfidfvect = TfidfVectorizer(ngram_range=(ngram_lower,ngram_upper), stop_words='english')
    tfidfvect.fit_transform(X_train)
    return tfidfvect.transform(X_train), tfidfvect.transform(X_test)

# Model

In [36]:
def model_train(X_train, y_train, batch_size = 15, nb_epoch = 15 ,optimizer = 'adam', loss = 'categorical_crossentropy', activation='tanh', hidden_neurons = 30):
    model = Sequential()
    model.add(Dense(units = 784,input_dim=X_train.shape[1], activation = activation))
    model.add(Dense(hidden_neurons, activation = activation))
    model.add(Dense(classes, activation = 'softmax'))
    model.compile(optimizer = optimizer, loss= loss, metrics = ['accuracy'])
    model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1)
    return model

In [34]:
def eval_model(X_test, y_test, model):
    scores = model.evaluate(X_test, y_test)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# Testing

In [11]:
df = pd.read_csv('./Datasets/Twitter Sentiment Analysis/train_data.csv')
df_column = list(df)      #Names of the columns of the dataframe
classes = df[df_column[0]].unique().size #Number of distinct classes for the dataset. 13 for the given dataset
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [12]:
df = preprocess_dataset(df, df_column)

In [13]:
X_train, X_test, y_train, y_test = split_train_test(df, df_column, to_OneHot(df, df_column))

13
[[0 0 1 ..., 0 0 0]
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 1 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [14]:
X_train, X_test = tfidf(X_train, X_test, 1, 1)

In [15]:
# X_train, X_test = eval_LDA(X_train.toarray(), X_test.toarray(), y_train, 2)

In [16]:
X_train, X_test = trunSVD(X_train, X_test)

In [38]:
model = model_train(X_train, y_train)



Epoch 1/1


In [39]:
eval_model(X_test, y_test, model)

acc: 33.87%
