initial commit

andbis committed Jul 13, 2019
0 parents commit 66c28053ebd057a18c45b03a2df4b08d129cc23a
@@ -0,0 +1,264 @@
from collections import defaultdict
import gensim
import matplotlib.pyplot as plt
import sys
import os
import word2vecReader
import pandas as pd
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC

def pre_processing(c, text, labels, max_features=None, lemma=False, not_neural=True):
    """Vectorize ``text`` with the chosen feature set and split it 80/20.

    Args:
        c: Feature-set key. 1: unigram, 2: bigram, 3: combined uni+bigram,
            4: self-trained word2vec, 5: GloVe Twitter 200d,
            6: pre-trained Twitter word2vec.
        text: Iterable of documents. Keys 1-3 hand it to ``CountVectorizer``;
            keys 4-6 hand it to ``TfidfEmbeddingVectorizer``.
            NOTE(review): the embedding vectorizer iterates each document
            element-wise, so it presumably expects token lists -- confirm
            against the caller.
        labels: Target labels aligned with ``text``.
        max_features: Passed to every ``CountVectorizer`` (keys 1-3 only).
        lemma: When true, ``text`` is first passed through ``lemmatization``.
        not_neural: Only consulted for keys 4-6. When falsy, the raw
            word -> vector dict is returned instead of a train/test split.

    Returns:
        ``[xtrain, xtest, ytrain, ytest]``, or the word -> vector dict when
        ``not_neural`` is falsy and ``c`` is 4, 5 or 6.

    Raises:
        TypeError: If ``c`` is not one of the six supported keys.
    """
    # Validate the key first so a typo does not cost a full lemmatization run.
    if c not in (1, 2, 3, 4, 5, 6):
        raise TypeError(
            'Wrong key input for feature_set choice: %r; 1: unigram, 2: bigram, '
            '3: combined, 4: Self-embedding, 5: glove embedding, '
            '6: twitter embedding' % (c,))

    if lemma:
        text = lemmatization(text)

    if c in (1, 2, 3):
        if c == 1:
            # unigram feature
            print('Vectorizing unigram')
            vectorizer = CountVectorizer(
                ngram_range=(1, 1), binary=True, max_features=max_features)
        elif c == 2:
            # bigram feature
            print('Vectorizing bigram')
            vectorizer = CountVectorizer(
                ngram_range=(2, 2), binary=True, max_features=max_features)
        else:
            # combined unigram + bigram array; max_features applies to each
            # vectorizer separately, so two vectorizers are kept.
            vectorizer = FeatureUnion([
                ("unigram", CountVectorizer(
                    ngram_range=(1, 1), binary=True, max_features=max_features)),
                ("bigram", CountVectorizer(
                    ngram_range=(2, 2), binary=True, max_features=max_features)),
            ])
        X = vectorizer.fit_transform(text).toarray()
    else:
        if c == 4:
            # self training - embedding
            print('Embedding data with self_embedding')
            sentences = list(text)
            # NOTE(review): size/iter/index2word/syn0 are the gensim 3.x
            # names; gensim 4 renamed them -- confirm the pinned version.
            model = gensim.models.Word2Vec(sentences, size=200, iter=30)
            w2v = dict(zip(model.wv.index2word, model.wv.syn0))
        elif c == 5:
            # pre-trained word embedding GloVe twitter 200d
            print('Embedding data with pre-trained "glove.twitter.27B.200d", this can take some time...')
            w2v = {}
            with open("data/glove.twitter.27B.200d.txt", "r", encoding="utf-8") as lines:
                for line in lines:
                    parts = line.split()
                    if not parts:
                        continue  # tolerate blank lines in the vector file
                    w2v[parts[0]] = np.array(parts[1:], dtype='float32')
        else:
            # Pre-trained word embedding twitter data, loaded through the
            # word2vecReader module imported by this file.
            print('Embedding data with pre-trained 400mill tweets')
            embed_space = word2vecReader.twitter_embedding()
            w2v = dict(zip(embed_space.vocab, embed_space.syn0))

        if not not_neural:
            # The neural path builds its own embedding matrix from the dict.
            return w2v

        print('Vectorizing text')
        X = TfidfEmbeddingVectorizer(w2v).fit(text, labels).transform(text)

    # split test train
    print('Splitting train_test_split')
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, labels, test_size=0.2, random_state=442)
    print('Run data has %d dimensions' % xtrain.shape[1])
    return [xtrain, xtest, ytrain, ytest]

def algo_run(a, c, xtrain, xtest, ytrain, ytest):
    """Fit one classifier on the training split and print its test metrics.

    Args:
        a: Index of the estimator: 0 LogisticRegression, 1 LinearSVC,
            2 MultinomialNB (replaced by GaussianNB when ``c >= 4``).
        c: 1-based feature-set key, used to pick the label that is printed
            and to decide the naive Bayes variant.
        xtrain, xtest: Feature matrices for fitting and evaluation.
        ytrain, ytest: Matching target labels.

    Prints the accuracy, the classification report and the confusion matrix
    for the test split; nothing is returned.
    """
    feature_names = (
        'Unigram',
        'Bigram',
        'Combined uni-bigram',
        'Self-embedding',
        'Glove-twitter-embedding',
        'twitter_embedding',
    )
    estimators = (LogisticRegression, LinearSVC, MultinomialNB)

    model = estimators[a]
    c_feature_set = feature_names[c - 1]

    wants_gaussian = (a == 2) and (c >= 4)
    if wants_gaussian:
        # Naive Bayes on an embedding feature set (keys 4-6) uses GaussianNB.
        print('Fitting Gaussian Naive Bayes for word-embedding')
        model = GaussianNB

    print('Starting %s fit of %s feature' % (model, c_feature_set))
    algorithm = model()
    algorithm = algorithm.fit(xtrain, ytrain)

    accuracy = algorithm.score(xtest, ytest)
    print('Accuracy:', accuracy)

    predicted = algorithm.predict(xtest)
    report = classification_report(ytest, predicted)
    print('Classification report:\n', report)
    matrix = confusion_matrix(ytest, predicted)
    print('Confusion matrix:\n', matrix)

def neural(w2v, text, labels, max_sequence_length=None, validation_split=None):
    """Train a small 1D-convolutional classifier on frozen word embeddings.

    Args:
        w2v: Non-empty mapping word -> embedding vector; all vectors are
            expected to share the length of the first one.
        text: Iterable of raw text documents, tokenized with the Keras
            ``Tokenizer``.
        labels: Iterable of class names, each one of ``'negative'``,
            ``'neutral'`` or ``'positive'``.
        max_sequence_length: Length every token sequence is padded or
            truncated to. Defaults to the module-level
            ``MAX_SEQUENCE_LENGTH``.
        validation_split: Fraction of samples held out for validation.
            Defaults to the module-level ``VALIDATION_SPLIT``.
            NOTE(review): neither constant is defined in the visible part of
            this file -- confirm they exist or pass the values explicitly.

    Raises:
        ValueError: If ``w2v`` is empty, or (from ``list.index``) if a label
            is not one of the three known classes.

    NOTE(review): the loss/optimizer pair and the Embedding arguments were
    missing from the damaged source and are reconstructed from the Keras
    pre-trained-embeddings tutorial this code follows -- confirm.
    """
    if not w2v:
        raise ValueError('w2v must contain at least one word vector')
    if max_sequence_length is None:
        max_sequence_length = MAX_SEQUENCE_LENGTH
    if validation_split is None:
        validation_split = VALIDATION_SPLIT

    embedding_dim = len(next(iter(w2v.values())))

    from keras import utils
    from keras.layers import Conv1D, MaxPooling1D, Embedding, Dense, Input, GlobalMaxPooling1D
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.models import Model

    classes = ['negative', 'neutral', 'positive']
    labels = [classes.index(a) for a in labels]
    texts = list(text)

    tokenizer = Tokenizer()
    # The tokenizer must see the corpus before it can map words to ids;
    # without this call word_index is empty and every sequence is empty.
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=max_sequence_length)

    # Fix the width so a class missing from ``labels`` still yields three
    # columns, matching the size of the softmax output layer.
    labels = utils.to_categorical(np.asarray(labels), num_classes=len(classes))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    n_samples = data.shape[0]
    indices = np.arange(n_samples)
    # The source built this index array but never shuffled it, so the
    # re-indexing was a no-op; shuffling keeps the validation tail from
    # depending on input order. Seed numpy's global RNG for reproducibility.
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(validation_split * n_samples)

    # An explicit split index avoids the ``data[:-0]`` trap, which would give
    # an empty training set when the validation count rounds down to 0.
    split_at = n_samples - nb_validation_samples
    x_train, x_val = data[:split_at], data[split_at:]
    y_train, y_val = labels[:split_at], labels[split_at:]

    # Preparing embedding: one row per tokenizer id (row 0 is the padding id).
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    good = 0
    for word, i in word_index.items():
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            good += 1

    # Count all-zero rows, excluding the padding row at index 0.
    co = int(np.count_nonzero(~embedding_matrix.any(axis=1))) - 1
    print('number of zero vectors:', co)
    print('Found word vectors for %d words of a total of %d unique words' % (good, len(word_index)))

    # Load the matrix into an Embedding layer; trainable=False keeps the
    # pre-trained weights from being updated during training.
    embedding_layer = Embedding(len(word_index) + 1,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    # train a 1D convnet with global maxpooling
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(len(classes), activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=10, batch_size=128)



def lemmatization(text):
    """Clean tweet text and replace every token with its spaCy lemma.

    The text is lower-cased, stripped of @mentions, URLs, a few HTML entity
    prefixes and punctuation, trimmed, passed through a handful of word
    normalizations, and finally lemmatized with the ``en_core_web_sm`` model.

    Args:
        text: Iterable of strings (anything ``pandas.Series`` accepts).

    Returns:
        A list of strings, one per input document, with lemmas joined by
        single spaces. An empty input returns ``[]`` without loading spaCy.
    """
    print('Starting lemmatization of text')
    series = pd.Series(text)
    if series.empty:
        return []

    import spacy
    nlp = spacy.load("en_core_web_sm")

    # Applied in order; every pattern is a regular expression.
    cleanup_rules = (
        (r'@\w+', ''),                     # @mentions (airline handles etc.)
        (r'@[^\s]+', ''),                  # leftover @-tokens with other chars
        (r'http.?://[^\s]+[\s]?', ''),     # URLs
        (r'&amp', ''),
        (r'&gt', ''),
        (r'&lt', ''),
        (r'[^\w\s]', ''),                  # remaining punctuation
    )
    normalization_rules = (
        (r'\bcudtomers\b', 'customers'),   # common spelling mistake
        (r'\bppl\b', 'people'),
        (r'\biphone\b', 'phone'),
        # '#' is already removed by the punctuation rule above, so this rule
        # can no longer match; kept to mirror the original rule list.
        (r'#([^\s]+)', r'\1'),
    )

    series = series.str.lower()
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently disable every rule.
    for pattern, replacement in cleanup_rules:
        series = series.str.replace(pattern, replacement, regex=True)
    series = series.str.lstrip().str.rstrip()
    for pattern, replacement in normalization_rules:
        series = series.str.replace(pattern, replacement, regex=True)

    return [' '.join(token.lemma_ for token in nlp(row)) for row in series]

class TfidfEmbeddingVectorizer(object):
    """Represent each document as the IDF-weighted mean of its word vectors.

    Documents are iterables of tokens. Tokens missing from ``word2vec`` are
    skipped; a document with no known token maps to a zero vector of length
    ``dim``.
    """

    def __init__(self, word2vec, dim=0):
        """Store the embedding and work out its dimensionality.

        Args:
            word2vec: Mapping token -> vector.
            dim: Vector length to use when ``word2vec`` is empty; otherwise
                the length of the first stored vector is used.
        """
        self.word2vec = word2vec
        self.word2weight = None  # populated by fit()
        if len(word2vec) > 0:
            first = next(iter(word2vec.keys()))
            self.dim = len(word2vec[first])
        else:
            self.dim = dim

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenized documents ``X``.

        ``y`` is accepted for scikit-learn compatibility and ignored.
        Returns ``self`` so calls can be chained.
        """
        # Documents are already token sequences, so the analyzer is identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted mean vector per document in ``X``."""
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

class MeanEmbeddingVectorizer(object):
    """Represent each document as the plain mean of its word vectors.

    Documents are iterables of tokens. Tokens missing from ``word2vec`` are
    skipped; a document with no known token maps to a zero vector of length
    ``dim``.
    """

    def __init__(self, word2vec):
        """Store the embedding and record its dimensionality in ``self.dim``.

        Args:
            word2vec: Mapping token -> vector. An empty mapping gives
                ``dim == 0``.
        """
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        if len(word2vec) > 0:
            first = next(iter(word2vec.keys()))
            self.dim = len(word2vec[first])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn compatibility."""
        return self

    def transform(self, X):
        """Return an array with one mean vector per document in ``X``."""
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
@@ -0,0 +1,59 @@
<h1>Readme file for simple program to run sentiment analysis on US Airline Twitter data set</h1>

The program uses Logistic Regression, Support Vector Machine, Naive Bayes and a simple convolutional network.

The repository should include the following:
- - run script from where the algorithms are executed
- - utilities used by the script
- data/ - location to move data resources to
- word2vec_twitter_master/ - folder containing the script and utilities to unpack the downloaded embedding model


The program and code were developed and tested on OSX Version 10.13.4 in an Anaconda-built Python 3.6 environment.
The following packages are required to successfully execute the code:
- matplotlib
- pandas
- numpy
- gensim
- sklearn
- keras
- spacy ("en_core_web_sm" model used)

Move "Tweets-ariline-sentiment.csv" to "data" folder

Embedding models: to make the GloVe and Twitter embedding models work, download the following files and place them in the "data" folder:

GloVe Twitter 27b 200d pre-trained model (glove.twitter.27B.200d.txt):

Fredéric Godin Twitter 400mill 400d pre-trained model (word2vec_twitter_model.bin): - alternatively

<h2>Execution of the program</h2>
In the terminal navigate to the unpacked "sentimentanalysis" folder.
Use the following syntax for execution: **"python algorithm feature-set lemmatization(optional)"**, e.g. "python log 2" for the Logistic Regression model with the bigram feature set. Lemmatization is optional, i.e. if the argument is not included, no lemmatization is performed.

Available algorithms are:
- "log" for LogisticRegression
- "svm" for Support Vector Machine
- "nb" for Multinomial Naive Bayes(for feature_sets 1-3) + Gaussian Naive Bayes(for feature_sets 4-6)
- "neural" for network with conv1d layers ('only works with feature sets 4-6')

Available feature sets are:
- "1" for unigram
- "2" for bigram
- "3" for combined uni+bigram
- "4" for self embedding
- "5" for GloVe Twitter embedding
- "6" for Frederic Godin Twitter embedding

