In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import nltk
nltk.download('words')
from sklearn.preprocessing import LabelEncoder
nltk.download('averaged_perceptron_tagger')

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline

words = set(nltk.corpus.words.words())
import concurrent.futures


In [None]:
# Load the review dataset. The first CSV column becomes the row index and
# '\n' is forced as the line terminator.
reviews  = pd.read_csv('Reviews_Equal.csv', lineterminator='\n',index_col=0)

In [None]:
# Preview the first rows of the freshly loaded frame.
reviews.head()

In [None]:
# First cleaning pass over the raw review text:
#   1. delete square brackets and single quotes,
#   2. replace every comma with a space.
# The patterns are raw strings so that `\[` / `\]` are not parsed as (invalid)
# string escapes, which newer Python versions warn about.
reviews['text_processed'] = (
    reviews['Text']
    .map(lambda raw_text: re.sub(r"[\[\]']", '', raw_text))
    .map(lambda stripped: re.sub(r"[,]", ' ', stripped))
)

In [None]:
# Renumber the rows 0..n-1 and discard the old index directly, instead of
# materialising it as an 'index' column and dropping that column afterwards.
reviews = reviews.reset_index(drop=True)
# Per-column count of missing values (shown as the cell output).
reviews.isnull().sum()

In [None]:
# Remove punctuation: ( ) , . ! ? =
# The pattern is a raw string so `\(`, `\)` and `\.` are not treated as
# (invalid) string escapes.
# NOTE(review): this restarts from the raw 'Text' column, so whatever an
# earlier cell stored in 'text_processed' is overwritten here — confirm that
# this is intended.
reviews['text_processed'] = reviews['Text'].map(lambda raw_text: re.sub(r'[\(\),\.!?=]+', '', raw_text))
# Convert the review text to lowercase
reviews['text_processed'] = reviews['text_processed'].map(str.lower)
# Pad every document with one leading and one trailing space
reviews['text_processed'] = reviews['text_processed'].map(lambda lowered: ' ' + lowered + ' ')

In [None]:
def get_good_tokens(sentence):
    """Clean a list of tokens.

    Every character other than ASCII letters, digits, '!' and '?' is removed
    from each token; tokens that end up empty are discarded.

    Args:
        sentence: iterable of string tokens.

    Returns:
        list of the cleaned, non-empty tokens in their original order.
    """
    cleaned_tokens = (re.sub('[^0-9A-Za-z!?]+', '', token) for token in sentence)
    return [token for token in cleaned_tokens if token]

In [None]:
nltk.download('punkt')  # tokenizer data fetched before nltk.word_tokenize is used below


def lda_get_good_tokens(df):
    """Lowercase ``df['Text']`` in place and add a ``tokenized_text`` column.

    Each document is split with ``nltk.word_tokenize`` and the resulting
    tokens are cleaned with ``get_good_tokens`` (defined in an earlier cell).

    Args:
        df: DataFrame with a string column ``Text``. It is modified in place:
            ``Text`` is overwritten with its lowercase form and
            ``tokenized_text`` (one list of tokens per row) is added.

    Returns:
        None.
    """
    df['Text'] = df.Text.str.lower()
    df['tokenized_text'] = [get_good_tokens(nltk.word_tokenize(doc)) for doc in df.Text]


lda_get_good_tokens(reviews)

In [None]:
# POS-tag every tokenized review in worker processes. The `with` block shuts
# the pool down as soon as all results have been collected, so no idle worker
# processes are left behind for the rest of the session.
with concurrent.futures.ProcessPoolExecutor() as executor:
    reviews['tags'] = list(executor.map(nltk.pos_tag, reviews.tokenized_text))

In [None]:
nltk.download('stopwords')  # corpus read by nltk.corpus.stopwords below


def remove_stopwords(df):
    """Add a ``stopwords_removed`` column with stop words filtered out.

    The stop list is NLTK's English list extended with a hand-picked set of
    frequent verbs, a handful of non-English tokens and stray tokens such as
    'nt' and '!'.

    Args:
        df: DataFrame with a ``tokenized_text`` column holding one list of
            tokens per row. Modified in place.

    Returns:
        None.
    """
    extra_stopwords = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come',
                       'take', 'use', 'would', 'can','de', 'la', 'e', 'que', 'en', 'empresa', 'lo', 'con', 'el',
                       'para','nt','!']
    # A set makes each membership test O(1); the original list was scanned
    # linearly for every single token of every review.
    stopwords = set(nltk.corpus.stopwords.words('english')).union(extra_stopwords)

    df['stopwords_removed'] = [
        [word for word in doc if word not in stopwords]
        for doc in df['tokenized_text']
    ]


remove_stopwords(reviews)

In [None]:
nltk.download('wordnet')  # corpus used by WordNetLemmatizer below


def stem_words(df):
    """Add ``lemmatized_text`` and ``stemmed_text`` columns to ``df``.

    Tokens from ``stopwords_removed`` are first lemmatized with NLTK's
    ``WordNetLemmatizer``; the lemmas are then stemmed with ``PorterStemmer``.

    Args:
        df: DataFrame with a ``stopwords_removed`` column holding one list of
            tokens per row. Modified in place.

    Returns:
        None.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['lemmatized_text'] = [
        [lemmatizer.lemmatize(token) for token in doc]
        for doc in df.stopwords_removed
    ]

    stemmer = nltk.stem.porter.PorterStemmer()
    df['stemmed_text'] = [
        [stemmer.stem(token) for token in doc]
        for doc in df.lemmatized_text
    ]


stem_words(reviews)

In [None]:
# Columns that are not needed for the classification experiments.
unused_columns = [
    'Review_title', 'Review_Star', 'Designation', 'Location', 'Date', 'Text',
    'Helpful_Yes', 'Helpful_No', 'Company', 'lang',
    'text_processed', 'tokenized_text', 'tags',
]
reviews = reviews.drop(columns=unused_columns)
# Persist the reduced frame.
reviews.to_csv('Reviews_Classification.csv')

In [None]:
def get_strings(review):
    """Concatenate the tokens of one review into a single string.

    Every token is converted with ``str`` and preceded by one space, so a
    non-empty result always starts with a space; an empty review gives ''.
    """
    return ''.join(' ' + str(token) for token in review)

In [None]:
import concurrent.futures

# Join each review's stemmed tokens into one whitespace-separated string.
# This runs in-process on purpose: the per-row work is a trivial string join,
# so shipping rows to worker processes costs more than it saves, and a
# function defined in a notebook cell cannot be pickled for the workers when
# the platform starts processes with "spawn".
text = [get_strings(tokens) for tokens in reviews.stemmed_text]

# Assign the list directly (row order) rather than through pd.Series(text),
# which would be aligned on index labels and produce NaN for any mismatch.
reviews['preproci'] = text

In [None]:
# Drop the intermediate pre-processing columns; 'preproci' holds the final text.
# errors='ignore' is required because 'text_processed' and 'tokenized_text'
# were already removed by the drop that precedes the CSV export, so asking
# for them again would otherwise raise KeyError.
reviews = reviews.drop(
    columns=['text_processed', 'tokenized_text', 'stopwords_removed', 'lemmatized_text', 'stemmed_text'],
    errors='ignore',
)

In [None]:
# Show the joined text of the first review. It is read from 'preproci'
# (which stores get_strings(...) of every row) because 'stemmed_text' has
# already been dropped from the frame at this point.
reviews['preproci'][0]

In [None]:
# Preview the frame after the intermediate columns were dropped.
reviews.head()

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [None]:
# Split the dataset into training (70%) and test (30%) parts.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    reviews['preproci'],
    reviews['Employment_Status'],
    test_size=0.3,
    random_state=42,
)

# Label-encode the target variable. The mapping is learned from the training
# labels only and then re-used for the test labels; refitting the encoder on
# the test split could assign different integers to the same class.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

In [None]:
# Bag-of-words counts. The vocabulary is learned from every review
# (training and test rows alike), exactly as before.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect = count_vect.fit(reviews['preproci'])

# Turn both splits into sparse count matrices with the fitted vocabulary.
xtrain_count, xtest_count = (count_vect.transform(split) for split in (train_x, test_x))

In [None]:
# word level tf-idf
# Fitted on 'preproci' — the same column train_x / test_x are drawn from.
# ('text_processed' no longer exists in the frame at this point and was a
# differently pre-processed text anyway.)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(reviews['preproci'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(test_x)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(reviews['preproci'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

# characters level tf-idf (character bigrams and trigrams)
# token_pattern is only honoured when analyzer='word'; passing it with
# analyzer='char' has no effect besides a warning, so it is left out.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(reviews['preproci'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(test_x)

In [None]:
from gensim.models import Word2Vec
import gensim
import logging

from numpy import random


import re


# Load the GloVe vectors that were converted to word2vec format with
# gensim's glove2word2vec script. That script writes a plain-text file, so it
# must be read with binary=False.
wv = gensim.models.KeyedVectors.load_word2vec_format("glove.840B.300d.w2vformat.txt", binary=False)
# Pre-compute the normalised vectors, replacing the raw ones.
wv.init_sims(replace=True)


def word_averaging(wv, words):
    """Return the normalised mean of the word vectors for ``words``.

    Args:
        wv: word-vector model exposing ``vocab``, ``syn0norm`` and
            ``vector_size``.
        words: iterable of tokens. Items that already are ``np.ndarray`` are
            used as vectors directly; tokens missing from ``wv.vocab`` are
            skipped.

    Returns:
        float32 array: ``gensim.matutils.unitvec`` of the mean vector, or an
        all-zero array of length ``wv.vector_size`` (with a logged warning)
        when no item contributed a vector.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # float32 keeps the fallback consistent with the normal return value.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every document into one 2-D array.

    Args:
        wv: word-vector model handed through to ``word_averaging``.
        text_list: iterable of token lists, one per document.

    Returns:
        Array with one row per document, in input order.
    """
    document_vectors = []
    for post in text_list:
        document_vectors.append(word_averaging(wv, post))
    return np.vstack(document_vectors)
def w2v_tokenize_text(text):
    """Tokenize ``text`` sentence by sentence.

    The text is split into sentences and each sentence into words with NLTK;
    tokens shorter than two characters are dropped.

    Returns:
        Flat list of the remaining tokens in reading order.
    """
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if not len(word) < 2
    ]
    


# Tokenize both splits; .values yields an array holding one token list per review.
test_tokenized = test_x.apply(w2v_tokenize_text).values
train_tokenized = train_x.apply(w2v_tokenize_text).values

# One averaged word vector per review.
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)


In [None]:
# Convert the raw GloVe file into word2vec format with gensim's glove2word2vec script.
# NOTE(review): the cell that loads glove.840B.300d.w2vformat.txt sits earlier in
# the notebook, so on a fresh checkout this conversion has to be run first.
!python -m gensim.scripts.glove2word2vec --input  ../../glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_y=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: training feature matrix.
        label: training labels.
        feature_vector_valid: feature matrix to predict on.
        is_neural_net: when True, ``predict`` is taken to return per-class
            scores and the arg-max over the last axis is used as the label.
        valid_y: true labels for ``feature_vector_valid``. Defaults to the
            notebook-level ``test_y``, which is what earlier calls relied on.

    Returns:
        Accuracy of the predictions against ``valid_y``.
    """
    if valid_y is None:
        # Backward-compatible fallback to the global test labels.
        valid_y = test_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # scikit-learn's documented argument order is (y_true, y_pred).
    return metrics.accuracy_score(valid_y, predictions)

In [None]:
# Fit the forest in this cell so it runs on a fresh kernel; with these two
# lines commented out, `model` only existed as leftover kernel state.
model = ensemble.RandomForestClassifier(n_estimators=500,n_jobs=24)
model.fit(xtrain_count,train_y)
# Predictions on the *training* features.
predictions = model.predict(xtrain_count)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class metrics followed by the overall accuracy of `predictions`
# measured against the training labels.
train_report = classification_report(train_y, predictions)
print(train_report)
train_accuracy = metrics.accuracy_score(predictions, train_y)
print(train_accuracy)

In [None]:
# `predictions` were computed on the training features (xtrain_count), so they
# have to be compared with train_y; test_y has a different number of rows.
confusion_matrix(train_y, predictions)

In [None]:
# Inspect the encoded test labels.
test_y

In [None]:
# Multinomial Naive Bayes on every feature representation, in this order:
# count vectors, word-level TF-IDF, n-gram TF-IDF, character-level TF-IDF.
nb_runs = [
    ("NB, Count Vectors: ", xtrain_count, xtest_count),
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ("NB, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
]
for run_label, train_features, valid_features in nb_runs:
    accuracy = train_model(naive_bayes.MultinomialNB(), train_features, train_y, valid_features)
    print(run_label + str(accuracy))



In [None]:
# Logistic regression on every sparse feature representation, in this order:
# count vectors, word-level TF-IDF, n-gram TF-IDF, character-level TF-IDF.
# (A run on the averaged word vectors, X_train_word_average /
# X_test_word_average, labelled "LR, Word2Vec Vectors: ", is currently disabled.)
lr_runs = [
    ("LR, Count Vectors: ", xtrain_count, xtest_count),
    ("LR, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("LR, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ("LR, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
]
for run_label, train_features, valid_features in lr_runs:
    lr_classifier = linear_model.LogisticRegression(max_iter=500, n_jobs=-1)
    accuracy = train_model(lr_classifier, train_features, train_y, valid_features)
    print(run_label + str(accuracy))




In [None]:
# Support vector classifier (default settings) on the n-gram TF-IDF features.
svm_classifier = svm.SVC()
accuracy = train_model(svm_classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: " + str(accuracy))

In [None]:
# RF on Count Vectors
# (this heading was missing its leading '#', which made the cell a SyntaxError)
accuracy = train_model(ensemble.RandomForestClassifier(n_jobs=-1), xtrain_count, train_y, xtest_count)
print("RF, Count Vectors: " +str(accuracy))

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(n_jobs=24), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: " +str(accuracy))

# RF on averaged word vectors; the label used to repeat "WordLevel TF-IDF",
# which made the two results indistinguishable in the output.
accuracy = train_model(ensemble.RandomForestClassifier(n_jobs=24), X_train_word_average, train_y, X_test_word_average)
print("RF, Word2Vec Vectors: " +str(accuracy))

In [None]:
# NOTE(review): `xgboost` is not imported in any visible cell; without an
# `import xgboost` alongside the other imports this cell raises NameError.
# The sparse matrices are converted to CSC format before being handed over.

# Extreme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(n_jobs=24), xtrain_count.tocsc(), train_y, xtest_count.tocsc())
print("Xgb, Count Vectors: "+ str(accuracy))

# Extreme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(n_jobs=24), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: "+ str(accuracy))

# Extreme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(n_jobs=24), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: "+ str(accuracy))

In [None]:
from keras.layers import Dense, Dropout, Embedding, Flatten
from keras.models import Sequential
import tensorflow as tf

# NOTE(review): vocabulary_size, embedding_matrix, X_train, y_train, X_test and
# y_test are not defined in any visible cell — they have to be created before
# this cell can run on a fresh kernel.
model = Sequential()
# Frozen embedding layer initialised from embedding_matrix: 300 dimensions,
# input sequences of length 275.
model.add(Embedding(vocabulary_size, 300, input_length=275, weights=[embedding_matrix], trainable=False))
# Collapse the (sequence, embedding) output into one vector per review so the
# dense stack ends in a single softmax prediction per review rather than one
# per token position.
model.add(Flatten())
# Funnel of fully connected ReLU layers.
for units in (512, 256, 128, 64, 32, 16, 8, 4):
    model.add(Dense(units, activation='relu'))
# Two-way softmax output.
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', metrics=['accuracy'], loss='sparse_categorical_crossentropy')
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=50, batch_size=512, shuffle=True)
