In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import tokenizers
from tokenizers import normalizers
from tokenizers.pre_tokenizers import WhitespaceSplit,Split
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Punctuation, WhitespaceSplit
from tokenizers.normalizers import Lowercase, Replace
from tokenizers import Regex

In [None]:
def train_tokenizer(files=None) -> tokenizers.Tokenizer:
    """Train a word-level tokenizer on plain-text corpora.

    The normalizer lowercases the text, deletes every character that is
    neither a word character nor whitespace, and then collapses each run of
    digits into the ``<num>`` token. Punctuation is removed *before* digits
    are replaced so the angle brackets of ``<num>`` survive normalization.
    Text is split on whitespace only, and every encoded sequence is wrapped
    as ``<bos> ... <eos>`` by the post-processor.

    Args:
        files: Iterable of text-file paths to train on. Defaults to the four
            ``*_train.txt`` corpora this notebook was written for.

    Returns:
        The trained ``tokenizers.Tokenizer``.
    """
    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'
    NUM_TOKEN = '<num>'
    START_TOKEN = '<bos>'
    END_TOKEN = '<eos>'

    if files is None:
        files = ["amazon_train.txt","deloitte_train.txt","google_train.txt","meta_train.txt"]

    tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.Lowercase(),
        normalizers.Replace(Regex(r'[^\w\s]'), ""),
        normalizers.Replace(Regex('[0-9]+'), NUM_TOKEN),
    ])
    tokenizer.pre_tokenizer = WhitespaceSplit()

    trainer = WordLevelTrainer(
        special_tokens=[PAD_TOKEN, UNK_TOKEN, NUM_TOKEN, START_TOKEN, END_TOKEN]
    )
    tokenizer.train(list(files), trainer)

    # Build the template from the constants so it cannot drift from the
    # special tokens registered with the trainer.
    tokenizer.post_processor = TemplateProcessing(
        single=f"{START_TOKEN} $A {END_TOKEN}",
        special_tokens=[
            (START_TOKEN, tokenizer.token_to_id(START_TOKEN)),
            (END_TOKEN, tokenizer.token_to_id(END_TOKEN)),
        ],
    )
    return tokenizer

In [None]:
# Train the tokenizer once for the whole notebook and report its vocabulary size.
tokenizer = train_tokenizer()
vocab_size = tokenizer.get_vocab_size()
print(vocab_size)

11845


In [None]:
def token_gen(fname: str, tokenizer: tokenizers.Tokenizer):
    """Encode every line of a UTF-8 text file and return the token strings.

    Args:
        fname: Path of the text file; each line is encoded separately.
        tokenizer: Object whose ``encode(text).tokens`` yields the tokens.

    Returns:
        One string per input line: the tokens joined by single spaces,
        terminated by a newline.
    """
    with open(fname, "r", encoding="utf-8") as handle:
        raw_lines = list(handle)
    return [" ".join(tokenizer.encode(raw).tokens) + "\n" for raw in raw_lines]

In [None]:
# Run each company's review file through token_gen; every result is a list
# of space-joined token strings, one per line of the input file.
sents_amazon = token_gen("amazon.txt",tokenizer)

sents_deloitte = token_gen("deloitte.txt",tokenizer)
sents_meta = token_gen("meta.txt",tokenizer)
sents_google = token_gen("google.txt",tokenizer)

In [None]:
from transformers import pipeline
def sentiment_analysis(fname,model) :
    """Classify each sentence and collect ``[sentence, label, score]`` rows.

    Args:
        fname: Iterable of sentences. Despite the name it is iterated
            directly and never opened as a file.
        model: Callable applied to one sentence at a time; the first element
            of its result must expose ``'label'`` and ``'score'`` keys.

    Returns:
        A list with one ``[sentence, label, score]`` list per input sentence,
        in input order.
    """
    def _classify(sent):
        top = model(sent)[0]
        return [sent, top['label'], top['score']]

    return [_classify(sent) for sent in fname]

In [None]:
# Build the transformers pipeline for this checkpoint once; the cells below reuse it for every company.
model = pipeline(model="Kaludi/Reviews-Sentiment-Analysis")

In [None]:
# Score every tokenized Amazon review; rows are [text, label, score].
resultant_amazon = sentiment_analysis(sents_amazon,model)

In [None]:
# Score every tokenized Deloitte review; rows are [text, label, score].
resultant_deloitte = sentiment_analysis(sents_deloitte,model)

In [None]:
# Score every tokenized Meta review; rows are [text, label, score].
resultant_meta = sentiment_analysis(sents_meta,model)


In [None]:
# Score every tokenized Google review; rows are [text, label, score].
resultant_google = sentiment_analysis(sents_google,model)

In [None]:
# Spot-check the first Amazon row to confirm the [text, label, score] layout.
print(resultant_amazon[0])

['<bos> so this is my first week at amazon i am completing my <num>th day of work and i absolutely love it looking forward to my day off lol <eos>\n', 'Positive', 0.9855977892875671]


In [None]:
# Tabulate the Amazon predictions, then report the mean model score and the
# share of rows labelled 'Positive'.
columns = ['Text', 'Sentiment', 'Confidence']
df_amazon = pd.DataFrame(resultant_amazon, columns=columns)
average_confidence = df_amazon['Confidence'].mean()
print("Average Confidence:", average_confidence)

# .get(..., 0) keeps the count at zero when no row carries the 'Positive' label.
positive_count = df_amazon['Sentiment'].value_counts().get('Positive', 0)
# NOTE(review): despite the name this is a fraction of the rows, not a percentage.
Positive_percentage = positive_count/len(df_amazon)
print(Positive_percentage)

Average Confidence: 0.8187582866704701
0.919755877034358


In [None]:
# Tabulate the Deloitte predictions, then report the mean model score and the
# share of rows labelled 'Positive'.
columns = ['Text', 'Sentiment', 'Confidence']
df_deloitte = pd.DataFrame(resultant_deloitte, columns=columns)
average_confidence = df_deloitte['Confidence'].mean()
print("Average Confidence:", average_confidence)

# .get(..., 0) keeps the count at zero when no row carries the 'Positive' label.
positive_count = df_deloitte['Sentiment'].value_counts().get('Positive', 0)
# NOTE(review): despite the name this is a fraction of the rows, not a percentage.
Positive_percentage = positive_count/len(df_deloitte)
print(Positive_percentage)

Average Confidence: 0.8340425815196396
0.9482177263969171


In [None]:
# Tabulate the Meta predictions, then report the mean model score and the
# share of rows labelled 'Positive'.
columns = ['Text', 'Sentiment', 'Confidence']
df_meta = pd.DataFrame(resultant_meta, columns=columns)
average_confidence = df_meta['Confidence'].mean()
print("Average Confidence:", average_confidence)

# .get(..., 0) keeps the count at zero when no row carries the 'Positive' label.
positive_count = df_meta['Sentiment'].value_counts().get('Positive', 0)
# NOTE(review): despite the name this is a fraction of the rows, not a percentage.
Positive_percentage = positive_count/len(df_meta)
print(Positive_percentage)

Average Confidence: 0.8124302262804981
0.9216904909881914


In [None]:
# Tabulate the Google predictions, then report the mean model score and the
# share of rows labelled 'Positive'.
columns = ['Text', 'Sentiment', 'Confidence']
df_google = pd.DataFrame(resultant_google, columns=columns)
average_confidence = df_google['Confidence'].mean()
print("Average Confidence:", average_confidence)

# .get(..., 0) keeps the count at zero when no row carries the 'Positive' label.
positive_count = df_google['Sentiment'].value_counts().get('Positive', 0)
# NOTE(review): despite the name this is a fraction of the rows, not a percentage.
Positive_percentage = positive_count/len(df_google)
print(Positive_percentage)

Average Confidence: 0.8077500297998389
0.9010416666666666


In [None]:
# Keep only the rows the model labelled 'Negative', one frame per company.
filtered_df_amazon = df_amazon[df_amazon['Sentiment'] == 'Negative']
filtered_df_deloitte = df_deloitte[df_deloitte['Sentiment'] == 'Negative']
filtered_df_meta = df_meta[df_meta['Sentiment'] == 'Negative']
filtered_df_google = df_google[df_google['Sentiment'] == 'Negative']

In [None]:
from nltk import FreqDist
def token_filter(sentences, thresh=15):
    """Return the distinct rare tokens found in ``sentences``.

    A token counts as rare when its case-insensitive frequency over all
    sentences is at most ``thresh``. The result is meant to be used as an
    exclusion list by the negative-review cells.

    The previous version ignored ``thresh`` and always used 15; the default
    is therefore 15 so that existing ``token_filter(x)`` calls keep their
    results while an explicit ``thresh`` is now honoured.

    NOTE: this notebook redefines ``token_filter`` further down with
    different semantics; later definitions shadow this one.

    Args:
        sentences: Iterable of whitespace-separated token strings.
        thresh: Largest frequency that still counts as rare.

    Returns:
        A list of distinct rare tokens, in arbitrary order.
    """
    from collections import Counter  # stdlib replacement for nltk.FreqDist

    # split() rather than split(" "): no empty strings and no '<eos>\n'
    # pseudo-token caused by the trailing newline of each sentence.
    tokens = [token for sentence in sentences for token in sentence.split()]
    freq = Counter(token.lower() for token in tokens)
    # Look frequencies up with the same lower-casing used to count them.
    return list({token for token in tokens if freq[token.lower()] <= thresh})


In [None]:
# Collect the negative Amazon review texts and compute the token list that
# token_filter returns for them (used below as an exclusion list).
my_list_amazon = list(filtered_df_amazon['Text'])
vocab_amazon = token_filter(my_list_amazon)


In [None]:
# Collect the negative Deloitte review texts and compute the token list that
# token_filter returns for them (used below as an exclusion list).
my_list_deloitte = list(filtered_df_deloitte['Text'])
vocab_deloitte = token_filter(my_list_deloitte)


In [None]:
# Collect the negative Meta review texts and compute the token list that
# token_filter returns for them (used below as an exclusion list).
my_list_meta = list(filtered_df_meta['Text'])
vocab_meta = token_filter(my_list_meta)

In [None]:
# Collect the negative Google review texts and compute the token list that
# token_filter returns for them (used below as an exclusion list).
my_list_google = list(filtered_df_google['Text'])
vocab_google = token_filter(my_list_google)

In [None]:

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import  numpy as np
from PIL import Image
import requests

# Copy wordcloud's STOPWORDS into a set.
# NOTE(review): the negative-review cells visible below never read this set —
# confirm whether stop-word removal was intended there as well.
stopwords = set(STOPWORDS)


In [None]:
# Reduce the negative Amazon frame to its 'Text' column only.
filtered_df_amazon = pd.DataFrame({'Text':filtered_df_amazon.Text})

In [None]:
# Reduce the negative Deloitte frame to its 'Text' column only.
filtered_df_deloitte = pd.DataFrame({'Text':filtered_df_deloitte.Text})

In [None]:
# Reduce the negative Meta frame to its 'Text' column only.
filtered_df_meta = pd.DataFrame({'Text':filtered_df_meta.Text})

In [None]:
# Reduce the negative Google frame to its 'Text' column only.
filtered_df_google = pd.DataFrame({'Text':filtered_df_google.Text})

In [None]:
# Dump the negative Amazon reviews to disk with special tokens and the tokens
# listed in vocab_amazon removed. Each review is followed by one space.
# A set makes the per-token exclusion test O(1) instead of scanning a list.
excluded_tokens = {'<bos>', '<eos>', '<num>', '<unk>'} | set(vocab_amazon)
cleaned_reviews = [
    " ".join(value for value in val.lower().split() if value not in excluded_tokens)
    for val in filtered_df_amazon['Text']
]
comment_words = "".join(review + " " for review in cleaned_reviews)
# The context manager closes the file even if the write fails.
with open("comment_words_amazon_neg.txt",'w') as f:
    f.write(comment_words)

In [None]:
# Dump the negative Deloitte reviews to disk with special tokens and the
# tokens listed in vocab_deloitte removed. Each review is followed by one space.
# A set makes the per-token exclusion test O(1) instead of scanning a list.
excluded_tokens = {'<bos>', '<eos>', '<num>', '<unk>'} | set(vocab_deloitte)
cleaned_reviews = [
    " ".join(value for value in val.lower().split() if value not in excluded_tokens)
    for val in filtered_df_deloitte['Text']
]
comment_words = "".join(review + " " for review in cleaned_reviews)
# The context manager closes the file even if the write fails.
with open("comment_words_deloitte_neg.txt",'w') as f:
    f.write(comment_words)

In [None]:
# Dump the negative Meta reviews to disk with special tokens and the tokens
# listed in vocab_meta removed. Each review is followed by one space.
# A set makes the per-token exclusion test O(1) instead of scanning a list.
excluded_tokens = {'<bos>', '<eos>', '<num>', '<unk>'} | set(vocab_meta)
cleaned_reviews = [
    " ".join(value for value in val.lower().split() if value not in excluded_tokens)
    for val in filtered_df_meta['Text']
]
comment_words = "".join(review + " " for review in cleaned_reviews)
# The context manager closes the file even if the write fails.
with open("comment_words_meta_neg.txt",'w') as f:
    f.write(comment_words)

In [None]:
# Dump the negative Google reviews to disk with special tokens and the tokens
# listed in vocab_google removed. Each review is followed by one space.
# A set makes the per-token exclusion test O(1) instead of scanning a list.
excluded_tokens = {'<bos>', '<eos>', '<num>', '<unk>'} | set(vocab_google)
cleaned_reviews = [
    " ".join(value for value in val.lower().split() if value not in excluded_tokens)
    for val in filtered_df_google['Text']
]
comment_words = "".join(review + " " for review in cleaned_reviews)
# The context manager closes the file even if the write fails.
with open("comment_words_google_neg.txt",'w') as f:
    f.write(comment_words)

## POSITIVE COMMENTS

In [None]:
# Rebind the filtered_df_* names to every row whose label is not 'Negative'.
# NOTE(review): this keeps any label other than 'Negative', not only
# 'Positive' — confirm the model emits no third label.
filtered_df_amazon = df_amazon[df_amazon['Sentiment'] != 'Negative']
filtered_df_deloitte = df_deloitte[df_deloitte['Sentiment'] != 'Negative']
filtered_df_meta = df_meta[df_meta['Sentiment'] != 'Negative']
filtered_df_google = df_google[df_google['Sentiment'] != 'Negative']

In [None]:
from nltk import FreqDist
def token_filter(sentences, thresh=10):
    """Return the distinct rare tokens found in ``sentences``.

    A token counts as rare when its case-insensitive frequency over all
    sentences is at most ``thresh``. ``gen_word_cloud`` uses the result as
    an exclusion list.

    The previous version ignored ``thresh`` and always used 10; the default
    is therefore 10 so that existing ``token_filter(x)`` calls keep their
    results while an explicit ``thresh`` is now honoured.

    NOTE: this definition shadows an earlier ``token_filter`` in this
    notebook and is itself shadowed by a later one with a different return
    type.

    Args:
        sentences: Iterable of whitespace-separated token strings.
        thresh: Largest frequency that still counts as rare.

    Returns:
        A list of distinct rare tokens, in arbitrary order.
    """
    from collections import Counter  # stdlib replacement for nltk.FreqDist

    # split() rather than split(" "): no empty strings and no '<eos>\n'
    # pseudo-token caused by the trailing newline of each sentence.
    tokens = [token for sentence in sentences for token in sentence.split()]
    freq = Counter(token.lower() for token in tokens)
    # Look frequencies up with the same lower-casing used to count them.
    return list({token for token in tokens if freq[token.lower()] <= thresh})

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import  numpy as np
from PIL import Image
import requests

def gen_word_cloud(df,fname):
    """Write the cleaned review text of ``df`` to ``fname``.

    Every entry of ``df['Text']`` is lower-cased and split on whitespace.
    The tokens that are kept are written space-joined, followed by one
    space. The following tokens are dropped:

    * the special tokens ``<bos>``, ``<eos>``, ``<num>`` and ``<unk>``,
    * every token returned by ``token_filter`` for these texts,
    * wordcloud's ``STOPWORDS``.

    Despite the name, no word cloud is rendered here; only the text file is
    produced.

    Args:
        df: DataFrame with a ``'Text'`` column of token strings.
        fname: Path of the output file (overwritten).
    """
    # One set gives O(1) membership tests instead of scanning a list per token.
    excluded = set(STOPWORDS)
    excluded.update(token_filter(list(df['Text'])))
    excluded.update(('<bos>', '<eos>', '<num>', '<unk>'))

    # The context manager closes the file even if a row fails to process.
    with open(fname,'w') as f:
        for val in df['Text']:
            kept = [value for value in val.lower().split() if value not in excluded]
            f.write(" ".join(kept)+" ")

In [None]:
# Write the cleaned non-negative Amazon review text to its output file.
gen_word_cloud(filtered_df_amazon,"comment_words_amazon_pos.txt")

In [None]:
# Write the cleaned non-negative Deloitte review text to its output file.
gen_word_cloud(filtered_df_deloitte,"comment_words_deloitte_pos.txt")

In [None]:
# Write the cleaned non-negative Meta review text to its output file.
gen_word_cloud(filtered_df_meta,"comment_words_meta_pos.txt")

In [None]:
# Write the cleaned non-negative Google review text to its output file.
gen_word_cloud(filtered_df_google,"comment_words_google_pos.txt")

## Embeddings

In [None]:
# Pool the tokenized sentences of all four companies into one list.
entire_dataset = sents_amazon +sents_deloitte+sents_meta+sents_google
from nltk.probability import FreqDist

def token_filter(dataset, thresh=5):
    """Drop special and infrequent tokens from every sentence.

    A token is kept when it is not one of ``<bos>``, ``<eos>``, ``<num>``,
    ``<unk>`` and its case-insensitive frequency over the whole dataset is
    at least ``thresh``.

    Fixes relative to the previous version:

    * ``thresh`` is honoured (it was ignored in favour of a hard-coded 5).
    * Every sentence is returned (the last two used to be dropped silently).
    * Sentences are split on any whitespace, so the trailing newline no
      longer turns ``<eos>`` into ``'<eos>\\n'`` and past the filter.
    * Frequencies are looked up with the same lower-casing used to count.

    NOTE: this definition shadows the earlier ``token_filter`` definitions in
    this notebook and, unlike them, returns a tuple.

    Args:
        dataset: Sequence of whitespace-separated token strings.
        thresh: Minimum frequency a token needs in order to be kept.

    Returns:
        ``(vocab, filtered_sents)``: the distinct kept tokens as a list in
        arbitrary order, and one space-joined string of kept tokens per
        input sentence (possibly empty), in input order.
    """
    from collections import Counter  # stdlib replacement for nltk.FreqDist

    special_tokens = {'<bos>', '<eos>', '<num>', '<unk>'}
    tokenized = [sentence.split() for sentence in dataset]
    freq = Counter(token.lower() for words in tokenized for token in words)

    def keep(token):
        return token not in special_tokens and freq[token.lower()] >= thresh

    filtered_sents = [
        " ".join(token for token in words if keep(token)) for words in tokenized
    ]
    vocab = list({token for words in tokenized for token in words if keep(token)})
    return vocab,filtered_sents

# Filter the pooled sentences; `dataset` holds the filtered sentence strings.
vocab,dataset = token_filter(entire_dataset)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

def LSA(dataset, dim=50, random_state=None):
    """Compute word embeddings by truncated SVD of the word-document counts.

    Each sentence is split on whitespace (no lower-casing) and counted per
    document. The count matrix is transposed so rows are words, then reduced
    to ``dim`` components.

    Args:
        dataset: Iterable of whitespace-separated token strings (documents).
        dim: Number of SVD components per word. It was previously ignored in
            favour of a hard-coded 50, which is still the default.
        random_state: Optional seed forwarded to ``TruncatedSVD`` to make the
            embeddings reproducible. ``None`` keeps the old unseeded behavior.

    Returns:
        ``(embeddings, vocab)``: an array with one row per word, and a dict
        mapping each word to its row index.
    """
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), lowercase=False, binary=False)
    # fit_transform yields documents x words; transpose so rows are words.
    doc_word_matrix = vectorizer.fit_transform(dataset)
    word_doc_matrix = doc_word_matrix.transpose()

    svd = TruncatedSVD(n_components=dim, random_state=random_state)
    embeddings = svd.fit_transform(word_doc_matrix)

    vocab = {word: index for index, word in enumerate(vectorizer.get_feature_names_out())}
    return embeddings,vocab
# Fit the LSA space on the pooled, filtered sentences of all companies.
embeddings_dataset,vocab_dataset=LSA(dataset)

In [None]:
# Filter the Amazon sentences and fit a separate LSA space on them.
# vocab_amazon is bound twice: first to token_filter's vocabulary, then to
# LSA's word -> row-index mapping, which is the value that remains.
vocab_amazon,dataset_amazon = token_filter(sents_amazon)
embeddings_amazon,vocab_amazon = LSA(dataset_amazon)

In [None]:
# Filter the Deloitte sentences and fit a separate LSA space on them.
# vocab_deloitte is bound twice: first to token_filter's vocabulary, then to
# LSA's word -> row-index mapping, which is the value that remains.
vocab_deloitte,dataset_deloitte = token_filter(sents_deloitte)
embeddings_deloitte,vocab_deloitte = LSA(dataset_deloitte)

In [None]:
# Filter the Meta sentences and fit a separate LSA space on them.
# vocab_meta is bound twice: first to token_filter's vocabulary, then to
# LSA's word -> row-index mapping, which is the value that remains.
vocab_meta,dataset_meta = token_filter(sents_meta)
embeddings_meta,vocab_meta = LSA(dataset_meta)

In [None]:
# Filter the Google sentences and fit a separate LSA space on them.
# vocab_google is bound twice: first to token_filter's vocabulary, then to
# LSA's word -> row-index mapping, which is the value that remains.
vocab_google,dataset_google = token_filter(sents_google)
embeddings_google,vocab_google = LSA(dataset_google)

In [None]:
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

def correlation(word, embeddings_dataset,embeddings_company,vocab_dataset,vocab_company):
    """Cosine similarity between a word's dataset and company embeddings.

    Despite the name, this computes a cosine similarity, not a Pearson
    correlation.

    NOTE(review): the two embedding matrices appear to come from separately
    fitted SVDs, so their axes are presumably not aligned — confirm that
    comparing rows across them is meaningful.

    Args:
        word: Token to look up.
        embeddings_dataset: 2-D array indexed by ``vocab_dataset`` values.
        embeddings_company: 2-D array indexed by ``vocab_company`` values.
        vocab_dataset: Mapping from word to row index in ``embeddings_dataset``.
        vocab_company: Mapping from word to row index in ``embeddings_company``.

    Returns:
        The cosine similarity of the two embedding rows as a scalar.

    Raises:
        KeyError: If ``word`` is missing from either vocabulary. The previous
            version fell through to an ``UnboundLocalError`` in that case.
    """
    missing = [
        label
        for label, vocab in (("dataset", vocab_dataset), ("company", vocab_company))
        if word not in vocab
    ]
    if missing:
        raise KeyError(
            "word {!r} not found in the {} vocabulary".format(word, " and ".join(missing))
        )

    # cosine_similarity expects 2-D inputs, hence the (1, n_components) reshape.
    embedding_dataset = embeddings_dataset[vocab_dataset[word]].reshape(1, -1)
    embedding_company = embeddings_company[vocab_company[word]].reshape(1, -1)
    return cosine_similarity(embedding_dataset, embedding_company)[0][0]


In [None]:
# Single-word check: similarity of "work" between the pooled and the Amazon embeddings.
x = correlation("work",embeddings_dataset,embeddings_amazon,vocab_dataset,vocab_amazon)

In [None]:
# Show the similarity computed for "work".
print(x)

0.48048370628613357


In [None]:
# For every probe word, print its similarity between the pooled embeddings
# and each company's embeddings. One table drives both word groups, so the
# per-company print statement is written once; the printed text is unchanged.
company_spaces = [
    ("Amazon", embeddings_amazon, vocab_amazon),
    ("Deloitte", embeddings_deloitte, vocab_deloitte),
    ("Meta", embeddings_meta, vocab_meta),
    ("Google", embeddings_google, vocab_google),
]
positive_words=['work','job','pay','position','thank']
negative_words=['people','experience','manager','hr','company']

for heading, words in (("Positive word correlation", positive_words),
                       ("Negative word correlation", negative_words)):
    print(heading)
    for word in words:
        for company, company_embeddings, company_vocab in company_spaces:
            score = correlation(word, embeddings_dataset, company_embeddings, vocab_dataset, company_vocab)
            print("{}: word = {} and correlation = {}".format(company, word, score))
        print()


Positive word correlation
Amazon: word = work and correlation = 0.48048370628613357
Deloitte: word = work and correlation = 0.14394640271696085
Meta: word = work and correlation = 0.1733925800374058
Google: word = work and correlation = 0.36960640316949367

Amazon: word = job and correlation = -0.13325785120658215
Deloitte: word = job and correlation = 0.1677679684670904
Meta: word = job and correlation = 0.0515980033161018
Google: word = job and correlation = 0.2737629760311749

Amazon: word = pay and correlation = 0.2689717678033564
Deloitte: word = pay and correlation = 0.3601465736769758
Meta: word = pay and correlation = 0.38293269688758313
Google: word = pay and correlation = 0.45323462663855557

Amazon: word = position and correlation = 0.36924063566828874
Deloitte: word = position and correlation = 0.2672624552595968
Meta: word = position and correlation = 0.264023754607288
Google: word = position and correlation = 0.32063112888045914

Amazon: word = thank and correlation = -0.