## some documentation to check
- process slang:
    * https://github.com/vi3k6i5/flashtext1

In [1]:
# Label taxonomy, built group by group (insertion order is preserved).
highlights = {}

# groups related with speech recognition
highlights['professional qualities'] = ['handles pressure']
highlights['soft skills'] = ['silence']
highlights['answer analysis'] = ['filler words', 'long pause', 'focus', 'patience']

# Polarity labels: each sub-list is ordered confidence -> professional
# qualities -> soft skills, mirroring its counterpart in the other list.
# NOTE(review): the negative list carries an empty-string label, and
# 'uninsterested' / 'certany' look misspelled - confirm before relying on them.
highlights['polarities'] = {
    'negative': [
        'not confident', 'unsure',                         # confidence
        '', 'disordered', 'talkative', 'uninsterested',    # professional qualities ('engaged')
        'sad', 'unfriendly',                               # soft skills
    ],
    'positive': [
        'confident', 'certany',                                     # confidence
        'handles pressure', 'organized', 'concise', 'interested',   # professional qualities ('engaged')
        'happy', 'friendly',                                        # soft skills
    ],
}

# Flatten the top-level groups. The 'polarities' entry is a dict, so it
# contributes its keys ('negative', 'positive') rather than its labels.
main_lst = list(highlights.values())
main_labels = []
for group in main_lst:
    main_labels.extend(group)

# Flatten the polarity labels: all negatives first, then all positives.
neg_pos_lst = highlights['polarities'].values()
neg_pos_labels = []
for group in neg_pos_lst:
    neg_pos_labels.extend(group)

In [2]:
neg_pos_lst

dict_values([['not confident', 'unsure', '', 'disordered', 'talkative', 'uninsterested', 'sad', 'unfriendly'], ['confident', 'certany', 'handles pressure', 'organized', 'concise', 'interested', 'happy', 'friendly']])

In [3]:
neg_pos_labels

['not confident',
 'unsure',
 '',
 'disordered',
 'talkative',
 'uninsterested',
 'sad',
 'unfriendly',
 'confident',
 'certany',
 'handles pressure',
 'organized',
 'concise',
 'interested',
 'happy',
 'friendly']

---
# **1. Installations**
---

In [4]:
# Install the NLP libraries used throughout the notebook.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install nltk 
!pip install stanza
!pip install spacy
!pip3 install flair
!pip install textblob

!pip install emoji --upgrade

# Fetch the small English spaCy model (the trailing note lists the size suffixes: sm, md, lg).
!spacy download en_core_web_sm # sm md lg
# Per the cell output, this downloads en_core_web_sm again and links it under the shortcut name 'en'.
!python -m spacy download en

Requirement already up-to-date: emoji in /usr/local/lib/python3.6/dist-packages (0.6.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


---
# **2. Imports and downloads**
---

## Core

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import re

from emoji import demojize

## NLTK
* Words
* Stopwords
* WordNetLemmatizer
* Vader

In [6]:
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch every NLTK resource this notebook relies on, in the original order.
for resource in ('words', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

# Vocabulary of the NLTK `words` corpus, held as a set
NLTK_WORDS = set(words.words())

# English stopword list, likewise held as a set
STOPWORDS = set(stopwords.words('english'))

# Shared VADER analyzer instance used by the siaVader_* helpers
SIA = SentimentIntensityAnalyzer()



[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Stanza
* ewt; tokenize, mwt, pos, lemma
* default; tokenize, sentiment 

In [7]:
import stanza

# Download the English models: the 'ewt' package for tokenize/mwt/pos/lemma and
# the 'default' package for tokenize/sentiment.
stanza.download('en', package='ewt', processors='tokenize,mwt,pos,lemma', verbose=True)
stanza.download('en', package='default', processors='tokenize,sentiment', verbose=True)

# One shared pipeline serves both lemmatization (stanza_lemma) and sentiment scoring (stanza_funct).
# NOTE(review): no `package=` is passed here, so which of the downloaded packages each
# processor loads is left to stanza's defaults - confirm the 'ewt' models are actually used.
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,sentiment',
                      lang='en',
                      use_gpu=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 10.7MB/s]                    
2020-08-29 05:29:01 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| pretrain  | ewt     |

2020-08-29 05:29:01 INFO: File exists: /root/stanza_resources/en/tokenize/ewt.pt.
2020-08-29 05:29:01 INFO: File exists: /root/stanza_resources/en/pos/ewt.pt.
2020-08-29 05:29:01 INFO: File exists: /root/stanza_resources/en/lemma/ewt.pt.
2020-08-29 05:29:02 INFO: File exists: /root/stanza_resources/en/pretrain/ewt.pt.
2020-08-29 05:29:02 INFO: Finished downloading models and saved to /root/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 9.54MB/s]                    
2020-08-29 05:29:02 INFO: Downloading these customized

## SpaCy
* en_core_web_sm

In [8]:
import spacy

# Load the small English spaCy pipeline.
spNLP = spacy.load('en_core_web_sm')
# Raise spaCy's per-text length limit well above its default.
# NOTE(review): presumably sized so very long concatenated texts are accepted - confirm the value is still needed.
spNLP.max_length = 103950039 # or higher
# spacy.prefer_gpu() #will not work with stanza

## TextBlob
* uses a bag-of-words classifier; its advantage is that it includes subjectivity analysis (factual vs. opinionated)
* it doesn't contain the heuristics that NLTK's VADER has, so it won't intensify or negate a sentence's sentiment

* will return the subjectivity of the text

In [9]:
from textblob import TextBlob

## **Flair**
* classifier based on a character-level LSTM; it takes sequences of letters and words into account when predicting

* one of its biggest advantages is that it can predict a sentiment for OOV words that it has never seen before too (such as typos)

In [10]:
import flair
# Load flair's pre-trained English sentiment classifier once; flair_lstm reuses it for every prediction.
flair_sent = flair.models.TextClassifier.load('en-sentiment')

2020-08-29 05:29:08,093 loading file /root/.flair/models/sentiment-en-mix-distillbert.pt


---
# **3. Functions**
---

## **i. Lemmatizers**

In [11]:
def nltk_lemma(text):
    """Lemmatize ``text`` with NLTK's WordNet lemmatizer.

    The whole string is passed to ``WordNetLemmatizer.lemmatize`` in a single
    call, exactly as before; the only change is that the result is now
    returned (previously it was computed and discarded, so the function
    always returned ``None``).

    NOTE(review): ``lemmatize`` is documented as operating on a single word,
    so multi-word input probably needs to be tokenized by the caller - confirm.

    Args:
        text: the string to lemmatize.

    Returns:
        The value produced by ``WordNetLemmatizer.lemmatize(text)``.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text)

In [12]:
# stanza
def stanza_lemma(text):
    """Run ``text`` through the stanza pipeline and return its lemmas joined by single spaces."""
    lemmas = []
    for sentence in stNLP(text).sentences:
        for token in sentence.words:
            lemmas.append(token.lemma)
    return ' '.join(lemmas)

## ii. Sentiment Analyzers
---

### NLTK Vader
* VADER has different ratings depending on the form of the word, so the input should not be stemmed or lemmatized.

* A disadvantage of this approach is that out-of-vocabulary (OOV) words that the sentiment analysis tool has not seen before (e.g. typos) will not be classified as positive/negative.

In [13]:
def siaVader_compound(text):
    """Bucket the VADER compound score of ``text`` into a coarse label.

    Returns:
        'pos' when the compound score is >= 0.05, 'neg' when it is <= -0.05,
        and 'neu' for anything in between.
    """
    compound = SIA.polarity_scores(text)['compound']

    if compound >= 0.05:
        return 'pos'
    if compound <= -0.05:
        return 'neg'
    # -0.05 < compound < 0.05
    return 'neu'

def siaVader_maxScore(text):
    """Return the name of the strongest VADER component score for ``text``.

    The 'compound' entry is excluded, so the result is the key of the
    largest remaining score. Ties resolve to the key that appears first in
    the score dict, which matches the previous ``np.argmax`` behaviour.

    Args:
        text: the string to score.

    Returns:
        The key of the highest non-compound score.
    """
    scores = SIA.polarity_scores(text)

    # Work on a filtered copy instead of deleting from the analyzer's result.
    components = {label: value for label, value in scores.items() if label != 'compound'}
    return max(components, key=components.get)

###
def siaVader_byWord(text):
    """Label ``text`` by the first word that carries any VADER sentiment.

    Words are scored one at a time; the first one whose compound score is
    non-zero decides the label.

    Fixes over the previous version:
      * the label is derived from the compound score itself (the old code
        compared the whole score dict to 0.05, which raises ``TypeError``);
      * a plain string is split on whitespace so that words, not single
        characters, are scored. Any other iterable is consumed as-is.

    Args:
        text: a string, or an iterable of word strings.

    Returns:
        'pos' if the deciding word's compound score is > 0.05, otherwise
        'neg'; ``None`` when no word has a non-zero compound score.
    """
    tokens = text.split() if isinstance(text, str) else text

    for token in tokens:
        compound = SIA.polarity_scores(token)['compound']
        if compound != 0.0:
            return 'pos' if compound > 0.05 else 'neg'

    # No sentiment-bearing word found.
    return None

### **TextBlob**
* uses a bag-of-words classifier; its advantage is that it includes subjectivity analysis (factual vs. opinionated)
* it doesn't contain the heuristics that NLTK's VADER has, so it won't intensify or negate a sentence's sentiment

* will return the subjectivity of the text

In [14]:
def text_blob_subject(text):
    """Wrap ``text`` in a TextBlob and return that blob's ``sentiment`` attribute."""
    blob = TextBlob(text)
    return blob.sentiment

### **Flair LSTM**
* classifier based on a character-level LSTM; it takes sequences of letters and words into account when predicting

* one of its biggest advantages is that it can predict a sentiment for OOV words that it has never seen before too (such as typos)

In [15]:
def flair_lstm(text):
    """Classify ``text`` with the loaded flair sentiment model.

    The text is wrapped in a flair ``Sentence``, passed to the classifier's
    ``predict`` call, and the sentence's ``labels`` are returned.
    """
    tagged = flair.data.Sentence(text)
    flair_sent.predict(sentences=tagged)
    return tagged.labels

### **Stanza**
* stanza pipeline by using a CNN classifier.
* training this model on 2 class data using higher dimension word vectors achieves the 87 score reported in the original CNN classifier paper. On a three class projection of the SST test data, the model trained on multiple datasets gets 70.0%.

In [16]:
def stanza_funct(text):
    """Return the stanza sentiment of the first sentence found in ``text``.

    Only the first sentence is consulted; later sentences are ignored.
    Returns ``None`` when the pipeline yields no sentences.
    """
    first_sentence = next(iter(stNLP(text).sentences), None)
    if first_sentence is None:
        return None
    return first_sentence.sentiment

---
## **iii. Load dataset**

In [17]:
def load_data(data_path=None):
    """Download and unpack the tweet training archive into the local ``data`` directory.

    Runs three shell commands through IPython: create ``data`` if missing,
    fetch the zip archive with ``wget -nc`` (an already-present file is not
    retrieved again), and extract it with ``unzip -n``.

    Args:
        data_path: currently unused; the target directory is hard-coded to ``data``.

    Returns:
        None. The function only produces files on disk and console output.
    """
    print('load the dataset...\n')
    !mkdir -p data
    !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
    !unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

---
# **4. Preprocess dataset**
---

In [18]:
def preprocess_dataset(PATH_FILE, index_col=None):
    """Download, clean and reload the tweet dataset.

    Pipeline:
      1. ``load_data()`` makes sure the raw archive is downloaded and unpacked.
      2. The raw CSV is read and polarity codes are mapped to binary labels.
      3. The tweet text is lower-cased and stripped of URL fragments,
         mentions, special characters, digits and long character runs.
      4. Rows left with an empty value are dropped, and the result is written
         to ``data/sentiment140-subset.csv``.
      5. That file is read back (values that parse as missing are dropped)
         and returned.

    Stopword removal and dictionary-based filtering are not applied here.

    Args:
        PATH_FILE: path of the raw, header-less CSV with the columns
            polarity, id, date, query, user, text.
        index_col: accepted for backward compatibility; currently unused.

    Returns:
        pandas.DataFrame with the columns ``polarity`` and ``text`` and no
        missing values.

    Raises:
        ValueError: if the reloaded frame still contains missing text. This
            replaces the former ``sys.exit(0)``, which referenced ``sys``
            although it is never imported and would have reported success.
    """
    print('preprocess the dataset...\n')

    # Make sure the raw file is available locally.
    load_data()
    print('Database loaded\n')

    # latin-1 is required: decoding this file as utf-8 raises UnicodeDecodeError.
    unclean_df = pd.read_csv(PATH_FILE,
                             names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                             encoding='latin-1')

    # Map the polarity codes to binary labels (0 stays 0, 4 becomes 1).
    unclean_df['polarity'] = unclean_df['polarity'].replace({0: 0, 4: 1})

    # Only polarity and text are needed downstream.
    unclean_df = unclean_df.drop(columns=['id', 'date', 'query', 'user'])

    text = unclean_df['text'].str.lower()

    # Substitutions applied in order, each replacing its matches with ''.
    # NOTE(review): the `http:\.` alternative in the first pattern looks like a
    # typo for `http://`. It is kept unchanged because plain http:// URLs are
    # currently removed whole by the `\w+:\/\/\S+` alternative further down;
    # changing it would alter the cleaned text.
    substitutions = (
        # URL prefixes
        r'https://www\.|http:\.|https://|www\.',
        # tokens containing a known top-level domain
        r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil|cl)[\S]*\s?',
        # mentions, special characters, remaining URLs, leading 'rt'
        r'(@[A-Za-z0-9]+)|([^0-9A-Za-zÁ-Úá-ú \t])|(\w+:\/\/\S+)|^rt|http.+?%',
        # numbers
        r'\d+',
    )
    for pattern in substitutions:
        compiled = re.compile(pattern)
        text = text.apply(lambda tweet, rx=compiled: rx.sub('', tweet))

    # Collapse runs of three or more identical characters to a single one
    # ('whaaat' -> 'what'; note that 'goood' becomes 'god').
    # regex=True is explicit: newer pandas versions no longer default to regex
    # matching and reject a compiled pattern otherwise.
    repeated_chars = re.compile(r'(.)\1{2,}', re.DOTALL)
    unclean_df['text'] = text.str.replace(repeated_chars, r'\1', regex=True)

    # Drop rows where cleaning left an empty value.
    unclean_df = unclean_df.replace('', float('NaN')).dropna()

    # Persist the cleaned subset.
    unclean_df.to_csv('data/sentiment140-subset.csv',
                      quotechar='"',  # check later!
                      encoding='utf-8',
                      index=False)

    # Reload the cleaned file and drop any row that parses as missing.
    # (The former `warn_bad_lines=True` had no effect with the default error
    # handling and is not accepted by newer pandas, so it is omitted.)
    df = pd.read_csv('data/sentiment140-subset.csv', encoding='utf-8').dropna()

    # Final sanity check on the text column.
    if df['text'].isnull().any():
        raise ValueError('cleaned dataset still contains missing text values')

    return df

In [19]:
df = preprocess_dataset(PATH_FILE='data/training.1600000.processed.noemoticon.csv')

preprocess the dataset...

load the dataset...

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip
Database loaded



In [20]:
from emoji import demojize, emojize
# removing emojis
#df['text'] = df['text'].apply(lambda x: demojize(string=x))
#df_emojis = df['text'].apply(lambda x: re.findall(r':[a-z_]+:', string=demojize(x)))

In [21]:
#for n, i in enumerate(df.emojis):
#    if i != []:
#        print(n, emojize(str(i)))

In [22]:
# Dump the cleaned frame to disk for manual inspection.
# NOTE(review): DataFrame.to_csv returns None when it is given a path, so `test_csv`
# presumably holds None rather than the CSV content - confirm whether the name is needed.
test_csv = df.to_csv('checking_csv.csv', quotechar='"', encoding='utf-8')

In [23]:
# Disabled exploration code (emoji and hashtag frequency counts) kept as a bare
# string literal; nothing inside it runs, and as the cell's last expression the
# notebook simply echoes the string as output.
# NOTE(review): the hashtag pattern r'#/S+' looks like a typo for r'#\S+', and the
# code depends on `df_emojis`, which is only defined in commented-out code - confirm
# both before re-enabling.
'''
emo_dict = {}
for i, j in df_emojis.iteritems():
    for k in j:
        if k in emo_dict:
            emo_dict[k] += 1
        else:
            emo_dict[k] = 1

df_hashtags = df['text'].apply(lambda x: re.findall(r'#/S+', string=x))
hashtags = {}
for i, j in df_hashtags.iteritems(): 
    for k in j:
        if k in hashtags:
            hashtags[k] += 1
        else:
            hashtags[k] = 1
            
for i, c in sorted(emo_dict.items(), key=lambda x: x[1], reverse=True):
    print(emojize(i) + i + str(c))
'''

"\nemo_dict = {}\nfor i, j in df_emojis.iteritems():\n    for k in j:\n        if k in emo_dict:\n            emo_dict[k] += 1\n        else:\n            emo_dict[k] = 1\n\ndf_hashtags = df['text'].apply(lambda x: re.findall(r'#/S+', string=x))\nhashtags = {}\nfor i, j in df_hashtags.iteritems(): \n    for k in j:\n        if k in hashtags:\n            hashtags[k] += 1\n        else:\n            hashtags[k] = 1\n            \nfor i, c in sorted(emo_dict.items(), key=lambda x: x[1], reverse=True):\n    print(emojize(i) + i + str(c))\n"

In [24]:
# testing
df_test = pd.read_csv('emo_test.csv', sep='\t')
df_test['SIA'] = df_test['word'].apply(lambda x: siaVader_compound(x))
df_test['comp_val'] = df_test['word'].apply(lambda x: (SIA.polarity_scores(x))['compound'])
df_test['VADER_pos_neg'] = df_test['word'].apply(
    lambda x: SIA.polarity_scores(x)['pos'] if SIA.polarity_scores(x)['pos'] > SIA.polarity_scores(x)['neg'] else SIA.polarity_scores(x)['neg']
    )

In [25]:
# flair prediction for every test word
df_test['flair'] = df_test['word'].apply(flair_lstm)

In [27]:
# Coarse VADER label (pos/neg/neu) per tweet
df['comp_label'] = df['text'].apply(siaVader_compound)

In [None]:
# Name of the strongest non-compound VADER score per tweet
df['maxScore_label'] = df['text'].apply(siaVader_maxScore)

In [None]:
# Raw VADER compound score per tweet
df['comp_score'] = df['text'].apply(
    lambda tweet: SIA.polarity_scores(tweet)['compound']
)

In [None]:
# Full VADER score dict per tweet
df['SIA'] = df['text'].apply(SIA.polarity_scores)

In [None]:
# flair prediction per tweet (the classifier is invoked once for every row)
df['flair'] = df['text'].apply(flair_lstm)

In [None]:
df

In [None]:
df.to_csv('check_csv_2.csv', encoding='utf-8')

In [None]:
df.tail(60)