## some documentation
- process slang:
    * https://github.com/vi3k6i5/flashtext1

In [1]:
!pip install nltk 
!pip install stanza
!pip install spacy
!pip install emoji --upgrade
!spacy download en_core_web_sm # sm md lg
!python -m spacy download en

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |████████████████████████████████| 235kB 4.6MB/s 
Installing collected packages: stanza
Successfully installed stanza-1.1.1
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)
[K     |████████████████████████████████| 51kB 2.5MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49716 sha256=efef26295e578871657d143089f01ea9cdc8adde795e7d425e5d1f5570c9457c
  Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-

In [2]:
# nltk
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

STOPWORDS = set(stopwords.words('english'))
NLTK_WORDS = set(words.words())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [3]:
def nltk_lemma(text):
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize(text)

In [4]:
# Stanza NLP
import stanza

stanza.download('en', package='ewt', processors='tokenize,mwt,pos,lemma', verbose=True)
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma',
                      lang='en',
                      use_gpu=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 19.9MB/s]                    
2020-08-28 02:27:40 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| pretrain  | ewt     |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/tokenize/ewt.pt: 100%|██████████| 631k/631k [00:00<00:00, 2.20MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/pos/ewt.pt: 100%|██████████| 22.1M/22.1M [00:01<00:00, 21.8MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/lemma/ewt.pt: 100%|██████████| 3.36M/3.36M [00:00<00:00, 6.92MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/pretrain/ewt.pt: 100%|██████████| 156M/156M [00:24<00:00, 6.39MB/s]
2020-08-28 02:28:09 INFO: Finished downloading models and saved to /root/stanza_resources.
2020-08-28

In [5]:
# stanza
def stanza_lemma(text):
    doc = stNLP(text)
    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

In [6]:
# Spacy NLP
import spacy
spNLP = spacy.load('en_core_web_sm')
spNLP.max_length = 103950039 # or higher
# spacy.prefer_gpu() #will not work with stanza

### **VADER**
* VADER, has different ratings depending on the form of the word and therefore the input should not be stemmed or lemmatized.

* disadvantage of this approach is that Out of Vocab (OOV) words that the sentiment analysis tool has not seen before will not be classified as positive/negative (e.g. typos).

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
SIA = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




### **TextBlob**
* use a bag of words classifier, but the advantage is that it includes subjetivity analysis (factual/opinated)
* it doesn't contain the heuristics that nltk has, it won't intensify or negate a sentence's sentiment

* will return the subjectivity of the text

In [8]:
from textblob import TextBlob

### **Flair**
* classifier based on a character-leval LSTM. Takes a sequences of letters and words into account when predicting

* one of its biggest advantages is that it can predict a sentiment for OOV words that it has never seen before too (such as typos)

In [9]:
!pip3 install flair
# tiene en cuenta secuencias de letras y palabras al predecir
import flair
flair_sent = flair.models.TextClassifier.load('en-sentiment')

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/09/b8/5d0c60b18926414786b988e1a7fbf0e010837b1bf4aa80b955571b552d59/flair-0.6-py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 5.5MB/s 
[?25hCollecting transformers>=3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 17.5MB/s 
[?25hCollecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/ff/e2/3b51c53dffb1e52d9210ebc01f1fb9f2f6eba9b3201fa971fd3946643c71/ftfy-5.8.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 8.9MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |██████████████████████████████

2020-08-28 02:29:05,878 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert.pt not found in cache, downloading to /tmp/tmpyd1prkx0


100%|██████████| 266170364/266170364 [00:55<00:00, 4811188.62B/s]

2020-08-28 02:30:01,666 copying /tmp/tmpyd1prkx0 to cache at /root/.flair/models/sentiment-en-mix-distillbert.pt





2020-08-28 02:30:01,927 removing temp file /tmp/tmpyd1prkx0
2020-08-28 02:30:01,959 loading file /root/.flair/models/sentiment-en-mix-distillbert.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [11]:
# labels
highlights = {
    # related with speech recognition
    'professional qualities': ['handles pressure'],
    'soft skills': ['silence'],
    'answer analysis': ['filler words', 'long pause', 'focus', 'patience'], 

    'polarities': {
        'negative': [
                     # confidence
                     'not confident', 
                     'unsure',

                     # professional qualities
                     '',
                     'disordered',
                     'talkative',
                     'uninsterested', # 'engaged'

                     # soft skills
                     'sad',
                     'unfriendly'
                     ],

        'positive': [
                     # confidence
                     'confident', 
                     'certany',

                     # professional qualities
                     'handles pressure',
                     'organized',
                     'concise', 
                     'interested', # 'engaged'

                     # soft skills
                     'happy',
                     'friendly'
                     ]
    }
}

main_lst = list(highlights.values())
main_labels = [k for j in main_lst for k in j]

neg_pos_lst = highlights['polarities'].values()
neg_pos_labels = [k for j in neg_pos_lst for k in j]

In [12]:
neg_pos_lst

dict_values([['not confident', 'unsure', '', 'disordered', 'talkative', 'uninsterested', 'sad', 'unfriendly'], ['confident', 'certany', 'handles pressure', 'organized', 'concise', 'interested', 'happy', 'friendly']])

In [13]:
neg_pos_labels

['not confident',
 'unsure',
 '',
 'disordered',
 'talkative',
 'uninsterested',
 'sad',
 'unfriendly',
 'confident',
 'certany',
 'handles pressure',
 'organized',
 'concise',
 'interested',
 'happy',
 'friendly']

In [14]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import re

from emoji import demojize

In [15]:
def load_data(data_path=None):
    print('load the dataset...\n')
    !mkdir -p data
    !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
    !unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

In [16]:
def preprocess_dataset(PATH_FILE, index_col=None):
    print('preprocess the dataset...\n')

    # load_data
    load_data()
    print('Database loaded\n')

    # cleaning data
    unclean_df = pd.read_csv(PATH_FILE,
                     names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                     encoding='latin-1') # if utf-8: UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 232719-232720: invalid continuation byte

    # replace polarity
    unclean_df.polarity = unclean_df.polarity.replace({0: 0, 4: 1}) 
    
    # dropping unneeded columns
    unclean_df = unclean_df.drop(columns=['id', 'date', 'query', 'user']) 

    # lower case
    unclean_df['text'] = unclean_df['text'].str.lower()

    # removing urls
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'https://www\.|http:\.|https://|www\.', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil|cl)[\S]*\s?', '', x))

    # remove special character and numbers
    #unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-zÁ-Úá-ú \t])|(\w+:\/\/\S+)|^rt|http.+?%', '', x))
    #unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'\d+', '', x))

    # remove repetitions (goood ==> god)
    pattern = re.compile(r'(.)\1{2,}', re.DOTALL)
    unclean_df['text'] = unclean_df['text'].str.replace(pattern, r'\1')

    # removing empty values replacing them with nan
    nan_value = float('NaN')
    unclean_df.replace('', nan_value, inplace=True)
    unclean_df.dropna(inplace=True) # add subset

    # removing stopwords
    #df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in (STOPWORDS)]))

    # filtering and removing non-english words or misspelling
    #df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i.lower() in NLTK_WORDS or not i.isalpha()]))

    # rewritting the created file without NaN values
    unclean_df.to_csv('data/sentiment140-subset.csv', 
              quotechar='"', # check later!
              encoding='utf-8',
              index=False)

    # clean csv
    df = pd.read_csv('data/sentiment140-subset.csv', encoding='utf-8', warn_bad_lines=True).dropna()

    # checking if there's any NaN values
    isnull = [i for i in (df['text'].isnull()) if i == True]
    if isnull != []:
        sys.exit(0) # add response object here

    return df

In [17]:
df = preprocess_dataset(PATH_FILE='data/training.1600000.processed.noemoticon.csv')

preprocess the dataset...

load the dataset...

--2020-08-28 02:30:08--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2020-08-28 02:30:11 (37.9 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  
Database loaded



In [18]:
from emoji import demojize, emojize
# removing emojis
#df['text'] = df['text'].apply(lambda x: demojize(string=x))
#df_emojis = df['text'].apply(lambda x: re.findall(r':[a-z_]+:', string=demojize(x)))

In [20]:
#for n, i in enumerate(df.emojis):
#    if i != []:
#        print(n, emojize(str(i)))

In [21]:
test_csv = df.to_csv('checking_csv.csv', quotechar='"', encoding='utf-8')

In [25]:
'''
emo_dict = {}
for i, j in df_emojis.iteritems():
    for k in j:
        if k in emo_dict:
            emo_dict[k] += 1
        else:
            emo_dict[k] = 1

df_hashtags = df['text'].apply(lambda x: re.findall(r'#/S+', string=x))
hashtags = {}
for i, j in df_hashtags.iteritems(): 
    for k in j:
        if k in hashtags:
            hashtags[k] += 1
        else:
            hashtags[k] = 1
            
for i, c in sorted(emo_dict.items(), key=lambda x: x[1], reverse=True):
    print(emojize(i) + i + str(c))
'''

"\nemo_dict = {}\nfor i, j in df_emojis.iteritems():\n    for k in j:\n        if k in emo_dict:\n            emo_dict[k] += 1\n        else:\n            emo_dict[k] = 1\n\ndf_hashtags = df['text'].apply(lambda x: re.findall(r'#/S+', string=x))\nhashtags = {}\nfor i, j in df_hashtags.iteritems(): \n    for k in j:\n        if k in hashtags:\n            hashtags[k] += 1\n        else:\n            hashtags[k] = 1\n            \nfor i, c in sorted(emo_dict.items(), key=lambda x: x[1], reverse=True):\n    print(emojize(i) + i + str(c))\n"

In [26]:
def siaVader_compound(text):
    scores = SIA.polarity_scores(text)
    
    comp_score = scores['compound']
    if comp_score >= 0.05:
        str_comp = 'pos'
    elif comp_score <= -0.05:
        str_comp = 'neg'
    else: # (compound score > -0.05) and (compound score < 0.05)
        str_comp = 'neu'
    return str_comp

def siaVader_maxScore(text):
    scores = SIA.polarity_scores(text)
    
    del scores['compound']
    index = np.argmax(list(scores.values()))
    vader_MaxScore = list(scores.values())[index]
    vader_label = list(scores)[index]
    
    return vader_label

###
def siaVader_byWord(text):
    c = 0
    for n, y in enumerate(text):
        x = SIA.polarity_scores(y)
        if x['compound'] != 0.0:
            c += 1
            # print('{}. {} {}'.format(c, x, y))
            return 'pos' if x > 0.05 else 'neg'

In [None]:
def text_blob_subject(text):
    return TextBlob(text).sentiment

In [None]:
def flair_lstm(text):
    x = flair.data.Sentence(text)
    flair_sent.predict(sentences=x)
    total_sent = x.labels
    return total_sent 

In [28]:
# testing
df_test = pd.read_csv('emo_test.csv', sep='\t')
df_test['SIA'] = df_test['word'].apply(lambda x: siaVader_compound(x))
df_test['comp_val'] = df_test['word'].apply(lambda x: (SIA.polarity_scores(x))['compound'])
df_test['VADER_pos_neg'] = df_test['word'].apply(
    lambda x: SIA.polarity_scores(x)['pos'] if SIA.polarity_scores(x)['pos'] > SIA.polarity_scores(x)['neg'] else SIA.polarity_scores(x)['neg']
    )

In [None]:
df_test['flair'] = df_test['word'].apply(lambda x: flair_lstm(x))

In [30]:
df['comp_label'] = df['text'].apply(lambda x: siaVader_compound(data=x))

In [31]:
df['maxScore_label'] = df['text'].apply(lambda x: siaVader_maxScore(data=x))

In [32]:
df['comp_score'] = df['text'].apply(lambda x: SIA.polarity_scores(x)['compound'])

In [33]:
df['SIA'] = df['text'].apply(lambda x: SIA.polarity_scores(x))

In [None]:
df['flair'] = df['text'].apply(lambda x: flair_lstm(x))

In [None]:
df.tail(70)

In [None]:
df.to_csv('check_csv_2.csv', encoding='utf-8')