## some documentation
- process slang:
    https://github.com/vi3k6i5/flashtext

In [1]:
!pip install nltk 
!pip install stanza
!pip install spacy
!spacy download en_core_web_sm # sm md lg
!python -m spacy download en

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |████████████████████████████████| 235kB 2.7MB/s 
Installing collected packages: stanza
Successfully installed stanza-1.1.1
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
# nltk
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

STOPWORDS = set(stopwords.words('english'))
NLTK_WORDS = set(words.words())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [3]:
# Stanza NLP
import stanza

stanza.download('en', package='ewt', processors='tokenize,mwt,pos,lemma', verbose=True)
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma',
                      lang='en',
                      use_gpu=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 10.0MB/s]                    
2020-08-25 17:31:47 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| pretrain  | ewt     |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/tokenize/ewt.pt: 100%|██████████| 631k/631k [00:01<00:00, 361kB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/pos/ewt.pt: 100%|██████████| 22.1M/22.1M [00:07<00:00, 3.10MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/lemma/ewt.pt: 100%|██████████| 3.36M/3.36M [00:04<00:00, 693kB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/pretrain/ewt.pt: 100%|██████████| 156M/156M [00:57<00:00, 2.72MB/s]
2020-08-25 17:33:04 INFO: Finished downloading models and saved to /root/stanza_resources.
2020-08-25 1

In [4]:
# Spacy NLP
import spacy
spNLP = spacy.load('en_core_web_sm')
spNLP.max_length = 103950039 # or higher
# spacy.prefer_gpu() #will not work with stanza

In [5]:
def nltk_lemma(text):
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize(text)

In [6]:
# lemmatizion
# stanza
def stanza_lemma(text):
    doc = stNLP(text)
    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

In [7]:
# labels
highlights = {
    # related with speech recognition
    'professional qualities': ['handles pressure'],
    'soft skills': ['silence'],
    'answer analysis': ['filler words', 'long pause', 'focus', 'patience'], 

    'polarities': {
        'negative': [
                     # confidence
                     'not confident', 
                     'unsure',

                     # professional qualities
                     '',
                     'disordered',
                     'talkative',
                     'uninsterested', # 'engaged'

                     # soft skills
                     'sad',
                     'unfriendly'
                     ],

        'positive': [
                     # confidence
                     'confident', 
                     'certany',

                     # professional qualities
                     'handles pressure',
                     'organized',
                     'concise', 
                     'interested', # 'engaged'

                     # soft skills
                     'happy',
                     'friendly'
                     ]
    }
}

main_lst = list(highlights.values())
main_labels = [k for j in main_lst for k in j]

neg_pos_lst = highlights['polarities'].values()
neg_pos_labels = [k for j in neg_pos_lst for k in j]

In [8]:
neg_pos_lst

dict_values([['not confident', 'unsure', '', 'disordered', 'talkative', 'uninsterested', 'sad', 'unfriendly'], ['confident', 'certany', 'handles pressure', 'organized', 'concise', 'interested', 'happy', 'friendly']])

In [9]:
neg_pos_labels

['not confident',
 'unsure',
 '',
 'disordered',
 'talkative',
 'uninsterested',
 'sad',
 'unfriendly',
 'confident',
 'certany',
 'handles pressure',
 'organized',
 'concise',
 'interested',
 'happy',
 'friendly']

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import re

In [11]:
def load_data(data_path=None):
    print('load the dataset...\n')
    !mkdir -p data
    !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
    !unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

In [12]:
def preprocess_dataset(PATH_FILE, index_col=None):
    print('preprocess the dataset...\n')

    # load_data
    load_data()
    print('Database loaded\n')

    # cleaning data
    unclean_df = pd.read_csv(PATH_FILE,
                     names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                     encoding='latin-1') # if utf-8: UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 232719-232720: invalid continuation byte

    unclean_df.polarity = unclean_df.polarity.replace({0: 0, 4: 1}) # replace polarity
    unclean_df = unclean_df.drop(columns=['id', 'date', 'query', 'user']) # dropping unneeded columns

    # sample
    #df_sample = unclean_df.sample(n=500000)
    #df_sample.polarity.value_counts()

    # lower case
    unclean_df['text'] = unclean_df['text'].str.lower()

    # remove character and numbers
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'https://www\.|http:\.|https://|www\.', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil|cl)[\S]*\s?', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-zÁ-Úá-ú \t])|(\w+:\/\/\S+)|^rt|http.+?%', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'\d+', '', x))

    # rewritting the created file without NaN values
    unclean_df.to_csv('data/sentiment140-subset.csv', 
              quotechar='"', # check later!
              encoding='utf-8',
              index=False)

    # clean csv
    df = pd.read_csv('data/sentiment140-subset.csv', encoding='utf-8', warn_bad_lines=True).dropna()

    # checking if there's any NaN values
    isnull = [i for i in (df['text'].isnull()) if i == True]
    if isnull != []:
        sys.exit(0) # add response object here
    
    return df

In [13]:
df = preprocess_dataset(PATH_FILE='data/training.1600000.processed.noemoticon.csv')

preprocess the dataset...

load the dataset...

--2020-08-25 17:33:16--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2020-08-25 17:33:25 (10.2 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  
Database loaded



In [15]:
# getting stopwords
df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in (STOPWORDS)]))

# filtering non-english words or misspelling
df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i.lower() in NLTK_WORDS or not i.isalpha()]))

In [None]:
# VADER, has different ratings depending on the form of the word and therefore the input should not be stemmed or lemmatized.
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [18]:
# STOPWORDS
# Getting in a list all the stopwords of the dataframe
#spacy_stop_words = list(dict.fromkeys([str(i) for i in spNLP(' '.join(j for j in df['text'])) if i.is_stop == True]))

'''

df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))

# Lemmatization Stanza vs NLTK
df['text'] = df['text'].apply(lambda x: stanza_lemma(x))
df['text'] = df['text'].apply(lambda x: nltk_lemma(x))
# check new stopwords here!
df['text'] = df['text'].apply(lambda x: ' '.join(
    [i for i in x.split() if i not in stop_words]
))
'''

"\n\ndf['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))\n\n# Lemmatization Stanza vs NLTK\ndf['text'] = df['text'].apply(lambda x: stanza_lemma(x))\ndf['text'] = df['text'].apply(lambda x: nltk_lemma(x))\n# check new stopwords here!\ndf['text'] = df['text'].apply(lambda x: ' '.join(\n    [i for i in x.split() if i not in stop_words]\n))\n"