## some documentation
- process slang:
    https://github.com/vi3k6i5/flashtext

In [1]:
!pip install nltk 
!pip install stanza
!pip install spacy
!spacy download en_core_web_sm # sm md lg
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
# nltk
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

STOPWORDS = set(stopwords.words('english'))
NLTK_WORDS = set(words.words())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Stanza NLP
import stanza

stanza.download('en', package='ewt', processors='tokenize,mwt,pos,lemma', verbose=True)
stNLP = stanza.Pipeline(processors='tokenize,mwt,pos,lemma',
                      lang='en',
                      use_gpu=True)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 12.8MB/s]                    
2020-08-26 01:14:38 INFO: Downloading these customized packages for language: en (English)...
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| pretrain  | ewt     |

2020-08-26 01:14:39 INFO: File exists: /root/stanza_resources/en/tokenize/ewt.pt.
2020-08-26 01:14:39 INFO: File exists: /root/stanza_resources/en/pos/ewt.pt.
2020-08-26 01:14:39 INFO: File exists: /root/stanza_resources/en/lemma/ewt.pt.
2020-08-26 01:14:39 INFO: File exists: /root/stanza_resources/en/pretrain/ewt.pt.
2020-08-26 01:14:39 INFO: Finished downloading models and saved to /root/stanza_resources.
2020-08-26 01:14:39 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-0

In [4]:
# Spacy NLP
import spacy
spNLP = spacy.load('en_core_web_sm')
spNLP.max_length = 103950039 # or higher
# spacy.prefer_gpu() #will not work with stanza

In [5]:
def nltk_lemma(text):
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize(text)

In [6]:
# lemmatizion
# stanza
def stanza_lemma(text):
    doc = stNLP(text)
    return ' '.join([word.lemma for sent in doc.sentences for word in sent.words])

In [7]:
# labels
highlights = {
    # related with speech recognition
    'professional qualities': ['handles pressure'],
    'soft skills': ['silence'],
    'answer analysis': ['filler words', 'long pause', 'focus', 'patience'], 

    'polarities': {
        'negative': [
                     # confidence
                     'not confident', 
                     'unsure',

                     # professional qualities
                     '',
                     'disordered',
                     'talkative',
                     'uninsterested', # 'engaged'

                     # soft skills
                     'sad',
                     'unfriendly'
                     ],

        'positive': [
                     # confidence
                     'confident', 
                     'certany',

                     # professional qualities
                     'handles pressure',
                     'organized',
                     'concise', 
                     'interested', # 'engaged'

                     # soft skills
                     'happy',
                     'friendly'
                     ]
    }
}

main_lst = list(highlights.values())
main_labels = [k for j in main_lst for k in j]

neg_pos_lst = highlights['polarities'].values()
neg_pos_labels = [k for j in neg_pos_lst for k in j]

In [8]:
neg_pos_lst

dict_values([['not confident', 'unsure', '', 'disordered', 'talkative', 'uninsterested', 'sad', 'unfriendly'], ['confident', 'certany', 'handles pressure', 'organized', 'concise', 'interested', 'happy', 'friendly']])

In [9]:
neg_pos_labels

['not confident',
 'unsure',
 '',
 'disordered',
 'talkative',
 'uninsterested',
 'sad',
 'unfriendly',
 'confident',
 'certany',
 'handles pressure',
 'organized',
 'concise',
 'interested',
 'happy',
 'friendly']

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import re

In [11]:
def load_data(data_path=None):
    print('load the dataset...\n')
    !mkdir -p data
    !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip -P data
    !unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

In [12]:
def preprocess_dataset(PATH_FILE, index_col=None):
    print('preprocess the dataset...\n')

    # load_data
    load_data()
    print('Database loaded\n')

    # cleaning data
    unclean_df = pd.read_csv(PATH_FILE,
                     names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                     encoding='latin-1') # if utf-8: UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 232719-232720: invalid continuation byte

    unclean_df.polarity = unclean_df.polarity.replace({0: 0, 4: 1}) # replace polarity
    unclean_df = unclean_df.drop(columns=['id', 'date', 'query', 'user']) # dropping unneeded columns

    # sample
    #df_sample = unclean_df.sample(n=500000)
    #df_sample.polarity.value_counts()

    # lower case
    unclean_df['text'] = unclean_df['text'].str.lower()

    # remove character and numbers
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'https://www\.|http:\.|https://|www\.', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil|cl)[\S]*\s?', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-zÁ-Úá-ú \t])|(\w+:\/\/\S+)|^rt|http.+?%', '', x))
    unclean_df['text'] = unclean_df['text'].apply(lambda x: re.sub(r'\d+', '', x))

    # rewritting the created file without NaN values
    unclean_df.to_csv('data/sentiment140-subset.csv', 
              quotechar='"', # check later!
              encoding='utf-8',
              index=False)

    # clean csv
    df = pd.read_csv('data/sentiment140-subset.csv', encoding='utf-8', warn_bad_lines=True).dropna()

    # checking if there's any NaN values
    isnull = [i for i in (df['text'].isnull()) if i == True]
    if isnull != []:
        sys.exit(0) # add response object here

    return df

In [13]:
df = preprocess_dataset(PATH_FILE='data/training.1600000.processed.noemoticon.csv')

preprocess the dataset...

load the dataset...

File ‘data/training.1600000.processed.noemoticon.csv.zip’ already there; not retrieving.

Archive:  data/training.1600000.processed.noemoticon.csv.zip
Database loaded



In [14]:
df.head(20)

Unnamed: 0,polarity,text
0,0,awww thats a bummer you shoulda got david c...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sa...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i he...
5,0,not the whole crew
6,0,need a hug
7,0,hey long time no see yes rains a bit only a ...
8,0,k nope they didnt have it
9,0,que me muera


In [15]:
# replacing empty rows with NaN; removing NaN
#df.replace(' ', value=float('NaN'), inplace=True).replace('', value=float('NaN'), inplace=True)
#df.dropna(subset=['text'], inplace=True)

In [16]:
# removing stopwords
df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in (STOPWORDS)]))

# filtering and removing non-english words or misspelling
df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i.lower() in NLTK_WORDS or not i.isalpha()]))

In [17]:
# VADER, has different ratings depending on the form of the word and therefore the input should not be stemmed or lemmatized.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
SIA = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [23]:
def sia_vader(data, compound=False):
    scores = SIA.polarity_scores(data)
    
    if compound:
        comp_score = scores['compound']
        if comp_score >= 0.05:
            str_comp = 'pos'
        elif comp_score <= -0.05:
            str_comp = 'neg'
        else: # (compound score > -0.05) and (compound score < 0.05)
            str_comp = 'neu'
        return str_comp
    else:
        del scores['compound']

        index = np.argmax(list(scores.values()))
        vader_MaxScore = list(scores.values())[index]
        vader_label = list(scores)[index]

        return vader_label

In [25]:
df['compound_score'] = df['text'].apply(lambda x: sia_vader(data=x, compound=True))

In [26]:
df['vader'] = df['text'].apply(lambda x: sia_vader(x))

In [27]:
df['score'] = df['text'].apply(lambda x: SIA.polarity_scores(x)['compound'])

In [29]:
df['SIA'] = df['text'].apply(lambda x: SIA.polarity_scores(x))

In [35]:
df.SIA[0]

{'compound': -0.3818, 'neg': 0.342, 'neu': 0.658, 'pos': 0.0}

In [31]:
df.head(6)

Unnamed: 0,polarity,text,compound_score,vader,score,SIA
0,0,thats bummer got carr third day,neg,neu,-0.3818,"{'neg': 0.342, 'neu': 0.658, 'pos': 0.0, 'comp..."
1,0,upset cant update might cry result school toda...,neg,neu,-0.1144,"{'neg': 0.295, 'neu': 0.516, 'pos': 0.188, 'co..."
2,0,many times ball save rest go,pos,neu,0.4939,"{'neg': 0.0, 'neu': 0.61, 'pos': 0.39, 'compou..."
3,0,whole body itchy like fire,neg,neg,-0.25,"{'neg': 0.5, 'neu': 0.222, 'pos': 0.278, 'comp..."
4,0,mad cant see,neg,neg,-0.4939,"{'neg': 0.615, 'neu': 0.385, 'pos': 0.0, 'comp..."
5,0,whole crew,neu,neu,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
