---
# **Sentiment Analysis**
---

In [1]:
pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/90/86/c3dcb600b4f9e7584ed90ea9d30a717fb5c0111574675f442c3e7bc19535/catboost-0.24.1-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.1MB 43kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.1


In [2]:
! pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |█▎                              | 10kB 16.6MB/s eta 0:00:01[K     |██▌                             | 20kB 6.6MB/s eta 0:00:01[K     |███▊                            | 30kB 7.6MB/s eta 0:00:01[K     |█████                           | 40kB 8.4MB/s eta 0:00:01[K     |██████▎                         | 51kB 7.5MB/s eta 0:00:01[K     |███████▌                        | 61kB 8.5MB/s eta 0:00:01[K     |████████▊                       | 71kB 8.7MB/s eta 0:00:01[K     |██████████                      | 81kB 9.0MB/s eta 0:00:01[K     |███████████▏                    | 92kB 8.4MB/s eta 0:00:01[K     |████████████▌                   | 102kB 8.7MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 8.7MB/s eta 0:00:01[K     |███████████████                 | 122kB 8.7MB/s eta 0:

## ii. Importing modules


In [3]:
# Data analysis
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import re

%matplotlib inline

# Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # to create a Word Cloud
from PIL import Image # Pillow with WordCloud to image manipulation

#embeddings 
import glove

#models and metrics
from sklearn.model_selection import *
from sklearn.metrics import *
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

---
# **4. Parameters**
---

In [4]:
# Length of each word-embedding vector
EMBEDDING_DIM = 50

# Numeric dataset columns used as model features alongside the embedding
features = ["Valence", "Arousal", "Dominance", "quadrant"]

# Target classes, listed as positive/negative pairs
main_labels = [
    'confident', 'unconfident',
    'can handle pressure', 'cannot handle pressure',
    'interested', 'uninterested',
    'happy', 'unhappy',
    'friendly', 'unfriendly',
]

# Class name -> integer id, numbered from 1 in the order listed above
label_dict = {name: idx for idx, name in enumerate(main_labels, start=1)}

# Integer id -> class name (reverse lookup of label_dict)
inv_label = dict(zip(label_dict.values(), label_dict.keys()))

---
# **4. Load dataset**
---

In [5]:
def load_clean_dataset():
    """
    function that loads the dataframe

    Downloads categories_dataset.csv into ./datasets unless a copy is already
    there, then reads it with the first column as the index and the 'score'
    column as float.

    Pure Python is used instead of `!mkdir` / `!wget` so the function does not
    depend on IPython shell escapes or on wget being installed.

    return : df -> dataframe
    """
    # Imported locally: this cell runs before the notebook's later `import os` cell.
    import os
    from urllib.request import urlretrieve

    url = 'https://raw.githubusercontent.com/Y4rd13/sentiment-analysis/master/datasets/results/categories_dataset.csv'
    target_dir = 'datasets'
    csv_path = os.path.join(target_dir, 'categories_dataset.csv')

    os.makedirs(target_dir, exist_ok=True)

    # Same no-clobber behaviour as `wget -nc`: keep an existing file untouched.
    if not os.path.isfile(csv_path):
        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete dataset on the next run.
        partial_path = csv_path + '.part'
        urlretrieve(url, partial_path)
        os.replace(partial_path, csv_path)

    df = pd.read_csv(csv_path, encoding='utf-8', index_col=0, dtype={'score': float})

    return df

In [6]:
# Fetch the categories CSV (skipped when already on disk) and load it as the working DataFrame.
df = load_clean_dataset()

--2020-10-04 21:08:03--  https://raw.githubusercontent.com/Y4rd13/sentiment-analysis/master/datasets/results/categories_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7464477 (7.1M) [text/plain]
Saving to: ‘datasets/categories_dataset.csv’


2020-10-04 21:08:04 (37.4 MB/s) - ‘datasets/categories_dataset.csv’ saved [7464477/7464477]



Loading the GloVe embeddings

In [7]:
import os, requests, shutil

# Make a 50-dimensional GloVe file (first 100k words) available locally.
# Strategy: reuse the local copy if present; otherwise download a pre-trimmed
# file; if that is unavailable, download the full Stanford archive, unpack the
# 50d file and trim it to the first 100k lines.
glove_dir = './data/RNN/'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)

# These are temporary files if we need to download it from the original source (slow)
data_cache = './data/cache'
glove_full_tar = 'glove.6B.zip'
glove_full_50d = 'glove.6B.50d.txt'

download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/' + glove_100k_50d
original_url = 'http://nlp.stanford.edu/data/' + glove_full_tar

if not os.path.isfile(glove_100k_50d_path):
    os.makedirs(glove_dir, exist_ok=True)

    # First, try to download a pre-prepared file directly...
    response = requests.get(download_url, stream=True)
    if response.status_code == requests.codes.ok:
        print("Downloading 42Mb pre-prepared GloVE file from RedCatLabs")
        with open(glove_100k_50d_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    else:
        # But, for some reason, RedCatLabs didn't give us the file directly
        response.close()  # release the streamed connection we are not going to read
        os.makedirs(data_cache, exist_ok=True)
        glove_full_50d_path = os.path.join(data_cache, glove_full_50d)

        if not os.path.isfile(glove_full_50d_path):
            zipfilepath = os.path.join(data_cache, glove_full_tar)
            if not os.path.isfile(zipfilepath):
                print("Downloading 860Mb GloVE file from Stanford")
                # Fetch from the Stanford URL (the RedCatLabs URL has just failed),
                # and stop on an HTTP error instead of saving an error page as the zip.
                response = requests.get(original_url, stream=True)
                response.raise_for_status()
                with open(zipfilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
            if os.path.isfile(zipfilepath):
                print("Unpacking 50d GloVE file from zip")
                import zipfile
                with zipfile.ZipFile(zipfilepath, 'r') as archive:
                    archive.extract(glove_full_50d, data_cache)

        # Explicit UTF-8 so the result does not depend on the platform default encoding.
        with open(glove_full_50d_path, 'rt', encoding='utf-8') as in_file:
            with open(glove_100k_50d_path, 'wt', encoding='utf-8') as out_file:
                print("Reducing 50d GloVE file to first 100k words")
                # Stream line by line; only the first 100k lines are needed.
                for line_no, line in enumerate(in_file):
                    if line_no >= 100000:
                        break
                    out_file.write(line)

        # The downloaded zip and the unpacked full 50d file are left in data_cache.

print("GloVE available locally")

Downloading 42Mb pre-prepared GloVE file from RedCatLabs
GloVE available locally


In [8]:
# Load the local GloVe text file prepared in the previous cell.
# Due to size constraints, only use the first 100k vectors (i.e. 100k most frequently used words)
word_embedding = glove.Glove.load_stanford( glove_100k_50d_path )
# Bare last expression: shows the embedding matrix shape as the cell output
word_embedding.word_vectors.shape

(100000, 50)

In [9]:
def get_embedding_vec(word):
    """
    Look up the embedding vector of a single word.

    The lookup is case-insensitive (the word is lower-cased first). Words that
    are not in the embedding dictionary, and non-string values (for example the
    NaN pandas yields for an empty cell), map to an all-zero "unknown" vector.

    word : the word to embed

    return : embedding vector of a word (float32 zeros of length EMBEDDING_DIM if unknown)
    """
    # Non-strings have no .lower(); treat them like any other unknown word
    if not isinstance(word, str):
        return np.zeros((EMBEDDING_DIM,), dtype='float32')  # UNK

    idx = word_embedding.dictionary.get(word.lower(), -1)
    if idx < 0:
        return np.zeros((EMBEDDING_DIM,), dtype='float32')  # UNK
    return word_embedding.word_vectors[idx]

In [10]:
def prepare_x_y(df,features):
  """
  function that creates X (features) and Y (labels)

  df : dataframe with a "word" column, a "category" column and the columns named in `features`
  features : list of column names placed ahead of the word embedding in X

  X holds, per row, the selected feature columns followed by the embedding of
  the row's word. Y holds the integer id of the row's category from label_dict.

  return : x,y
  raises : KeyError if a category is not present in label_dict
  """
  # One embedding per word, in DataFrame row order
  embedding_features = [get_embedding_vec(word) for word in df["word"]]

  feats_without_embedding = df[features]

  # Compare category names with '_' and ' ' treated as the same separator, so
  # multi-word classes ("can handle pressure" / "can_handle_pressure") resolve
  # whichever spelling the dataset or label_dict uses.
  label_lookup = {name.replace('_', ' '): idx for name, idx in label_dict.items()}
  y = np.array([label_lookup[category.replace('_', ' ')] for category in df["category"]])

  x = np.concatenate((feats_without_embedding, embedding_features), axis=1)

  return x, y

---
# **4. Training**
---

Preparation of X (features) and Y (labels) 

In [11]:
# Build the feature matrix x and the label vector y from the loaded DataFrame.
x, y = prepare_x_y(df,features)

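The model is a CatBoost classifier (catboost was installed in the first cell), trained with 5-fold cross-validation; one model is saved per fold.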

In [12]:
# Cross-validation setup: shuffled K-fold with a fixed seed
n_splits = 5
skf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# oof_pred collects each row's prediction from the fold where it was held out.
# y_pred is a placeholder for averaged test-set predictions; nothing fills it here.
oof_pred = np.zeros((len(x), 1))
y_pred = np.zeros((len(y), 1))
acc_score = []

# 1-based fold counter used in the saved model file names
i = 1

# Train one CatBoost model per fold, score it on the held-out part, and save it
for fold, (tr_ind, val_ind) in enumerate(skf.split(x, y)):

    X_train, y_train = x[tr_ind], y[tr_ind]
    X_val, y_val = x[val_ind], y[val_ind]

    model = CatBoostClassifier(n_estimators=100, random_state=42, silent=True, task_type="GPU")
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    val_pred = model.predict(X_val)
    fold_accuracy = accuracy_score(y_val, val_pred)
    print('validation Accuracy fold-', fold + 1, ': ', fold_accuracy)

    acc_score.append(fold_accuracy)
    oof_pred[val_ind] = val_pred

    # Saved under the name the prediction function later loads
    model.save_model(f"fold{i}_catboost")
    i += 1

print("Mean Accuracy = ", np.mean(acc_score, 0))

validation Accuracy fold- 1 :  0.7689695485163948
validation Accuracy fold- 2 :  0.7760953070199855
validation Accuracy fold- 3 :  0.7724210877915716
validation Accuracy fold- 4 :  0.7708066581306018
validation Accuracy fold- 5 :  0.7769303568446251
Mean Accuracy =  0.7730445916606359


---
# **4. Predicting**
---

In [13]:
def predict_class(word,valence,arousal,dominance,quadrant):
  """
  Predict the sentiment class of a word by averaging the saved fold models.

  ARGUMENTS :

  word : the word to classify (e.g. happy, tolerant ...)

  valence,arousal,dominance,quadrant : numeric features placed ahead of the word embedding in the model input

  return : name of the predicted class

  """

  # Model input: the four numeric features followed by the word embedding
  numeric_part = [valence, arousal, dominance, quadrant]
  sample = np.concatenate((numeric_part, get_embedding_vec(word)))

  # Class probabilities from the model saved for each fold (files fold1_catboost ... foldN_catboost)
  fold_probas = []
  for fold_no in range(1, n_splits + 1):
    fold_model = CatBoostClassifier()
    fold_model.load_model(f'fold{fold_no}_catboost')
    fold_probas.append(fold_model.predict_proba(sample))

  # Average over folds; label ids start at 1 while argmax positions start at 0
  mean_proba = np.mean(fold_probas, 0)

  return inv_label[np.argmax(mean_proba) + 1]

In [28]:
# Example prediction; keyword arguments make the role of each number explicit
predict_class(word="kill", valence=0.975, arousal=0.001, dominance=0.201, quadrant=1.0)

'unfriendly'