<a href="https://colab.research.google.com/github/aithaprasad/NLP_Sentiment_Analysis/blob/master/Sentimental_Analysis_any_approach_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

**Data Preparation Step**

In [None]:
# Load the tab-separated file; header=None means no line is treated as a header,
# so the two columns are named explicitly.
data = pd.read_csv(
    'train.tsv',
    sep="\t",
    header=None,
    names=['label', 'sentence'],
)

In [None]:
# Preview the first five rows (label 0 = negative, label 1 = positive)
data.head(5)

Unnamed: 0,label,sentence
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [None]:
# True if at least one cell anywhere in the frame is missing
data.isna().to_numpy().any()

False

In [None]:
# Check data rows and columns: .shape is a (number of rows, number of columns) tuple
data.shape

(78609, 2)

In [None]:
# Features (the raw sentences) go in X, targets (the labels) go in y
X = data['sentence']
y = data['label']

In [None]:
# Hold out 20% of the rows; the remaining 80% becomes the training set
X_train, X_other, y_train, y_other = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Halve the hold-out: one half is the validation (dev) set, the other the test set
X_dev, X_test, y_dev, y_test = train_test_split(
    X_other,
    y_other,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Build the word index from the training sentences only, so the dev and test
# splits do not influence the vocabulary.
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

# Convert each split (train, dev, test - in that order) to lists of integer token ids
X_train_tok, X_dev_tok, X_test_tok = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_dev, X_test)
)

In [None]:
# Spot-check: show the token-id sequence stored at position 60000 of the training split
X_train_tok[60000]

[100, 60, 2777, 27, 2, 791, 2764, 5, 317]

In [None]:
# Sanity check on the num_words cap: print "True" once for every training
# sequence that contains the token id 50000.
# NOTE(review): presumably nothing is expected to print because the Tokenizer was
# built with num_words=50000 - confirm against the Keras Tokenizer docs.
flagged = [seq for seq in X_train_tok if 50000 in seq]
for seq in flagged:
    print("True")

In [None]:
# Number of entries in the tokenizer's word_index
len(tokenizer.word_index)

61245

In [None]:
# One more than the number of entries in the tokenizer's word_index
vocab_size = 1 + len(tokenizer.word_index)

# Common length applied to every sequence by pad_sequences
maxlen = 100

# Pad each split (train, dev, test - in that order) with padding='post'
X_train_pad, X_dev_pad, X_test_pad = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train_tok, X_dev_tok, X_test_tok)
)

**Method 1 : K Nearest-Neighbor**

1.1 Perform K-fold cross-validation and hyper-parameter tuning on training data.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import time

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 15.
# The previous np.linspace(1, 15, num=1) produced only [1], so the grid search
# evaluated a single candidate and tuned nothing.
n_neighbors = list(range(1, 16, 2))
param_grid = {'n_neighbors':n_neighbors}

start = time.time()

# Initial KNN model
estimator = KNeighborsClassifier()

# Run a 10-fold cross-validated grid search over n_neighbors.
# Labels are cast with the builtin float: the np.float alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
knn_search = GridSearchCV(estimator, param_grid, cv=10, n_jobs = 1, verbose=1)
knn_search.fit(X_train_pad, y_train.astype(float))

# Report the wall-clock time of the whole search
end = time.time()
time_duration = end-start
print("Parameter tuning finishes in {} seconds:".format(time_duration))

# Timing reference, measured with the former single-candidate grid:
# estimated running time for 10-fold CV (weights:uniform) is ~55 sec
# estimated running time for 15-fold CV (weights:uniform) is ~61 sec
# So we choose 10-fold to process. Expect the total time to grow with the
# number of candidates now in the grid.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  app.launch_new_instance()


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Parameter tuning finishes in 51.17181062698364 seconds:


1.2 Select the best model derived and train the model on the training dataset

In [None]:
# Take the estimator selected by the grid search and fit it on the training split
# with the original labels (the search itself was fitted on float-cast labels).
best_model_knn = knn_search.best_estimator_.fit(X_train_pad, y_train)
best_model_knn

KNeighborsClassifier(n_neighbors=1)

1.3 Predict the validation and test data classes and calculate the accuracy of the predictions

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score 

# Predict labels for the validation split with the selected KNN model
predicted_valid_label_knn = best_model_knn.predict(X_dev_pad)

# Share of validation sentences whose predicted label matches the true label
knn_val_acc = accuracy_score(y_true=y_dev, y_pred=predicted_valid_label_knn)
print(
    'Accuracy on Validation set {:.5f} ({:.3f}%)'.format(
        knn_val_acc, knn_val_acc * 100
    )
)

Accuracy on Validation set 0.51037 (51.037%)


In [None]:
# Predict labels for the test split with the selected KNN model
predicted_test_label_knn = best_model_knn.predict(X_test_pad)

# Share of test sentences whose predicted label matches the true label
knn_test_acc = accuracy_score(y_true=y_test, y_pred=predicted_test_label_knn)
print(
    'Accuracy on Test set {:.5f} ({:.3f}%)'.format(
        knn_test_acc, knn_test_acc * 100
    )
)

Accuracy on Test set 0.51418 (51.418%)


**Method 2 : Decision Tree**

2.1 Perform K-fold cross-validation and hyper-parameter tuning on training data

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import time

# Define parameters in param_grid to run the GridSearchCV.
# 'auto' is left out of max_features: for DecisionTreeClassifier it was an alias
# of 'sqrt' (so it only duplicated candidates), it was deprecated in
# scikit-learn 1.1 and it is rejected from 1.3 onwards.
param_grid = {'max_depth': [3, 4, 5],
              'min_samples_split': [2, 4, 5, 7],
              'min_samples_leaf' : [1, 3, 4, 5],
              'max_features': ['sqrt', 'log2']}

start = time.time()

# Initial Decision Tree model (fixed seed so the search is repeatable)
estimator = DecisionTreeClassifier(random_state = 1234)

# 3-fold cross-validated grid search; verbose=1 prints a summary instead of one
# log line per fit.
grid_search_dt = GridSearchCV(estimator, param_grid, cv = 3, verbose = 1)
grid_search_dt.fit(X_train_pad, y_train)

# Report the wall-clock time of the whole search and the winning combination
end = time.time()
time_duration = end-start
print("Parameter tuning finishes in {} seconds:".format(time_duration))
print('Best Hyper-parameter:', grid_search_dt.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.511 total time=   0.1s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.525 total time=   0.1s
[CV 3/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2;, score=0.526 total time=   0.1s
[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=4;, score=0.511 total time=   0.1s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=4;, score=0.525 total time=   0.1s
[CV 3/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=4;, score=0.526 total time=   0.1s
[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=5;, score=0.511 total time=   0.1s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=5;, score=0.525 total time= 

2.2 Select the best model derived and train the model on the training dataset

In [None]:
# Build the final tree from whatever the grid search selected, instead of a
# hand-copied set of values that can drift from the search result.
# Reference result recorded from an earlier run:
# Best Hyper-parameter: {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
best_model_dt = DecisionTreeClassifier(random_state = 1234,
                            **grid_search_dt.best_params_)

In [None]:
# Train the selected decision tree on the padded training sequences
best_model_dt.fit(X_train_pad, y_train)

DecisionTreeClassifier(max_depth=5, max_features='auto', random_state=1234)

2.3 Predict the validation and test data classes and calculate the accuracy of the predictions

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score 

# Predict labels for the validation split with the fitted decision tree
predicted_valid_label_dt = best_model_dt.predict(X_dev_pad)

# Share of validation sentences whose predicted label matches the true label
dt_val_acc = accuracy_score(y_true=y_dev, y_pred=predicted_valid_label_dt)
print(
    'Accuracy on Validation set {:.5f} ({:.3f}%)'.format(
        dt_val_acc, dt_val_acc * 100
    )
)

Accuracy on Validation set 0.58021 (58.021%)


In [None]:
# Predict labels for the test split with the fitted decision tree
predicted_test_label_dt = best_model_dt.predict(X_test_pad)

# Share of test sentences whose predicted label matches the true label
dt_test_acc = accuracy_score(y_true=y_test, y_pred=predicted_test_label_dt)
print(
    'Accuracy on test set {:.5f} ({:.3f}%)'.format(
        dt_test_acc, dt_test_acc * 100
    )
)

Accuracy on test set 0.57563 (57.563%)


**Method 3 : Support Vector Machine**

3.1 Perform K-fold cross-validation and hyper-parameter tuning on training data

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate values for C, the SVM regularization parameter
param_grid = {'C': [1, 10]}

# Base estimator for the search: RBF-kernel SVM with a fixed seed
svc = SVC(
    kernel='rbf',
    probability=True,
    random_state=1234,
)

# 2-fold cross-validated grid search over C
grid_search_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=2,
    verbose=2,
)
grid_search_svc.fit(X_train_pad, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ................................................C=1; total time=20.6min


3.2 Get the best Model

In [None]:
# Get the best model found by the cross-validated search.
# The search above was created without a refit argument, and GridSearchCV's
# default (refit=True) already refits best_estimator_ on the whole training set
# passed to fit(). Fitting it again on the same data with the same seed would
# only repeat a very slow SVC training run, so the extra fit is dropped.
best_model_svc = grid_search_svc.best_estimator_
best_model_svc

In [None]:
from sklearn.svm import SVC

# Train an RBF-kernel SVM with a fixed C=10 on the training split.
# NOTE(review): this model, not best_model_svc, is what the evaluation cells use;
# C=10 is hard-coded rather than read from the grid search - confirm it matches
# the search result.
svc_settings = dict(C=10, kernel='rbf', probability=True, random_state=1234)
svc = SVC(**svc_settings).fit(X_train_pad, y_train)

3.3 Evaluate

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score 

# Predict labels for the validation split with the fitted SVM
predicted_valid_label_svc = svc.predict(X_dev_pad)

# Share of validation sentences whose predicted label matches the true label
svc_val_acc = accuracy_score(y_true=y_dev, y_pred=predicted_valid_label_svc)
print(
    'Accuracy on Validation set {:.5f} ({:.3f}%)'.format(
        svc_val_acc, svc_val_acc * 100
    )
)

In [None]:
# Predict labels for the test split with the fitted SVM
predicted_test_label_svc = svc.predict(X_test_pad)

# Share of test sentences whose predicted label matches the true label
svc_test_acc = accuracy_score(y_true=y_test, y_pred=predicted_test_label_svc)
print(
    'Accuracy on Test set {:.5f} ({:.3f}%)'.format(
        svc_test_acc, svc_test_acc * 100
    )
)

**References**


https://neptune.ai/blog/tokenization-in-nlp

CSCI5750 - Machine Learning Final Project (Nichapha Manoonwong)