# Sentiment Analysis and Classification Models

**Create machine learning models that predict the sentiment (positive or negative) of the comments.**

In [None]:
# import necessary modules:

from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by pandas and
# scikit-learn in later cells — confirm that blanket suppression is intended.
filterwarnings('ignore')

In [None]:
import pandas as pd

# Read the tab-separated training file into a DataFrame and preview its first rows.
data = pd.read_csv(
    "train.tsv",
    sep="\t",
)
data.head(n=5)

In [None]:
# Collapse the numeric sentiment scores into two classes:
# 0 and 1 become "negative", 3 and 4 become "positive". Score 2 is left untouched here.
#
# The result is assigned back to the column instead of calling
# `data["Sentiment"].replace(..., inplace=True)`: an in-place edit made through a selected
# column is a chained assignment and is not guaranteed to update `data` itself
# (it does not under pandas Copy-on-Write).
sentiment_labels = {0: "negative", 1: "negative", 3: "positive", 4: "positive"}
data["Sentiment"] = data["Sentiment"].replace(sentiment_labels)

data.head()

In [None]:
# Keep only the rows labelled "negative" or "positive" so the task is strictly binary;
# every other row (the ones still holding score 2) is dropped.
is_negative = data.Sentiment == "negative"
is_positive = data.Sentiment == "positive"
data = data[is_negative | is_positive]
data.head()

In [None]:
# Per-class count of non-null entries in each column (shows the class balance).
data.groupby(by="Sentiment").count()

In [None]:
# Working frame with just the two columns the models need:
# "text" (the phrase) and "label" (the binary sentiment).
df = pd.DataFrame({
    "text": data["Phrase"],
    "label": data["Sentiment"],
})
df.head()

## Text Preprocessing

In [None]:
# Text clean-up pipeline applied to df['text'], in order:
# lowercase -> strip punctuation -> strip digits -> drop stop words -> drop rare words -> lemmatize.
import nltk
# One-time corpus downloads (uncomment on a fresh environment):
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from textblob import Word

# uppercase - lowercase conversion (also collapses runs of whitespace to single spaces):
df['text'] = df['text'].apply(lambda phrase: " ".join(token.lower() for token in phrase.split()))

# punctuations:
# regex=True is required — since pandas 2.0 `str.replace` treats the pattern as a literal
# string by default, which would silently leave the text unchanged. Raw strings avoid
# invalid-escape warnings for \w, \s and \d.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

# numbers:
df['text'] = df['text'].str.replace(r'\d', '', regex=True)

# stopwords:
sw = stopwords.words('english')
stop_words = set(sw)  # set membership is O(1); `sw` is kept as the original list
df['text'] = df['text'].apply(
    lambda phrase: " ".join(token for token in phrase.split() if token not in stop_words)
)

# deleting rare ones:
# value_counts() sorts by descending frequency, so the last 1000 entries are the rarest words.
sil = pd.Series(' '.join(df['text']).split()).value_counts().iloc[-1000:]
rare_words = set(sil.index)  # the words are the index labels of `sil`
df['text'] = df['text'].apply(
    lambda phrase: " ".join(token for token in phrase.split() if token not in rare_words)
)

# lemmatization:
df['text'] = df['text'].apply(
    lambda phrase: " ".join(Word(token).lemmatize() for token in phrase.split())
)

## Feature Engineering

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings

$$\mathrm{TF}(t) = \frac{\text{number of times } t \text{ occurs in the document}}{\text{total number of terms in the document}}$$

$$\mathrm{IDF}(t) = \log_e\left(\frac{\text{total number of documents}}{\text{number of documents containing } t}\right)$$


In [None]:
# Preview the cleaned text next to its label.
df.head(n=5)

In [None]:
# Show the first row (by position) of the working frame: its text and its label.
df.iloc[0]

In [None]:
# Split texts and labels into train and test parts.
# random_state is pinned to 1 so the split is reproducible; the split proportions are
# train_test_split's defaults.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df["text"],
    df["label"],
    random_state=1,
)

In [None]:
# First five training labels.
train_y[:5]

In [None]:
# Label encoder used below to convert the string class labels into integer codes.
encoder = preprocessing.LabelEncoder()

In [None]:
# Convert the string labels to integer codes for train_y and test_y.
# The encoder is fitted on the training labels only and that same mapping is then applied
# to the test labels. Refitting on the test labels (fit_transform) could silently yield a
# different label-to-integer mapping if the two splits did not contain the same classes.
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

In [None]:
# First five encoded training labels.
train_y[:5]

In [None]:
# First five encoded test labels.
test_y[:5]

### Count Vectors

In [None]:
# Bag-of-words vectorizer; its vocabulary is learned from the training texts only.
vectorizer = CountVectorizer()
vectorizer.fit(raw_documents=train_x)

In [None]:
# Turn the train and test texts into count matrices using the fitted vocabulary.
x_train_count, x_test_count = (
    vectorizer.transform(texts) for texts in (train_x, test_x)
)

In [None]:
# Peek at the first few vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()[0:5]

In [None]:
# Preview a few rows only: densifying the whole sparse matrix would allocate a
# (documents x vocabulary) array just to display it.
x_train_count[:5].toarray()

### TF-IDF

#### Word-level TF-IDF

In [None]:
# Word-level TF-IDF vectorizer, fitted on the training texts only.
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(raw_documents=train_x)

In [None]:
# Project the train and test texts onto the fitted word-level TF-IDF space.
x_train_tf_idf_word, x_test_tf_idf_word = (
    tf_idf_word_vectorizer.transform(texts) for texts in (train_x, test_x)
)

In [None]:
# Peek at the first few vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
tf_idf_word_vectorizer.get_feature_names_out()[0:5]

In [None]:
# Preview a few rows only: densifying the whole sparse matrix would allocate a
# (documents x vocabulary) array just to display it.
x_train_tf_idf_word[:5].toarray()

#### N-gram-level TF-IDF

In [None]:
# TF-IDF over word 2-grams and 3-grams, fitted on the training texts only.
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 3))
tf_idf_ngram_vectorizer.fit(raw_documents=train_x)

In [None]:
# Project the train and test texts onto the fitted n-gram TF-IDF space.
x_train_tf_idf_ngram, x_test_tf_idf_ngram = (
    tf_idf_ngram_vectorizer.transform(texts) for texts in (train_x, test_x)
)

#### Character-level TF-IDF

In [None]:
# TF-IDF over character 2-grams and 3-grams, fitted on the training texts only.
tf_idf_chars_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 3),
)
tf_idf_chars_vectorizer.fit(raw_documents=train_x)

In [None]:
# Project the train and test texts onto the fitted character-level TF-IDF space.
x_train_tf_idf_chars, x_test_tf_idf_chars = (
    tf_idf_chars_vectorizer.transform(texts) for texts in (train_x, test_x)
)

## Sentiment Classification with Machine Learning

### Logistic Regression

In [None]:
# Logistic regression on count vectors, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
log = linear_model.LogisticRegression()
log_model = log.fit(x_train_count, train_y)
accuracy = metrics.accuracy_score(test_y, log_model.predict(x_test_count))

print("Count Vectors Accuracy:", accuracy)

In [None]:
# Logistic regression on word-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
log = linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_word, train_y)
accuracy = metrics.accuracy_score(test_y, log_model.predict(x_test_tf_idf_word))

print("Word-Level TF-IDF Accuracy:", accuracy)

In [None]:
# Logistic regression on n-gram TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
log = linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_ngram, train_y)
accuracy = metrics.accuracy_score(test_y, log_model.predict(x_test_tf_idf_ngram))

print("N-GRAM TF-IDF Accuracy:", accuracy)

In [None]:
# Logistic regression on character-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
log = linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_chars, train_y)
accuracy = metrics.accuracy_score(test_y, log_model.predict(x_test_tf_idf_chars))

print("CHARLEVEL Accuracy:", accuracy)

### Naive Bayes

In [None]:
# Multinomial naive Bayes on count vectors, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count, train_y)
accuracy = metrics.accuracy_score(test_y, nb_model.predict(x_test_count))

print("Count Vectors Accuracy:", accuracy)

In [None]:
# Multinomial naive Bayes on word-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word, train_y)
accuracy = metrics.accuracy_score(test_y, nb_model.predict(x_test_tf_idf_word))

print("Word-Level TF-IDF Accuracy:", accuracy)

In [None]:
# Multinomial naive Bayes on n-gram TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram, train_y)
accuracy = metrics.accuracy_score(test_y, nb_model.predict(x_test_tf_idf_ngram))

print("N-GRAM TF-IDF Accuracy:", accuracy)

In [None]:
# Multinomial naive Bayes on character-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars, train_y)
accuracy = metrics.accuracy_score(test_y, nb_model.predict(x_test_tf_idf_chars))

print("CHARLEVEL Accuracy:", accuracy)

### Random Forests

In [None]:
# Random forest on count vectors, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = metrics.accuracy_score(test_y, rf_model.predict(x_test_count))

print("Count Vectors Accuracy:", accuracy)

In [None]:
# Random forest on word-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word, train_y)
accuracy = metrics.accuracy_score(test_y, rf_model.predict(x_test_tf_idf_word))

print("Word-Level TF-IDF Accuracy:", accuracy)

In [None]:
# Random forest on n-gram TF-IDF features, scored on the held-out test set.
# The forest created here is the one that is fitted (the previous `loj.fit(...)` referenced
# an undefined name). cross_val_score clones the estimator and refits it on folds of the
# data it receives, so it is not used to score the trained model on the test matrix.
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_ngram, train_y)
accuracy = metrics.accuracy_score(test_y, rf_model.predict(x_test_tf_idf_ngram))

print("N-GRAM TF-IDF Accuracy:", accuracy)

In [None]:
# Random forest on character-level TF-IDF features, scored on the held-out test set.
# The forest created here is the one that is fitted (the previous `loj.fit(...)` referenced
# an undefined name). cross_val_score clones the estimator and refits it on folds of the
# data it receives, so it is not used to score the trained model on the test matrix.
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_chars, train_y)
accuracy = metrics.accuracy_score(test_y, rf_model.predict(x_test_tf_idf_chars))

print("CHARLEVEL Accuracy:", accuracy)

### XGBoost

In [None]:
# XGBoost classifier on count vectors, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = metrics.accuracy_score(test_y, xgb_model.predict(x_test_count))

print("Count Vectors Accuracy:", accuracy)

In [None]:
# XGBoost classifier on word-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word, train_y)
accuracy = metrics.accuracy_score(test_y, xgb_model.predict(x_test_tf_idf_word))

print("Word-Level TF-IDF Accuracy:", accuracy)

In [None]:
# XGBoost classifier on n-gram TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngram, train_y)
accuracy = metrics.accuracy_score(test_y, xgb_model.predict(x_test_tf_idf_ngram))

print("N-GRAM TF-IDF Accuracy:", accuracy)

In [None]:
# XGBoost classifier on character-level TF-IDF features, scored on the held-out test set.
# cross_val_score clones the estimator and refits it on folds of the data it receives, so
# passing the test matrix would discard the training fit and never evaluate this model.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars, train_y)
accuracy = metrics.accuracy_score(test_y, xgb_model.predict(x_test_tf_idf_chars))

print("CHARLEVEL Accuracy:", accuracy)

**Choose a model; I choose logistic regression:**

In [None]:
# Display the logistic-regression model currently bound to `log_model`.
# NOTE(review): when the cells run top to bottom this is the most recent fit, i.e. the one
# trained on character-level TF-IDF features, not the count-vector one — confirm which
# model is meant to be the chosen one.
log_model

In [None]:
# A single unseen comment, wrapped in a one-element Series so it can be vectorized.
new_comment = pd.Series(["this film is very nice i like it"])

In [None]:
# Fit a count vectorizer on the training texts and use it to encode the new comment.
# Note that `new_comment` is rebound from the raw text to its count-vector representation.
v = CountVectorizer().fit(train_x)
new_comment = v.transform(new_comment)

In [None]:
# `new_comment` holds count-vector features at this point, but `log_model` was last fitted
# on character-level TF-IDF features, so calling it here fails on a feature-count mismatch.
# Fit a logistic regression on the training count vectors and predict with that instead.
# The result is the integer class code produced by the label encoder
# (decode it with encoder.inverse_transform if the string label is wanted).
count_log_model = linear_model.LogisticRegression().fit(x_train_count, train_y)
count_log_model.predict(new_comment)