# Sentiment Classification

In [11]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text,sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [12]:
# conda install -c conda-forge xgboost
import pandas as pd
data= pd.read_csv("train.tsv",sep="\t")

### Sentiment Scoring

In [17]:
data.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,Negative
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [18]:
data.Sentiment.unique()

array(['Negative', 2, 'Positive'], dtype=object)

In [19]:
# We'll make binary classification >> positive(3-4) - negative(1-0)

In [22]:
data["Sentiment"].replace(0,value= "Negative", inplace= True)
data["Sentiment"].replace(1,value= "Negative", inplace= True)
data["Sentiment"].replace(3,value= "Positive", inplace= True)
data["Sentiment"].replace(4,value= "Positive", inplace= True)

In [23]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,Negative
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [24]:
# To divide classes better, we'll leave class "2" out
data = data[(data.Sentiment=="Negative")|(data.Sentiment=="Positive")]

In [25]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,Negative
21,22,1,good for the goose,Positive
22,23,1,good,Positive
33,34,1,"the gander , some of which occasionally amuses...",Negative
46,47,1,amuses,Positive


In [26]:
data.Sentiment.unique()

array(['Negative', 'Positive'], dtype=object)

In [27]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,34345,34345,34345
Positive,42133,42133,42133


#### Preprocessing

In [28]:
df = pd.DataFrame()
df["text"]= data["Phrase"]
df["label"]= data["Sentiment"]

In [29]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,Negative
21,good for the goose,Positive
22,good,Positive
33,"the gander , some of which occasionally amuses...",Negative
46,amuses,Positive


In [31]:
# Big-Small Transmission
df["text"]= df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Punctuation Transmission
df["text"]= df["text"].str.replace("[^\w\s]",'')
# Numbers
df["text"]= df["text"].str.replace("\d",'')
# Stopwords
import nltk
nltk.download('wordnet')
# nltk.download("stopwords")
from nltk.corpus import stopwords
sw= stopwords.words("english")
df["text"]= df["text"].apply(lambda x: " ".join(x.lower() for x in x.split() if x not in sw))
# Delete low frequences
delete = pd.Series(' '.join(df["text"]).split()).value_counts()[-1000:]
df["text"]= df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()if x not in delete))
# Lemmitization
from textblob import Word
# nltk.download("wordnet")
df["text"]= df["text"].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


### Feature Engineering : Feature Production

- Count Vectors : it takes unique values as columns
- TF-IDF Vectors(words,characters,n-grams)
- Word Embeddings: Density vectors for words. İt is trainable.

TF(t) = (Observation frequency of a "t" term in a document) / (All term numbers in a document) - Term Frequency (Standartization)

IDF(t) = log_e(All documents number) / (Document numbers which has t term inside) 

Corpus: Data Structure

Document: Each Row

#### Test-Train

In [32]:
train_x,test_x,train_y,test_y = model_selection.train_test_split(df["text"],df["label"],random_state=1)

In [33]:
train_x.head()

118788    present fascinating glimpse urban life class w...
89514     hey everybody wanna watch movie guy dressed ch...
86857                               incredible number story
140626                         ultimately disappoint action
153243                                              unified
Name: text, dtype: object

In [34]:
# Transmission to independent variable

encoder = preprocessing.LabelEncoder()
train_y=encoder.fit_transform(train_y)
test_y=encoder.fit_transform(test_y)

In [35]:
train_y[:5]

array([1, 0, 1, 0, 1])

In [36]:
test_y[:5]

array([1, 0, 1, 0, 0])

#### Count Vectors : Each row is a comment. Each column is frequency of words in all documents. It gives us an sparse matrix

In [37]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [38]:
x_train_count = vectorizer.transform(train_x)
x_test_count= vectorizer.transform(test_x)

In [39]:
vectorizer.get_feature_names()[:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

In [40]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### TF-IDF

#### word level

In [41]:
tf_idf_word_vectorizer = TfidfVectorizer() # Word level
tf_idf_word_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [42]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [43]:
tf_idf_word_vectorizer.get_feature_names()[:5]

['aaliyah', 'abagnale', 'abandon', 'abandoned', 'abbass']

In [44]:
x_train_tf_idf_word.toarray() # Burada float geliyor!!!!

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### n-gram level tf-idf

In [45]:
n_gram_vectorizer= TfidfVectorizer(ngram_range=(2,3))
n_gram_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(2, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [46]:
x_train_tf_idf_ngram = n_gram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = n_gram_vectorizer.transform(test_x)

#### Character Level tf-idf

In [47]:
chars_vectorizer= TfidfVectorizer(analyzer="char",ngram_range=(2,3))
chars_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(2, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [48]:
x_train_tf_idf_chars = chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = chars_vectorizer.transform(test_x)

# Sentiment Classification with Machine Learning

## Logistic Regression

In [49]:
log= linear_model.LogisticRegression()
log_model = log.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(log_model,x_test_count,test_y,cv=10).mean()

print("Count Vectors Accuracy Rate: {}".format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Count Vectors Accuracy Rate: 0.8369769874476987


In [50]:
log= linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(log_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv=10).mean()

print("Word level tf-idf Accuracy Rate: {}".format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Word level tf-idf Accuracy Rate: 0.8338389121338912


In [51]:
log= linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(log_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv=10).mean()

print("Ngram Accuracy Rate: {}".format(accuracy))

Ngram Accuracy Rate: 0.7483786610878662


In [52]:
log= linear_model.LogisticRegression()
log_model = log.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(log_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv=10).mean()

print("Character tf-idf Accuracy Rate: {}".format(accuracy))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Character tf-idf Accuracy Rate: 0.7811715481171547


In [53]:
print("Count Vectors Accuracy Rate: 0.8367167670488159\nWord level tf-idf Accuracy Rate: 0.8327420082078296\nNgram Accuracy Rate: 0.74822130516206166\nCharacter tf-idf Accuracy Rate: 0.7805420094890143")

Count Vectors Accuracy Rate: 0.8367167670488159
Word level tf-idf Accuracy Rate: 0.8327420082078296
Ngram Accuracy Rate: 0.74822130516206166
Character tf-idf Accuracy Rate: 0.7805420094890143


##### The highest accuracy is in Count Vectors for Logistic Regression

# Naive Bayes

In [83]:
nb= naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_count,
                                           test_y,
                                           cv=10).mean()

print("Count Vectors Accuracy Rate: {}".format(accuracy))

Count Vectors Accuracy Rate: 0.8328461453102637


In [84]:
nb= naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv=10).mean()

print("Word Level Accuracy Rate: {}".format(accuracy))

Word Level Accuracy Rate: 0.8345721692104104


In [85]:
nb= naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv=10).mean()

print("Ngram TF-IDF Accuracy Rate: {}".format(accuracy))

Ngram TF-IDF Accuracy Rate: 0.7685661961697254


In [87]:
nb= naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv=10).mean()

print("Character Level Accuracy Rate: {}".format(accuracy))

Character Level Accuracy Rate: 0.7559629594818172


In [91]:
print("NAIVE BAYES ACCURACY RATES\nCount Vectors Accuracy Rate: 0.8328461453102637\nWord Level Accuracy Rate: 0.8345721692104104\nNgram TF-IDF Accuracy Rate: 0.7685661961697254\nCharacter Level Accuracy Rate: 0.7559629594818172")

NAIVE BAYES ACCURACY RATES
Count Vectors Accuracy Rate: 0.8328461453102637
Word Level Accuracy Rate: 0.8345721692104104
Ngram TF-IDF Accuracy Rate: 0.7685661961697254
Character Level Accuracy Rate: 0.7559629594818172


## RANDOM FORESTS

In [92]:
rf = ensemble.RandomForestClassifier()
rf_model= rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_count,
                                           test_y,
                                           cv=10).mean()

print("Count Vectors Accuracy Rate: {}".format(accuracy))



Count Vectors Accuracy Rate: 0.8123958866751362


In [93]:
rf = ensemble.RandomForestClassifier()
rf_model= rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv=10).mean()

print("Word Level Accuracy Rate: {}".format(accuracy))



Word Level Accuracy Rate: 0.8100929897053583


In [94]:
rf = ensemble.RandomForestClassifier()
rf_model= rf.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv=10).mean()

print("Count Vectors Accuracy Rate: {}".format(accuracy))



Count Vectors Accuracy Rate: 0.7466005678997123


In [96]:
rf = ensemble.RandomForestClassifier()
rf_model= rf.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv=10).mean()

print("Character Level Accuracy Rate: {}".format(accuracy))



Character Level Accuracy Rate: 0.76736263853864


In [104]:
print("RANDOM FORESTS ACCURACY SCORES\nCount Vectors Accuracy Rate: 0.8123958866751362\nWord Level Accuracy Rate: 0.8100929897053583\n0.7466005678997123\n0.76736263853864")

TypeError: 'XGBClassifier' object is not callable

## XGBOOST

In [None]:
xgb = xgboost.XGBClasifier()
xgb_model= rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv=10).mean()

print("Count Vectors Accuracy Rate: {}".format(accuracy))

In [None]:
xgb = xgboost.XGBClassifier()
xgb_model= rf.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv=10).mean()

print("Word Level Accuracy Rate: {}".format(accuracy))

In [None]:
print = xgboost.XGBClassifier()
xgb_model= rf.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv=10).mean()

print("Character Level Accuracy Rate: {}".format(accuracy))

#### Let's get Logistic - Count Vectors 

In [107]:
log_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [109]:
### log_model.predict("yes i like this film") >> it doesn't work!!!

In [123]:
new_comment = pd.Series("this film is very nice and good i like it") # Positive

In [124]:
v = CountVectorizer()
v.fit(train_x)
new_comment= v.transform(new_comment)

In [125]:
log_model.predict(new_comment)

array([1])

In [126]:
new_comment = pd.Series("no not good look at that shit very bad") # Negative

In [127]:
v = CountVectorizer()
v.fit(train_x)
new_comment= v.transform(new_comment)
log_model.predict(new_comment)

array([0])

In [128]:
new_comment = pd.Series("maybe it is good maybe it is bad") # Negative

In [129]:
v = CountVectorizer()
v.fit(train_x)
new_comment= v.transform(new_comment)
log_model.predict(new_comment)

array([0])