### **Sentiment Classification**

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
import random
from nltk.stem.snowball import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import re # for regular expressions

import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv("sentiment_train.csv", delimiter="\t")
data.head()


Unnamed: 0,"sentiment,text"
0,"1,The Da Vinci Code book is just awesome."
1,"1,this was the first clive cussler i've ever r..."
2,"1,i liked the Da Vinci Code a lot."
3,"1,i liked the Da Vinci Code a lot."
4,"1,I liked the Da Vinci Code but it ultimatly d..."


In [20]:
plt.figure(figsize=(6, 5))
ax = sn.countplot(x="sentiment", data=data)

ValueError: Could not interpret value `sentiment` for `x`. An entry with this name does not appear in `data`.

<Figure size 600x500 with 0 Axes>

In [20]:
for p in ax.patches:
  ax.annotate(p.get_height(), (p.get_x()+0.1,
  p.get_height()+50))

### **Count Vectors Model**

In [24]:
count_vectorizer = CountVectorizer()
feature_vector = count_vectorizer.fit(data.text)
features = feature_vector.get_feature_names()
print("Total number of features", len(features))

In [None]:
random.sample(features, 10)
data_features = count_vectorizer.transform(data.text)
type(data_features)
data_features.shape

In [None]:
data_features.getnnz()
print("Density of matrix: ", data_features.getnnz() * 100 / (data_features.shape[0] * data_features.shape[1]))

In [None]:
data_df = pd.DataFrame(data_features.todense())
data_df.columns = features

### **Removing Low-frequency Words**

In [None]:
features_counts = np.sum(data_features.toarray(), axis=0)
feat_count_df = pd.DataFrame(dict(features=features, counts=features_counts))

plt.figure(figsize=(12, 5))
plt.hist(feat_count_df.counts, bins=50, range=(0, 2000))
plt.xlabel("Frequency of words")
plt.ylabel("Density")

In [27]:
len(feat_count_df[feat_count_df.counts == 1])


NameError: name 'feat_count_df' is not defined

In [None]:
# Initialize the CountVectorizer
count_vectorizer = CountVectorizer(max_features=1000)
feature_vector = count_vectorizer.fit(data.text)
features = feature_vector.get_feature_names()
train_ds_features = count_vectorizer.transform(data.text)

features_counts = np.sum(train_ds_features.toarray(), axis = 0)
feature_counts = pd.DataFrame(dict(features = features,
counts = features_counts))
feature_counts.sort_values('counts', ascending = False)[0:15]

In [None]:
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS
#Printing first few stop words
print("Few stop words: ", list(my_stop_words)[0:10])

In [None]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(['harry', 'potter',
'code', 'vinci', 'da','harry', 'mountain', 'movie', 'movies'])

### **Creating Count Vectors**

In [None]:
count_vectorizer = CountVectorizer(stop_words = my_stop_words, max_features = 1000)
feature_vector = count_vectorizer.fit(data.text)

train_ds_features = count_vectorizer.transform(data.text)
features = feature_vector.get_feature_names()
features_counts = np.sum(train_ds_features.toarray(), axis = 0)
feature_counts = pd.DataFrame(dict(features = features, counts = features_counts))
feature_counts.sort_values("counts", ascending = False)[0:15]

In [None]:
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
  stemmed_words = [stemmer.stem(w) for w in analyzer(doc)]
  non_stop_words = [word for word in stemmed_words if not in my_stop_words]
  return non_stop_words

count_vectorizer = CountVectorizer(analyzer=stemmed_words, max_features = 1000)
feature_vector = count_vectorizer.fit(data.text)
train_ds_features = count_vectorizer.transform(data.text)
features = feature_vector.get_feature_names()
features_counts = np.sum(train_ds_features.toarray(), axis = 0)
feature_counts = pd.DataFrame(dict(features = features, counts = features_counts))
feature_counts.sort_values("counts", ascending = False)[0:15]

In [None]:
train_ds_df = pd.DataFrame(train_ds_features.todense())
train_ds_df.columns = features
train_ds_df['sentiment'] = data.sentiment

sn.barplot(x = 'sentiment', y = 'awesome', data = train_ds_df, estimator=sum);

In [None]:
sn.barplot(x = 'sentiment', y = 'realli', data = train_ds_df, estimator=sum)

In [None]:
sn.barplot(x = 'sentiment', y = 'hate', data = train_ds_df,
estimator=sum)

### **Naive-Bayes**

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data.features, data.sentiment, test_size=0.3, random_state=42)

nb_clf = BernoulliNB()
nb_clf.fit(x_train.toarray(), y_train)

test_ds_predicted = nb_clf.predict(x_test.toarray())
print(metrics.classification_report(y_test, test_ds_predicted))

In [None]:
cm = metrics.confusion_matrix(y_test, test_ds_predicted)
sn.heatmap(cm, annot=True, fmt='.2f')

### **TF-IDF Vectorizer**

In [None]:
tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words, max_features = 1000)
feature_vector = tfidf_vectorizer.fit(data.text)
train_ds_features = tfidf_vectorizer.transform(data.text)
features = feature_vector.get_feature_names()

### **Guassian Naive Bayes**

In [None]:
train_X, test_X, train_y, test_y = train_test_split(train_ds_features, data.sentiment, test_size = 0.3, random_state = 42)

nb_clf = GaussianNB()
nb_clf.fit(train_X.toarray(), train_y)

test_ds_predicted = nb_clf.predict(test_X.toarray())
print(metrics.classification_report(test_y, test_ds_predicted))

### **n-Grams**

In [None]:
stemmer = PorterStemmer()

def get_stemmed_tokens(doc):
  all_tokens = [word for word in nltk.word_tokenize(doc)]
  clean_tokens = []

In [None]:
for each_token in all_tokens:
  if re.search('[a-zA-Z]', each_token):
  clean_tokens.append(each_token)
  stemmed_tokens = [stemmer.stem(t) for t in clean_tokens]
  return stemmed_tokens

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=500,
stop_words='english',
tokenizer=get_stemmed_okens,
ngram_range=(1,2))
feature_vector = tfidf_vectorizer.fit(data.text)
train_ds_features = tfidf_vectorizer.transform(data.text)
features = feature_vector.get_feature_names()

In [None]:
train_X, test_X, train_y, test_y = train_test_split(train_ds_features, data.sentiment, test_size = 0.3, random_state = 42)
nb_clf = BernoulliNB()
nb_clf.fit(train_X.toarray(), train_y)
tst_ds_predicted = nb_clf.predict(test_X.toarray())
print(metrics.classification_report(test_y, test_ds_predicted))