In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Column labels applied to the CSV, in file order. Because `names=` is given
# without `header=`, pandas treats the first row of the file as data.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
dataset = pd.read_csv(
    'twitter_sentiment_data.csv',
    names=DATASET_COLUMNS,
    encoding="ISO-8859-1",
)

In [None]:
# Peek at the first five rows of the freshly loaded frame.
dataset.head(5)

In [None]:
# Summary statistics; include=None is the pandas default and, on a frame with
# mixed dtypes, restricts the summary to the numeric columns.
dataset.describe(include=None)

In [None]:
# Keep only the columns that are not dropped here ("target" and "text").
dataset = dataset.drop(["ids", "date", "flag", "user"], axis="columns")
dataset.head(5)

In [None]:
# Numeric sentiment code -> human-readable label.
decode_map = dict(zip((0, 2, 4), ("NEGATIVE", "NEUTRAL", "POSITIVE")))


def decode_sentiment(label):
    """Translate a numeric sentiment code into its label name.

    ``label`` is coerced with ``int()`` first, so ``4``, ``4.0`` and ``"4"``
    all resolve to the same entry. A code that is not a key of ``decode_map``
    raises ``KeyError``.
    """
    code = int(label)
    return decode_map[code]

In [None]:
%%time
# Replace the numeric codes in "target" with their label names.
dataset["target"] = dataset["target"].apply(decode_sentiment)

In [None]:
# Confirm the decoded labels on the first five rows.
dataset.head(n=5)

In [None]:
# Non-null row count of every remaining column, per sentiment label.
dataset.groupby('target').count()

In [None]:
# Number of tweets per sentiment label.
target_cnt = Counter(dataset.target)

# Materialise the dict views as lists so the bar positions and heights are
# plain sequences regardless of the matplotlib version in use.
label_names = list(target_cnt.keys())
label_counts = list(target_cnt.values())

plt.bar(label_names, label_counts)
plt.title("Dataset labels distribution")
plt.xlabel("Sentiment label")
plt.ylabel("Number of tweets")
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch the stop-word corpus only when it is not already installed locally,
# so re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
# Text-cleaning setup: stop words, stemmer, cleaning pattern and the
# preprocess() helper applied to every tweet.

import re

# NLTK's English stop-word list, kept as a list under its original name.
stop_words = stopwords.words("english")
# Set copy used for the per-token membership test in preprocess(); a set
# lookup avoids scanning the whole list for every token. It is built once
# here, so rebuild it if `stop_words` is modified later.
_stop_word_set = set(stop_words)
stemmer = SnowballStemmer("english")

# Raw string so that `\S` reaches the regex engine unchanged instead of being
# parsed as an (invalid) Python string escape. Alternatives, in order:
# @mentions, http/https links, a shorter link-prefix form, and any run of
# characters that are not ASCII letters or digits.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(text, stem=False):
    """Normalise one tweet into a space-separated string of kept tokens.

    Steps: lower-case, fuse contracted negations, replace mentions, links and
    non-alphanumeric runs with a space, drop stop words, optionally stem.

    Parameters
    ----------
    text : object
        The tweet; converted with ``str()`` before processing.
    stem : bool, default False
        When true, each kept token is passed through the Snowball stemmer.

    Returns
    -------
    str
        The kept tokens joined by single spaces (empty string if none remain).
    """
    lowered = str(text).lower()
    # Fuse "n't" into the word before punctuation is stripped (for example
    # "don't" becomes "donot"). Previously this substitution was computed but
    # its result was never used, so the apostrophe was simply replaced by a
    # space and the contraction was split into fragments.
    lowered = re.sub("n't", "not", lowered)
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', lowered).strip()

    kept = (token for token in cleaned.split() if token not in _stop_word_set)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

In [None]:
# Run every tweet through preprocess() with its default arguments.
dataset["text"] = dataset["text"].apply(preprocess)

In [None]:
# Row subsets for the two sentiment classes analysed below.
neg_tweets = dataset.loc[dataset['target'].eq("NEGATIVE")]
pos_tweets = dataset.loc[dataset['target'].eq("POSITIVE")]

In [None]:
# Preview each class subset with the notebook's rich table rendering rather
# than printing both complete frames as plain text on a single output.
display(neg_tweets.head(), pos_tweets.head())

In [None]:
def _word_frequencies(tweets):
    """Count whitespace-separated words in ``tweets.text``, most frequent first."""
    words = tweets.text.str.split(expand=True).stack()
    return words.value_counts()


def _show_top_words(words, counts, title):
    """Draw a bar chart of the first ten word/count pairs under ``title``."""
    plt.bar(words[:10], counts[:10])
    plt.title(title)
    plt.show()


neg = _word_frequencies(neg_tweets)
pos = _word_frequencies(pos_tweets)

# Parallel lists of words and their counts, in descending count order.
values_neg, counts_neg = neg.index.tolist(), neg.tolist()
values_pos, counts_pos = pos.index.tolist(), pos.tolist()

_show_top_words(values_neg, counts_neg, 'Top 10 Negative Words')
_show_top_words(values_pos, counts_pos, 'Top 10 Positive Words')

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

cv = CountVectorizer(stop_words='english', binary=False, ngram_range=(1,1))


def _ranked_term_totals(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents`` and rank its terms by column total.

    Returns a tuple ``(matrix, ranked)`` where ``matrix`` is the fitted
    document-term matrix and ``ranked`` is a list of ``(term, total)`` tuples
    sorted by total, largest first.
    """
    matrix = vectorizer.fit_transform(documents)
    # Read the vocabulary immediately: the next fit_transform() on the same
    # vectorizer replaces it, and names fetched afterwards no longer line up
    # with this matrix's columns.
    # Newer scikit-learn exposes get_feature_names_out(); older releases only
    # have get_feature_names(). Use whichever this installation provides.
    names_getter = (
        getattr(vectorizer, "get_feature_names_out", None)
        or vectorizer.get_feature_names
    )
    terms = [str(term) for term in names_getter()]
    totals = np.asarray(matrix.sum(axis=0)).ravel().tolist()
    ranked = sorted(zip(terms, totals), key=lambda pair: pair[1], reverse=True)
    return matrix, ranked


# Each class is fitted separately, so each gets its own vocabulary and totals.
neg_cv, list_freq_neg = _ranked_term_totals(cv, neg_tweets['text'].tolist())
pos_cv, list_freq_pos = _ranked_term_totals(cv, pos_tweets['text'].tolist())

cv_words_neg = [pair[0] for pair in list_freq_neg]
cv_counts_neg = [pair[1] for pair in list_freq_neg]

cv_words_pos = [pair[0] for pair in list_freq_pos]
cv_counts_pos = [pair[1] for pair in list_freq_pos]

plt.bar(cv_words_neg[0:10], cv_counts_neg[0:10])
plt.xticks(rotation='vertical')
plt.title('Top Negative Words With CountVectorizer')
plt.show()

plt.bar(cv_words_pos[0:10], cv_counts_pos[0:10])
plt.xticks(rotation='vertical')
plt.title('Top Positive Words With CountVectorizer')
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer tokenises raw text and accepts stop_words / binary /
# ngram_range. TfidfTransformer only re-weights an existing count matrix and
# takes none of these arguments, so constructing it this way raised TypeError.
tv = TfidfVectorizer(stop_words='english', binary=False, ngram_range=(1,3))


def _ranked_tfidf_totals(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents`` and rank its terms by summed weight.

    Returns a tuple ``(matrix, ranked)`` where ``matrix`` is the fitted
    document-term matrix and ``ranked`` is a list of ``(term, total)`` tuples
    sorted by total, largest first.
    """
    matrix = vectorizer.fit_transform(documents)
    # Read the vocabulary immediately: the next fit_transform() on the same
    # vectorizer replaces it, and names fetched afterwards no longer line up
    # with this matrix's columns.
    # Newer scikit-learn exposes get_feature_names_out(); older releases only
    # have get_feature_names(). Use whichever this installation provides.
    names_getter = (
        getattr(vectorizer, "get_feature_names_out", None)
        or vectorizer.get_feature_names
    )
    terms = [str(term) for term in names_getter()]
    totals = np.asarray(matrix.sum(axis=0)).ravel().tolist()
    ranked = sorted(zip(terms, totals), key=lambda pair: pair[1], reverse=True)
    return matrix, ranked


# Each class is fitted separately, so each gets its own vocabulary and totals.
neg_tv, list_freq_neg_tv = _ranked_tfidf_totals(tv, neg_tweets['text'].tolist())
pos_tv, list_freq_pos_tv = _ranked_tfidf_totals(tv, pos_tweets['text'].tolist())

cv_words_neg_tv = [pair[0] for pair in list_freq_neg_tv]
cv_counts_neg_tv = [pair[1] for pair in list_freq_neg_tv]

cv_words_pos_tv = [pair[0] for pair in list_freq_pos_tv]
cv_counts_pos_tv = [pair[1] for pair in list_freq_pos_tv]

plt.bar(cv_words_neg_tv[0:10], cv_counts_neg_tv[0:10])
plt.xticks(rotation='vertical')
plt.title('Top Negative Words With tf-idf')
plt.show()

plt.bar(cv_words_pos_tv[0:10], cv_counts_pos_tv[0:10])
plt.xticks(rotation='vertical')
plt.title('Top Positive Words with tf-idf')
plt.show()

# Model inputs: the "text" column as features, the "target" column as labels.
x, y = dataset['text'], dataset['target']

# Uni- to tri-gram count features over every row of the dataset.
# NOTE(review): the vocabulary is learned before the train/test split below,
# so it also sees rows that end up in the test set - confirm this is intended.
cv = CountVectorizer(ngram_range=(1,3), binary=False, stop_words='english')
x_cv = cv.fit_transform(x)

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows used for training; the remainder is held out for testing.
TRAIN_SIZE = 0.7

# Pass the constant as train_size. It was previously given as test_size,
# which made the test set 70% of the data and the training set only 30%.
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(
    x_cv, y, train_size=TRAIN_SIZE, random_state=42
)

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Bernoulli naive Bayes with default settings, fitted on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = BernoulliNB().fit(x_train_cv, y_train_cv)

# Score the held-out split: confusion matrix first, then the per-class report.
y_pred_cv = model.predict(x_test_cv)
print(
    confusion_matrix(y_test_cv, y_pred_cv),
    classification_report(y_test_cv, y_pred_cv),
    sep="\n",
)