In [None]:
from nlp_functions import *
import scattertext as st
import io
from pprint import pprint
from scipy.stats import rankdata, hmean, norm
import spacy
import pkgutil, json, urllib
from urllib.request import urlopen
from scattertext import CorpusFromPandas, produce_scattertext_explorer
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
# TensorFlow 2.x no longer exposes `tf.Session` at the top level; fall back to
# the `tf.compat.v1` alias there so this cell runs on both 1.x and 2.x installs.
# `or` short-circuits, so `tf.compat` is only touched when `tf.Session` is absent.
sess = (getattr(tf, 'Session', None) or tf.compat.v1.Session)()

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names; pick whichever one this install ships.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [None]:
# Read the training file and keep a fixed-seed 10% sample of its rows
sentiment140 = (
    read_sentiment_file(DATA_DIRECTORY, 'sentiment140_train.csv')
    .sample(frac=0.1, random_state=7)
)
# Swap the numeric polarity codes (0 and 4) for readable labels used in the plots
sentiment140 = sentiment140.assign(
    polarity=sentiment140['polarity'].map({0: 'negative', 4: 'positive'})
)

In [None]:
# Bar chart of how many texts carry each polarity label, saved as an SVG figure
sns.set_palette(['red', 'green', 'blue'])
sns.countplot(x='polarity', data=sentiment140)
plt.savefig("../figures/negative_positive.svg", format="svg", dpi=300)

In [None]:
# Number of characters and words per sentence, drawn as two side-by-side histograms
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
char_counts = sentiment140['text'].str.len()
word_counts = sentiment140['text'].apply(lambda x: len(x.split()))
# One (values, title, x-label) entry per panel, left to right
panels = [
    (char_counts, 'Characters in each sentence', 'Number of characters'),
    (word_counts, 'Words in each sentence', 'Number of words'),
]
for ax, (values, title, xlabel) in zip(axs, panels):
    ax.hist(values, color='skyblue', bins=10)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
plt.savefig("../figures/characters_words.png", dpi=300);

In [None]:
# Download the NLTK stopword corpus and keep the English list as a set,
# which the following cells use for membership checks
nltk.download('stopwords')
stop = {*stopwords.words('english')}

In [None]:
# Most common stopwords in texts
# Flatten every whitespace-separated token of every text into one list.
# `corpus` is also read by the most-common-words cell that follows.
words = sentiment140['text'].str.split()
words = words.values.tolist()
corpus = [word for i in words for word in i]
# Count only the tokens found in the stopword set. `most_common` returns the
# ten highest counts, keeping first-seen order among ties.
# NOTE(review): matching is case-sensitive, so a capitalised token such as
# 'The' is skipped if `stop` holds only lowercase words — confirm this is intended.
dic = Counter(word for word in corpus if word in stop)
top = dic.most_common(10)
# zip(*top) yields nothing to unpack when no stopword was found, so fall back
# to two empty sequences instead of raising ValueError.
x, y = zip(*top) if top else ((), ())
plt.bar(x, y, color='skyblue')
plt.savefig("../figures/stopwords.png", dpi=300);

In [None]:
# Most common words in texts
# Rank every token by frequency, then keep those of the 40 most frequent
# tokens that are not in the stopword set (so fewer than 40 bars may remain).
counter = Counter(corpus)
most = counter.most_common()
frequent_non_stop = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in frequent_non_stop]
y = [count for _, count in frequent_non_stop]
# Horizontal bars: counts on the x-axis, words on the y-axis
sns.barplot(x=y, y=x)
plt.savefig("../figures/most_common.png", dpi=300);

In [None]:
# Getting top bigrams in texts
corpus = sentiment140['text']
# Learn the bigram vocabulary and build the document-term count matrix
vec = CountVectorizer(ngram_range=(2, 2))
bag_of_words = vec.fit_transform(corpus)
# Summing over rows gives one total per vocabulary column
sum_words = bag_of_words.sum(axis=0)
words_freq = []
for word, idx in vec.vocabulary_.items():
    words_freq.append((word, sum_words[0, idx]))
# Highest totals first
words_freq.sort(key=lambda pair: pair[1], reverse=True)
top_10_bigrams = words_freq[:10]
x = [bigram for bigram, _ in top_10_bigrams]
y = [total for _, total in top_10_bigrams]
# Horizontal bars: totals on the x-axis, bigrams on the y-axis
sns.barplot(x=y, y=x)
plt.savefig("../figures/bigrams.png", dpi=300);

In [None]:
# Parsing texts to scatter plot
nlp = spacy.load("en_core_web_sm")
# Run the loaded spaCy pipeline on each text and store the result per row.
# NOTE(review): this calls the pipeline once per row; batching with `nlp.pipe`
# may be faster on a sample this size — worth measuring.
sentiment140['parsed'] = sentiment140['text'].map(nlp)
# Save the frame, including the new 'parsed' column, without the index
sentiment140.to_csv('sentiment140_parsed.csv', index=False)

In [None]:
# Build the Scattertext corpus from the parsed documents, using the polarity
# label as the category of each document
corpus_builder = st.CorpusFromParsedDocuments(
    sentiment140,
    category_col='polarity',
    parsed_col='parsed',
)
corpus = corpus_builder.build()

In [None]:
# Render the Scattertext explorer with 'positive' as the focus category and
# everything else labelled 'Negative'
html = st.produce_scattertext_explorer(corpus,
                                       category='positive',
                                       category_name='Positive',
                                       not_category_name='Negative',
                                       minimum_term_frequency=5,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'ScattertextGraph.html'
# Write through a context manager so the file handle is flushed and closed
# as soon as the block exits instead of being left open.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))