In [1]:
!pip install scattertext

Collecting scattertext
  Downloading scattertext-0.1.19-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting flashtext (from scattertext)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9296 sha256=578404b5c91b18fa89499b8ac85400cc54d0610d139fc6db9a78699234db0805
  Stored in directory: /root/.cache/pip/wheels/bc/be/39/c37ad168eb2ff644c9685f52554440372129450f0b8ed203dd
Successfully built flashtext
Installing collected packages: flashtext, scattertext
Successfully installed flashtext-2.7 scattertext-0.1.19


In [2]:
!pip install empath

Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57800 sha256=284137e11ee7766daab8b3f9edc5ad42a8431b9edf2447d447367805802399da
  Stored in directory: /root/.cache/pip/wheels/92/b3/83/9eb2c6199881e2385a59d99bd911363475060ebeb4bdb27242
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


## LOADING LIBRARIES

In [3]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen

import IPython
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer

display(HTML("<style>.container {width:98% !important;}</style>"))

In [4]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is applied to every
# tweet further down to turn the raw text into parsed documents.
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
# Read the tweet dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_tweets.csv")

## Preview the first 5 rows of the dataset

In [6]:
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,tweet_created,airline_sentiment,negativereason,text
0,24/02/15 11:35,neutral,,What dhepburn say
1,24/02/15 11:15,positive,,plus add commercial experience tacky
2,24/02/15 11:15,neutral,,today Must mean need take another trip
3,24/02/15 11:15,negative,Bad Flight,really aggressive blast obnoxious entertainmen...
4,24/02/15 11:14,negative,Can't Tell,really thing


## Dataset summary: columns, non-null counts, and dtypes

In [7]:
# Print the column names, non-null counts and dtypes of the frame; the
# non-null counts reveal which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_created      499 non-null    object
 1   airline_sentiment  499 non-null    object
 2   negativereason     181 non-null    object
 3   text               496 non-null    object
dtypes: object(4)
memory usage: 15.7+ KB


## How the tweets are distributed across the three sentiment categories
### Negative
### Neutral
### Positive

In [8]:
# Count how many tweets carry each sentiment label.
df['airline_sentiment'].value_counts()

negative    181
neutral     169
positive    149
Name: airline_sentiment, dtype: int64

## Parse the text column with spaCy

In [9]:
# A few rows have no text (pandas reads them back as NaN). Casting those with
# astype(str) would turn them into the literal word "nan", which would then be
# parsed and counted as a real token in the corpus, so drop them first.
df = df.dropna(subset=['text']).reset_index(drop=True)
df['text'] = df['text'].astype(str)

In [10]:
# Run every tweet through the spaCy pipeline; the 'text' column now holds
# parsed documents instead of raw strings.
df['text'] = df['text'].map(nlp)

In [11]:
# Show the first five rows again to confirm 'text' now contains parsed documents.
df.head(n=5)

Unnamed: 0,tweet_created,airline_sentiment,negativereason,text
0,24/02/15 11:35,neutral,,"(What, dhepburn, say)"
1,24/02/15 11:15,positive,,"(plus, add, commercial, experience, tacky)"
2,24/02/15 11:15,neutral,,"(today, Must, mean, need, take, another, trip)"
3,24/02/15 11:15,negative,Bad Flight,"(really, aggressive, blast, obnoxious, enterta..."
4,24/02/15 11:14,negative,Can't Tell,"(really, thing)"


In [12]:
# Build a Scattertext corpus from the parsed tweets, using the sentiment
# label as the document category.
corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='airline_sentiment',
    parsed_col='text',
)
corpus = corpus_factory.build()

In [13]:
# Render an interactive Scattertext plot contrasting the terms used in
# negative tweets with the terms used in positive tweets.
html = produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    # The corpus holds three sentiment classes. Without this restriction every
    # non-negative tweet (including the neutral ones) would be counted on the
    # axis labelled "positive".
    not_categories=['positive'],
    width_in_pixels=1000,
    minimum_term_frequency=5,
    transform=st.Scalers.log_scale_standardize,
)

file_name = 'AirlineTweetScattertextScale.html'
# Write through a context manager so the file is flushed and closed before the
# IFrame below tries to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

# Create a corpus of topics extracted with Empath

In [14]:

# Build a second corpus whose features come from the Empath feature builder
# instead of the tweets' individual terms.
feat_builder = st.FeatsFromOnlyEmpath()
empath_corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='airline_sentiment',
    feats_from_spacy_doc=feat_builder,
    parsed_col='text',
)
empath_corpus = empath_corpus_factory.build()

# Visualize Empath topics

In [16]:

# Render an interactive Scattertext plot of Empath topics, contrasting
# negative tweets with positive tweets.
html = produce_scattertext_explorer(
    empath_corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    # The corpus holds three sentiment classes. Without this restriction every
    # non-negative tweet (including the neutral ones) would be counted on the
    # axis labelled "positive".
    not_categories=['positive'],
    width_in_pixels=1000,
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=feat_builder.get_top_model_term_lists(),
)

file_name = 'AirlineSentimentEmpath.html'
# Write through a context manager so the file is flushed and closed before the
# IFrame below tries to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=600)