<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/06_textasdata_partI_textmining_examplecode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [2]:
!pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 2.9MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

## uncomment and download if this is your first 
## time running 
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

## sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## specify to print all output in a call
## and not just first
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
## spacy (still being installed on jhub)
import spacy
sp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Load data 

In [6]:
# Clone the entire repo.
!git clone -l -s https://github.com/andrew66882011/qss20_slides_activities.git cloned-repo
%cd cloned-repo
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 332, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 332 (delta 110), reused 22 (delta 22), pack-reused 207[K
Receiving objects: 100% (332/332), 61.50 MiB | 20.54 MiB/s, done.
Resolving deltas: 100% (196/196), done.
/content/cloned-repo
activities  problemsets  public_data  README.md  slides


In [7]:
!unzip ./public_data/airbnb_text.zip

Archive:  ./public_data/airbnb_text.zip
  inflating: airbnb_text.csv         


In [10]:
## if working from within the repo, can use this relative path
path_todata = "./airbnb_text.csv"

## load data
ab = pd.read_csv(path_todata)
ab.head()
ab.info()

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   48895 non-null  int64 
 1   name                 48879 non-null  object
 2   name_upper           48879 non-null  object
 3   neighbourhood_group  48895 non-null  object
 4   price                48895 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


# Text mining

## Manual approach 1: look for a single word

In [11]:
## using the `name_upper` var, look at where reviews mention cozy
ab['is_cozy'] = np.where(ab.name_upper.str.contains("COZY"), True, False)

## find the mean price by neighborhood and whether cozy
mp = pd.DataFrame(ab.groupby(['is_cozy', 'neighbourhood_group'])['price'].mean())

## reshape to wide format so that each borough is row
## and one col is the mean price for listings that describe
## the place as cozy; other col is mean price for listings
## without that word
mp_wide = pd.pivot_table(mp, index = ['neighbourhood_group'],
                        columns = ['is_cozy'])

mp_wide.columns = ['no_mention_cozy', 'mention_cozy']

mp_wide

Unnamed: 0_level_0,no_mention_cozy,mention_cozy
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,89.231088,74.214286
Brooklyn,128.175441,91.130224
Manhattan,204.109775,129.91714
Queens,102.596682,80.344388
Staten Island,120.650307,74.319149


## Manual approach 2: score based on dictionary of words

In [12]:
## construct dictionary
space_indicators = {'small': ['COZY', 'COMFY', 'LITTLE', 'SMALL'],
                   'large': ['SPACIOUS', 'LARGE', 'HUGE', 'GIANT']}


## for each listing, find the number of occurrences
## of words in each key

### first, let's test with one listing
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"

### splitting that string at space and looking at overlap with each key
### first, look at overlap with the list containing words for small
words_overlap_small = [word for word in practice_listing.split(" ") if 
                      word in space_indicators['small']]
words_overlap_small

### then, look at overlap with the list containing words for large
words_overlap_large = [word for word in practice_listing.split(" ") if 
                      word in space_indicators['large']]
words_overlap_large

### could then take length as a fraction of all words
len(words_overlap_small)/len(practice_listing.split(" "))
len(words_overlap_large)/len(practice_listing.split(" "))


['COZY', 'LITTLE']

[]

0.3333333333333333

0.0

## Part of speech tagging

In [20]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
## specify example
example_for_tag = "This is a chill apt next to the subway in LES Chinatown"
example_for_tag

'This is a chill apt next to the subway in LES Chinatown'

In [23]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [24]:
## try part of speech tagging using nltk
tokens = word_tokenize(example_for_tag) # Generate list of tokens
tokens_pos = pos_tag(tokens) # generate part of speech tags for those tokens
 
## returns a list of tuples
## first element in tuple is a word
## second element in tuple is the part of speech
for one_tok in tokens_pos:
    print(one_tok)

('This', 'DT')
('is', 'VBZ')
('a', 'DT')
('chill', 'NN')
('apt', 'JJ')
('next', 'JJ')
('to', 'TO')
('the', 'DT')
('subway', 'NN')
('in', 'IN')
('LES', 'NNP')
('Chinatown', 'NNP')


In [25]:
## use list iteration to extract proper nouns (NNP)
## i'm first checking if the second element in the tuple
## is equal to NNP
## if so, i'm returning the first element in the tuple (the 
## actual word)
all_prop_noun = [one_tok[0] for one_tok in tokens_pos 
                if one_tok[1] == "NNP"]
all_prop_noun

['LES', 'Chinatown']

## Named Entity Recognition

In [26]:
## modified from: https://twitter.com/dartmouth/status/1387488541844856838

## tweet
d_tweet = """Dependents, partners, and household members of
Dartmouth College students, staff, and faculty who are 18 or older are
now eligible to sign up for COVID-19 vaccination clinics on May 5 and May 6.
The deadline to sign up is 11:59 p.m. on April 29. These are in New Hampshire.
"""

In [27]:
spacy_dtweet = nlp(d_tweet)
for one_tok in spacy_dtweet.ents:
    print("Entity: " + one_tok.text + "; NER tag: " + one_tok.label_)

Entity: Dartmouth College; NER tag: ORG
Entity: 18 or older; NER tag: DATE
Entity: May 5; NER tag: DATE
Entity: May 6; NER tag: DATE
Entity: 11:59 p.m.; NER tag: TIME
Entity: April 29; NER tag: DATE
Entity: New Hampshire; NER tag: GPE


In [28]:
## try a couple variations
## eg removing college, NH compared to New Hampshire
## capitalize faculty

## Sentiment analysis

### Using the default scorer on a few example phrases

In [30]:
## initialize a scorer
sent_obj = SentimentIntensityAnalyzer()

## score one listing
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"
sentiment_example = sent_obj.polarity_scores(practice_listing)
sentiment_example

{'compound': 0.4215, 'neg': 0.0, 'neu': 0.641, 'pos': 0.359}

In [31]:
## adding phrase with word terrible and score
practice_listing_2 = "NICE AND COZY LITTLE APT AVAILABLE. REALLY TERRIBLE VIEW."
sentiment_example_2 = sent_obj.polarity_scores(practice_listing_2)

## adding phrase about rats; bad but might not be in scoring dictionary
practice_listing_3 = "NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH."
sentiment_example_3 = sent_obj.polarity_scores(practice_listing_3)

In [None]:
## summarize all 3
print("String: " + practice_listing + " scored as:\n" + str(sentiment_example))
print("String: " + practice_listing_2 + " scored as:\n" + str(sentiment_example_2))
print("String: " + practice_listing_3 + " scored as:\n" + str(sentiment_example_3))


String: NICE AND COZY LITTLE APT AVAILABLE scored as:
{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'compound': 0.4215}
String: NICE AND COZY LITTLE APT AVAILABLE. REALLY TERRIBLE VIEW. scored as:
{'neg': 0.257, 'neu': 0.531, 'pos': 0.212, 'compound': -0.1513}
String: NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH. scored as:
{'neg': 0.0, 'neu': 0.741, 'pos': 0.259, 'compound': 0.4215}


### Updating the dictionary with manually-added words

In [None]:
## lexicon is a dictionary where the key
## is the word
## the value is the score (negative = negative)
## here, i'm benchmarking the negativity of the
## rodents to the negativity of the word aversion
sent_obj.lexicon['aversion']

-1.9

In [None]:
## create a dictionary with 
## negative scores for pests
pest_words = {
    'rat': -1.9,
    'rats': -1.9,
    'mice': -1.9,
    'mouse': -1.9,
    'roach': -1.9,
    'cockroach': -1.9
}


## initiate new sentiment object
## so that we don't alter old one
## use.update to add new words
new_si = SentimentIntensityAnalyzer()
new_si.lexicon.update(pest_words)

## try re-scoring the third example
## see negative
print("After lexicon update: " + practice_listing_3 + " scored as:\n" + \
      str(new_si.polarity_scores(practice_listing_3)))

After lexicon update: NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH. scored as:
{'neg': 0.228, 'neu': 0.551, 'pos': 0.22, 'compound': -0.0258}
