# Lexicon Expansion

In this notebook, tweets regarding vaping are analyzed against one another on the basis of whether they contain at least one of a set of words.

In [67]:
import nltk
nltk.download('stopwords')
import numpy as np

from nltk.book import *
from string import punctuation
from collections import Counter

from pprint import pprint # get some prettier printing of objects

from nltk.corpus import stopwords

sw = stopwords.words('english')

import sqlite3
from collections import Counter, defaultdict
from string import punctuation



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aidan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset Manipulation

In this section of the notebook the dataset, a .csv file of vaping related tweets, is read in. Next, the tweets are split into two groups. The first group is tweets containing the word bathroom, and the second is tweets that do not. This process is repeated with an expanding set of words.

In [68]:
tweets = []
with open("vaping_tweets.csv", encoding = "UTF-8") as vape_tweets:
    tweets = tweets = [tweet.strip() for tweet in vape_tweets.readlines()]


In [69]:
bathroom_tweets = [tweet for tweet in tweets if "bathroom" in tweet]

non_bathroom = [tweet for tweet in tweets if "bathroom" not in tweet]

bathroom_clean = "".join(bathroom_tweets)
bathroom_clean = [tweet.lower() for tweet in bathroom_clean.split() if tweet.isalpha() and tweet not in sw]

bathroom_freq = nltk.FreqDist(bathroom_clean)

broomLen = sum([ch for w, ch in bathroom_freq.items()])

broom_concentration = dict()

for word, count in bathroom_freq.items():
    broom_concentration[word] = count/broomLen

In [70]:
#nicotine, kids, vaping

In [71]:
group1_tweets = [tweet for tweet in tweets if "bathroom" in tweet or "nicotine" in tweet or "kids" in tweet or "vaping" in tweet]

group1_cleaned = " ".join(group1_tweets)
group1_cleaned = [tweet.lower() for tweet in group1_cleaned.split() if tweet.isalpha() and tweet not in sw]

non_group1 = [tweet for tweet in tweets if "bathroom" not in tweet and "nicotine" not in tweet and "kids" not in tweet and "vaping" not in tweet]
group1_freq = nltk.FreqDist(group1_cleaned)

g1Len = sum([ch for w, ch in group1_freq.items()])

g1_concentration = dict()

for word, count in group1_freq.items():
    g1_concentration[word] = count/g1Len


In [72]:
#pod, smoking
group2_tweets = [tweet for tweet in tweets if "bathroom" in tweet or "nicotine" in tweet or "kids" in tweet or "vaping" in tweet or "pod" in tweet or "smoking" in tweet]
non_group2 = [tweet for tweet in tweets if "bathroom" not in tweet and "nicotine" not in tweet and "kids" not in tweet and "vaping" not in tweet and "pod" not in tweet and "smoking" not in tweet]

group2_cleaned = " ".join(group2_tweets)
group2_cleaned = [tweet.lower() for tweet in group2_cleaned.split() if tweet.isalpha() and tweet not in sw]


group2_freq = nltk.FreqDist(group2_tweets)

g2Len = sum([ch for w, ch in group2_freq.items()])

g2_concentration = dict()

for word, count in group2_freq.items():
    g2_concentration[word] = count/g1Len



Final List: addicted, aspire, stop, bathroom, pod, smoking, nicotine, kids, vaping.

# Analysis

Once the final set has been determined, the tweets are once again split into two groups. One containing the words, and the other not. The vapeAnalyzer function is then run against the groups. This produces a dictionary containing summary statistics and comparative statistics of the two groups

In [74]:
wordSet_tweets = [tweet for tweet in tweets if "bathroom" in tweet or "nicotine" in tweet or "kids" in tweet or "vaping" in tweet or "pod" in tweet or "smoking" in tweet or "addicted" in tweet or "aspire" in tweet or "stop" in tweet]
non_wordSet = [tweet for tweet in tweets if "bathroom" not in tweet and "nicotine" not in tweet and "kids" not in tweet and "vaping" not in tweet and "pod" not in tweet and "smoking" not in tweet and "aspire" not in tweet and "addicted" not in tweet and "stop" not in tweet]


set_cleaned = " ".join(wordSet_tweets)
set_cleaned = [tweet.lower() for tweet in set_cleaned.split() if tweet.isalpha() and tweet not in sw]

nonSet_cleaned = " ".join(non_wordSet)
nonSet_cleaned = [tweet.lower() for tweet in nonSet_cleaned.split() if tweet.isalpha() and tweet not in sw]

In [75]:
def vapeAnalyzer(corpus_1,corpus_2,num_words,ratio_cutoff):
    combined_dict1 = {}
    combined_dict2 = {}
    oneV2Dict = {}
    twoV1Dict = {}
    outputDict = {"one":combined_dict1,"two":combined_dict2,"one_vs_two": oneV2Dict,"two_vs_one":twoV1Dict}
    
    c = Counter()
    
    total_tokens = 1
    unique_tokens = 0
    avg_token_len = 0.0
    lex_diversity = 0.0
    top_10 = Counter()
    
    #Summary Stats for 1
    c1_clean = [word for word in corpus_1]
    c1_clean = [word.lower() for word in c1_clean if word.isalpha() and word.lower() not in sw]
    
    
    c.update(c1_clean)
        
     # Calculate your statistics here
    total_tokens = len(c1_clean)
    unique_tokens = len(set(c1_clean))
    lex_diversity = (unique_tokens/total_tokens)
    
    
    token_len = [len(word) for word in c1_clean]
    
    avg_token_len = np.mean(token_len)
    top_10 = c.most_common(10)
    
    combined_dict1 = {'tokens':total_tokens,
                'unique_tokens':unique_tokens,
                'avg_token_length':avg_token_len,
                'lexical_diversity':lex_diversity,
                'top_words':top_10}
    
    #Summary Stats for 2
    c2_clean = [word for word in corpus_2]
    c2_clean = [word.lower() for word in c2_clean if word.isalpha() and word.lower() not in sw]
    
    
    c.update(c2_clean)
        
     # Calculate statistics here
    total_tokens = len(c2_clean)
    unique_tokens = len(set(c2_clean))
    lex_diversity = (unique_tokens/total_tokens)
    
    
    token_len = [len(word) for word in c2_clean]
    
    avg_token_len = np.mean(token_len)
    top_10 = c.most_common(10)
    
        # Now we'll fill out the dictionary. 
    combined_dict2 = {'tokens':total_tokens,
                'unique_tokens':unique_tokens,
                'avg_token_length':avg_token_len,
                'lexical_diversity':lex_diversity,
                'top_words':top_10}
    
    freq1 = nltk.FreqDist(c1_clean)
    freq2 = nltk.FreqDist(c2_clean)
    
    corpus_one_concentration = defaultdict(float)
    corpus_two_concentration = defaultdict(float)
    
    for word, freq in freq1.items():
        corpus_one_concentration[word] = freq/len(c1_clean)
    
    for word, freq in freq2.items():
        corpus_two_concentration[word] = freq/len(c2_clean)
        
    
    
    oneV2Dict = {}
    twoV1Dict = {}
    
    for word in corpus_one_concentration:
        if ratio_cutoff > freq1[word] or ratio_cutoff > freq2[word]:
            continue
        ratio1 = corpus_one_concentration[word]/corpus_two_concentration[word] 
        
        oneV2Dict[word] = ratio1
    
    for word in corpus_two_concentration:
        if ratio_cutoff > freq1[word] or ratio_cutoff > freq2[word]:
            continue
        ratio2 = corpus_two_concentration[word]/corpus_one_concentration[word] 
        
        twoV1Dict[word] = ratio2
        
    
    
    
    outputDict = {"Set_Tweets":combined_dict1,"Non_Set_Tweets":combined_dict2,"Set_vs_Not": sorted(oneV2Dict.items(), key = lambda item:item[1], reverse = True)[:num_words],"Not_Vs_Set":sorted(twoV1Dict.items(), key = lambda item:item[1], reverse = True)[:num_words]}
    print(outputDict)

# Results

Below is the ouptut of the vapeAnalyzer function. Visible are summary statistics for both sets of tweets, as well as a ratio comparison.

In [76]:
vapeAnalyzer(set_cleaned,nonSet_cleaned,10,3)

{'Set_Tweets': {'tokens': 2684153, 'unique_tokens': 55266, 'avg_token_length': 5.555872187613747, 'lexical_diversity': 0.02058973538393676, 'top_words': [('juul', 99320), ('smoking', 69123), ('vaping', 56738), ('nicotine', 45649), ('new', 26038), ('vape', 24850), ('like', 24222), ('get', 15779), ('pod', 15454), ('people', 14723)]}, 'Non_Set_Tweets': {'tokens': 1102561, 'unique_tokens': 44369, 'avg_token_length': 5.388470116392653, 'lexical_diversity': 0.04024176440124401, 'top_words': [('juul', 127534), ('smoking', 69123), ('vaping', 56738), ('vape', 55713), ('nicotine', 45649), ('like', 34633), ('new', 31919), ('cigarettes', 26424), ('get', 22653), ('people', 19318)]}, 'Set_vs_Not': [('cleito', 128.77539898060954), ('pockex', 81.60567548372491), ('annie', 79.82568591780473), ('conscience', 48.88125192565402), ('vows', 47.135492928309226), ('bvc', 32.1425038922893), ('carole', 29.71213352343676), ('vgod', 27.99375898095228), ('evo', 27.384454860310374), ('viral', 25.03110327075245)], '