In [1]:
import os
import re
from collections import Counter

import nltk
import pandas as pd
import spacy
from sqlalchemy import create_engine

### Cornell Movie--Dialogs Corpus 

In [2]:
# Connection settings for the PostgreSQL server that hosts the corpus.
# User, password, host and port can be overridden through environment
# variables so that credentials do not have to live in the notebook.
# NOTE(review): the fallback literals are the values that were previously
# hard-coded here; presumably shared course credentials - confirm before
# publishing this notebook.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
# Database and table are specific to this section, so they stay literal.
postgres_db = 'cornell_movie_dialogs'
table_name = 'dialogs'

In [3]:
# Build the SQLAlchemy connection URL from the settings defined in the previous cell.
db_url = f"postgresql://{postgres_user}:{postgres_pw}@{postgres_host}:{postgres_port}/{postgres_db}"

engine = create_engine(db_url)
try:
    # Read the whole table into a DataFrame. `table_name` is a constant set in
    # this notebook, not user input, so interpolating it is acceptable here.
    df = pd.read_sql_query(f"SELECT * FROM {table_name};", con=engine)
finally:
    # Dispose of the engine even when the query raises, so pooled
    # connections are not left open after a failed cell.
    engine.dispose()

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...
4,4,You're asking me out. That's so cute. What's ...


In [5]:
# Load the English pipeline with the parser and NER components disabled,
# then add a sentencizer component to the pipeline.
# NOTE(review): the 'en' shortcut and `create_pipe` are spaCy 2.x API -
# confirm the installed spaCy version before re-running.
nlp = spacy.load('en', disable=['parser', 'ner'])
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)
# Raise the maximum text length the pipeline accepts, since every dialog
# is processed below as one single string.
nlp.max_length = 20_000_000

# Concatenate all dialogs, then collapse every run of whitespace into a
# single space.
dialogs = ' '.join(" ".join(df.dialogs).split())

In [6]:
# Run the spaCy pipeline over the full, whitespace-normalised dialog text.
dialogs_doc = nlp(dialogs)

In [7]:
# Explore the objects that you've built: the document's type, its token
# count, its first three tokens, and the type of a single token.
print(
    f"The dialogs_doc object is a {type(dialogs_doc)} object.",
    f"It is {len(dialogs_doc)} tokens long",
    f"The first three tokens are '{dialogs_doc[:3]}'",
    f"The type of each token is {type(dialogs_doc[0])}",
    sep="\n",
)

The dialogs_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 4189489 tokens long
The first three tokens are 'Can we make'
The type of each token is <class 'spacy.tokens.token.Token'>


In [8]:
# Keep only the tokens that spaCy does not flag as stop words.
dialogs_without_stopwords = [
    tok
    for tok in dialogs_doc
    if not tok.is_stop
]

In [9]:
# Utility function to calculate how frequently words appear in the text
def word_frequencies(text):
    """Count how often each non-punctuation token text occurs.

    Args:
        text: An iterable of token-like objects exposing the ``is_punct``
            and ``text`` attributes.

    Returns:
        A ``Counter`` mapping each token's ``text`` to its number of
        occurrences; tokens whose ``is_punct`` is true are skipped.
    """
    # Feed the filtered token texts straight into the Counter.
    return Counter(tok.text for tok in text if not tok.is_punct)

# Ten most common words among the dialog tokens left after stop-word removal
# (punctuation is dropped inside `word_frequencies`).
dialogs_word_freq = word_frequencies(dialogs_without_stopwords).most_common(10)
# The label previously misspelled "frequency" as "freqency".
print(f'Dialogs word frequency: {dialogs_word_freq}')

Dialogs word freqency: [('know', 21478), ('like', 13765), ('got', 12663), ('want', 10800), ('think', 10427), ('going', 8770), ('right', 8710), ('>', 7669), ('Oh', 7516), ('time', 6452)]


In [10]:
# Utility function to calculate how frequently each lemma appears in the text
def lemma_frequencies(text):
    """Count how often each lemma occurs among the non-punctuation tokens.

    Args:
        text: An iterable of token-like objects exposing the ``is_punct``
            and ``lemma_`` attributes.

    Returns:
        A ``Counter`` mapping each token's ``lemma_`` to its number of
        occurrences; tokens whose ``is_punct`` is true are skipped.
    """
    # Feed the filtered lemmas straight into the Counter.
    return Counter(tok.lemma_ for tok in text if not tok.is_punct)

# Count lemmas over the stop-word-free dialog tokens, then keep the ten most
# common ones.
dialogs_lemma_counts = lemma_frequencies(dialogs_without_stopwords)
dialogs_lemma_freq = dialogs_lemma_counts.most_common(10)
print(f'Dialogs lemma frequency: {dialogs_lemma_freq}')

Dialogs lemma frequency: [('know', 24745), ('go', 16867), ('like', 15621), ('get', 15302), ('think', 15044), ('want', 13985), ('come', 11007), ('tell', 10487), ('right', 10103), ('look', 8806)]


In [13]:
# Initial exploration of sentences: materialise the document's sentence
# spans as a list so they can be counted and indexed.
sentences = list(dialogs_doc.sents)
# f-string used for consistency with the other messages in this notebook.
print(f"The dialogs doc has {len(sentences)} sentences.")

# Pick the fourth sentence (index 3) as a sample to inspect.
example_sentence = sentences[3]
print(f"Here is an example: {example_sentence}")

The dialogs doc has 478611 sentences.
Here is an example: Well, I thought we'd start with pronunciation, if that's okay with you.


In [15]:
# Look at some metrics around this sentence: how many non-punctuation tokens
# it has and how many distinct token texts those amount to.
example_words = [tok for tok in example_sentence if not tok.is_punct]
unique_words = {tok.text for tok in example_words}

print(f"There are {len(example_words)} words in this sentence, and {len(unique_words)} of them are unique.")

There are 14 words in this sentence, and 13 of them are unique.


### Twitter US Airline Sentiment

In [16]:
# Connection settings for the PostgreSQL server that hosts the tweets.
# User, password, host and port can be overridden through environment
# variables so that credentials do not have to live in the notebook.
# NOTE(review): the fallback literals are the values that were previously
# hard-coded here; presumably shared course credentials - confirm before
# publishing this notebook.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
# Database and table are specific to this section, so they stay literal.
postgres_db = 'twitter_sentiment'
table_name = 'twitter'

In [17]:
# Build the SQLAlchemy connection URL from the settings defined in the previous cell.
db_url = f"postgresql://{postgres_user}:{postgres_pw}@{postgres_host}:{postgres_port}/{postgres_db}"

engine = create_engine(db_url)
try:
    # Read the whole table into a DataFrame. `table_name` is a constant set in
    # this notebook, not user input, so interpolating it is acceptable here.
    df = pd.read_sql_query(f"SELECT * FROM {table_name};", con=engine)
finally:
    # Dispose of the engine even when the query raises, so pooled
    # connections are not left open after a failed cell.
    engine.dispose()

In [18]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,index,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [19]:
# Show the `text` value of the row labelled 0 as a sample tweet.
df.loc[0, "text"]

'@VirginAmerica What @dhepburn said.'

In [21]:
# Concatenate every tweet into one string, then collapse every run of
# whitespace into a single space.
tweets = " ".join(" ".join(df.text).split())

In [22]:
# Run the same spaCy pipeline over the full, whitespace-normalised tweet text.
tweets_doc = nlp(tweets)

In [23]:
# Report the document's type, its token count, its first three tokens, and
# the type of a single token.
print(
    f"The tweets_doc object is a {type(tweets_doc)} object.",
    f"It is {len(tweets_doc)} tokens long",
    f"The first three tokens are '{tweets_doc[:3]}'",
    f"The type of each token is {type(tweets_doc[0])}",
    sep="\n",
)

The tweets_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 305106 tokens long
The first three tokens are '@VirginAmerica What @dhepburn'
The type of each token is <class 'spacy.tokens.token.Token'>


In [24]:
# Keep only the tokens that spaCy does not flag as stop words.
tweets_without_stopwords = [
    tok
    for tok in tweets_doc
    if not tok.is_stop
]

In [26]:
# Ten most common words among the tweet tokens left after stop-word removal
# (punctuation is dropped inside `word_frequencies`).
tweets_word_freq = word_frequencies(tweets_without_stopwords).most_common(10)
# The label previously misspelled "frequency" as "freqency".
print(f'Word frequency: {tweets_word_freq}')

Word freqency: [('@united', 3733), ('flight', 3178), ('@AmericanAir', 2904), ('@USAirways', 2893), ('@SouthwestAir', 2390), ('@JetBlue', 2176), ('Cancelled', 1065), ('service', 928), ('time', 770), ('Flight', 740)]


In [27]:
# Count lemmas over the stop-word-free tweet tokens, then keep the ten most
# common ones.
tweets_lemma_counts = lemma_frequencies(tweets_without_stopwords)
tweets_lemma_freq = tweets_lemma_counts.most_common(10)
print(f'Lemma frequency: {tweets_lemma_freq}')

Lemma frequency: [('flight', 3981), ('@USAirways', 2579), ('@unite', 2445), ('@AmericanAir', 2382), ('@JetBlue', 1924), ('thank', 1634), ('@SouthwestAir', 1593), ('@united', 1333), ('hour', 1135), ('delay', 975)]


In [28]:
# Materialise the document's sentence spans as a list so they can be counted
# and indexed.
sentences = list(tweets_doc.sents)
# f-string used for consistency with the other messages in this notebook.
print(f"The tweets doc has {len(sentences)} sentences.")

# Pick the fourth sentence (index 3) as a sample to inspect.
example_sentence = sentences[3]
print(f"Here is an example: {example_sentence}")

The tweets doc has 22687 sentences.
Here is an example: @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp;


In [29]:
# Count the non-punctuation tokens in the sample sentence and how many
# distinct token texts they amount to.
example_words = [tok for tok in example_sentence if not tok.is_punct]
unique_words = {tok.text for tok in example_words}

print(f"There are {len(example_words)} words in this sentence, and {len(unique_words)} of them are unique.")

There are 14 words in this sentence, and 14 of them are unique.
