In [12]:
# Setting up libraries
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, inspect
from config import user, password, hostname
from sklearn.feature_extraction.text import CountVectorizer

### Connecting to database

In [9]:
# Create engine.
# NOTE: the dialect name must be "postgresql"; the legacy "postgres://" alias
# was removed in SQLAlchemy 1.4 and raises NoSuchModuleError there.
engine = create_engine(f'postgresql://{user}:{password}@{hostname}/twitter_vs_stocks')

# Use the Inspector to explore the database and print the table names
inspector = inspect(engine)
inspector.get_table_names()

['stock', 'tweets_text', 'twitter_vs_stocks']

In [10]:
# Load the SQL table into a dataframe and index it by date in one chain,
# so the cell builds its result from scratch instead of mutating in place.
twitter_vs_stocks = (
    pd.read_sql_table('twitter_vs_stocks', con=engine)
    .set_index(['date'])
)
twitter_vs_stocks.head()

Unnamed: 0_level_0,tokenized_text,like_count,quote_count,reply_count,retweet_count,change,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-07-09,"['solar', 'powerwall', 'battery', 'ensures', '...",37454,400,4443,2871,4.140015,18118500
2021-07-09,"['tesla', 'solar', 'roof', 'powerwall', 'major...",13972,119,2185,1407,4.140015,18118500
2021-07-09,"['autonomous', 'spacex', 'droneship', 'shortfa...",63291,860,3653,6553,4.140015,18118500
2021-07-09,"['electrekco', 'bought', 'first', 'tesla', her...",0,0,0,979,4.140015,18118500
2021-07-08,"['maybe', 'movie', 'gaslit', 'us']",26485,92,1927,1045,8.159973,22773300


### Defining the most common words in tweets

In [25]:
# define function for counting words
def count_words(str, n=None):
    """Rank the vocabulary of a collection of texts by total occurrences.

    The texts are tokenized with a default-configured ``CountVectorizer``.

    Parameters
    ----------
    str : iterable of text documents
        The documents to count words in (e.g. a pandas Series of strings).
        The name shadows the builtin ``str`` inside this function; it is
        kept so that existing keyword callers keep working.
    n : int, optional
        Number of leading entries to return. ``None`` (the default)
        returns the full ranking.

    Returns
    -------
    list of (word, count) tuples
        Sorted by descending count; ``count`` is a plain Python ``int``.
    """
    vect = CountVectorizer()
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in a single pass instead of tokenizing the whole corpus twice.
    bag_of_words = vect.fit_transform(str)
    # Column sums: total occurrences of each vocabulary term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    # Cast to int so the tuples hold plain Python numbers rather than NumPy
    # scalars (which display as `np.int64(120)` under NumPy 2).
    freq = [(word, int(sum_words[0, idx])) for word, idx in vect.vocabulary_.items()]
    # The sort is stable, so words with equal counts keep their vocabulary order.
    freq.sort(key=lambda pair: pair[1], reverse=True)
    return freq[:n]

In [26]:
# rank every word found in the tokenized tweets, then preview the top 20
words_in_tweets = count_words(twitter_vs_stocks['tokenized_text'])
words_in_tweets[:20]

[('spacex', 120),
 ('tesla', 81),
 ('falcon', 43),
 ('launch', 40),
 ('first', 34),
 ('dragon', 28),
 ('model', 22),
 ('mission', 21),
 ('crew', 21),
 ('starship', 20),
 ('flight', 19),
 ('landing', 17),
 ('space_station', 17),
 ('starlink', 16),
 ('doo', 15),
 ('stage', 15),
 ('next', 14),
 ('giga', 14),
 ('nasa', 14),
 ('doge', 13)]

In [36]:
# keep only the words seen at least 3 times across all tweets
words_to_analyse = [token for token, count in words_in_tweets if count >= 3]
len(words_to_analyse)

330