In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import TweetTokenizer
import nltk

In [2]:
file = 'TweetsElonMusk.csv'
# read csv into pandas df
df = pd.read_csv(file)

# shuffle df
df = df.sample(frac=1).reset_index(drop=True)

# make boolean category of whether Musk was dating Grimes
# From the time that it was announced that they were quietly dating to their post about separating
df['datingGrimes'] = df.apply(lambda i : 'T' if i.date >= '2018-05-07' and i.date < '2021-09-24' else 'F', axis=1)

In [3]:
def tweet_features(tweet, word_features):
        
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
            features['contains({})'.format(word)] = (word in tweet_words)
        
    return features

In [4]:
tt = TweetTokenizer()
    
datingGrimes = df['datingGrimes'].to_list()
    
tweets_raw = df['tweet'].to_list()
tweets_tokenized = [tt.tokenize(tweet) for tweet in tweets_raw]

words = sum(tweets_tokenized, [])
words_freqdist = nltk.FreqDist(w.lower() for w in words)
word_features = list(words_freqdist)[:2000]

featuresets = [(tweet_features(tweets_tokenized[i], word_features), datingGrimes[i]) for i in range(len(tweets_tokenized))]

In [5]:
part = int(len(tweets_raw) * 0.1)
train_set, test_set = featuresets[part:], featuresets[:part]
train_tweets, test_tweets = tweets_raw[part:], tweets_raw[:part]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [11]:
print(nltk.classify.accuracy(classifier, test_set))

0.8192675159235668


In [6]:
classifier.show_most_informative_features(20)

Most Informative Features
             contains(") = True                F : T      =    147.3 : 1.0
           contains(...) = True                F : T      =     58.8 : 1.0
             contains(️) = True                T : F      =     27.5 : 1.0
         contains(isn't) = True                F : T      =     26.4 : 1.0
     contains(@annerajb) = True                T : F      =     23.6 : 1.0
        contains(@verge) = True                F : T      =     21.3 : 1.0
     contains(spaceship) = True                F : T      =     17.9 : 1.0
  contains(appreciation) = True                F : T      =     16.2 : 1.0
          contains(thru) = True                F : T      =     16.2 : 1.0
  contains(announcement) = True                F : T      =     14.8 : 1.0
      contains(landings) = True                F : T      =     12.8 : 1.0
     contains(droneship) = True                F : T      =     12.2 : 1.0
contains(@13ericralph31) = True                T : F      =     11.4 : 1.0

In [7]:
def predict(tweet, featureset, classifier):
    print('Tweet: ')
    print(tweet)
    guess = classifier.classify(featureset[0])
    if guess == 'T':
        print('Do we think Musk was dating Grimes when he wrote this? YES.')
    else:
        print('Do we think musk was dating Grimes when he wrote this? NO.')

    def yn(guess):
        if guess == featureset[1]:
            return 'YES'
        else:
            return 'NO'
    
    print('Were we correct? ' + yn(guess))
    print()

In [8]:
[predict(test_tweets[i], test_set[i], classifier) for i in range(0, 20)]

Tweet: 
Boring machine segments have been lowered into the starter tunnel. Going through final assembly.…  https://t.co/2jg03I7j4n
Do we think Musk was dating Grimes when he wrote this? YES.
Were we correct? NO

Tweet: 
@Trevorcochran2 🤔
Do we think Musk was dating Grimes when he wrote this? YES.
Were we correct? YES

Tweet: 
@Erdayastronaut @DJSnM @KevinKling12 @spaceXcentric Distance from fireball is 0.5*a*t^2, so if t is small, you haven’t moved far even if a is high. At ~6g thrust, you’ll only travel ~0.03m in 100 ms. Pressure wave (aka explosion) with liquid rockets is low, as ox &amp; fuel are poorly mixed. If you can fly out of it, you’re prob ok.
Do we think Musk was dating Grimes when he wrote this? YES.
Were we correct? YES

Tweet: 
@Teslarati Sheer magnitude of the entire production system is hard to appreciate. Almost every element of production is &gt;75% automated. Only wire harnesses &amp; general assembly, which are &lt;10% of production costs, are primarily manual.
Do 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]