In [1]:
import pandas as pd
import numpy as np
import re
import scipy

### Creating dataframe from csv

In [2]:
df = pd.read_csv('tweet_csvs/realDonaldTrump_tweets.csv', index_col = None, header = 0, 
                     parse_dates=['created_at'], infer_datetime_format = True, dayfirst = True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3378 entries, 0 to 3377
Data columns (total 3 columns):
id            3378 non-null int64
created_at    3378 non-null datetime64[ns]
text          3378 non-null object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 79.2+ KB


In [4]:
df.head(2)

Unnamed: 0,id,created_at,text
0,820251730407473153,2017-01-14 12:50:26,Congressman John Lewis should spend more time ...
1,820255947956383744,2017-01-14 13:07:12,mention crime infested) rather than falsely co...


In [5]:
df.tail(2)

Unnamed: 0,id,created_at,text
3376,1014090584963866624,2018-07-03 10:16:51,"Crazy Maxine Waters, said by some to be one of..."
3377,1013976609290964997,2018-07-03 02:43:57,Many Democrats are deeply concerned about the ...


In [6]:
df_train = df['text'].tolist()

### CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vect_bow = CountVectorizer(ngram_range=(1, 1), stop_words = 'english')

In [9]:
trump_bow = vect_bow.fit_transform(df_train)

### TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
#from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect_tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words = 'english')

# Pulls all of trumps tweet text's into one giant string
#summaries = "".join(df['text'])
#ngrams_summaries = vect.build_analyzer()(summaries)

#Counter(ngrams_summaries).most_common(20)

In [11]:
trump_tfidf = vect_tfidf.fit_transform(df_train)

## Test Strings

In [12]:
#test sentence
#sentence = 'To all the little girls watching...never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world.'
test = ['Come on and kill Kenny!', 
        'Make America great again!', 
        'Beer... Beeeeer... Beeeeeeeeer... WOO-HOO!']

In [13]:
test_tfidf = vect_tfidf.transform(test)

In [14]:
test_bow = vect_bow.transform(test)

In [15]:
#test_tfidf.shape

### OneClassSVM

In [16]:
from sklearn import svm

In [17]:
ocsvm = svm.OneClassSVM(nu = 0.5, kernel = 'rbf', gamma = 0.1)

In [18]:
y_true_bow = [1 for i in range(trump_bow.shape[0])]
y_true_tfidf = [1 for i in range(trump_tfidf.shape[0])]

In [19]:
ocsvm.fit(trump_bow, y = y_true_bow)
prediction_bow = ocsvm.predict(test_bow)
prediction_bow

array([1, 1, 1])

In [20]:
ocsvm.fit(trump_tfidf, y = y_true_tfidf)
prediction_tfidf = ocsvm.predict(test_tfidf)
prediction_tfidf

array([-1,  1,  1])

### word2vec

In [21]:
from gensim.models import Word2Vec

In [22]:
from nltk.tokenize import wordpunct_tokenize, TweetTokenizer, RegexpTokenizer

In [23]:
regexp_tok = RegexpTokenizer(r'\w+')

In [24]:
%%time
tokenized_tweets = []
for tweet in df_train:
    tokenized = regexp_tok.tokenize(tweet)
    tokenized_tweets.append(tokenized)

CPU times: user 48.6 ms, sys: 33 µs, total: 48.6 ms
Wall time: 48.2 ms


In [25]:
%%time
w2v_model = Word2Vec(tokenized_tweets, min_count = 1)

CPU times: user 1.7 s, sys: 31.7 ms, total: 1.73 s
Wall time: 883 ms


In [62]:
df_train_tokenized = []
for tweet in df_train:
    tweet_tokenized = regexp_tok.tokenize(tweet)
    df_train_tokenized.append(tweet_tokenized)

In [67]:
%%time
wmd_list = []
for sentence in test:
    wmd_list_temp = []
    for tweet in df_train_tokenized:
        wmd = w2v_model.wv.wmdistance(tweet, regexp_tok.tokenize(sentence))
        wmd_list_temp.append(wmd)
    wmd_list.append(np.max(wmd_list_temp))

CPU times: user 25.4 s, sys: 39.9 ms, total: 25.4 s
Wall time: 25.4 s


In [68]:
for i in wmd_list:
    if np.isinf(i) == False:
        print('{} is a valid value'.format(i))
    else:
        print('{} is NOT a valid value'.format(i))

5.643882510278225 is a valid value
5.8476670864133835 is a valid value
inf is NOT a valid value


### Cosine Similarity & Tests

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
i = 0

for sentence in test:
    sentence_bow = vect_bow.transform([sentence])
    cos_dists_bow = cosine_similarity(trump_bow, sentence_bow)
    
    sentence_tfidf = vect_tfidf.transform([sentence])
    cos_dists_tfidf = cosine_similarity(trump_tfidf, sentence_tfidf)
    
    mean_cos_dist = np.mean([np.max(cos_dists_bow), np.max(cos_dists_tfidf)])
    
    print('>>> {}'.format(sentence))
    print('Cosine similarity Bag-Of-Words: {}'.format(round(np.max(cos_dists_bow), 3)))
    print('Cosine similarity TF-IDF: {}'.format(round(np.max(cos_dists_tfidf), 3)))
    print('Mean Cosine Similarity: {}\n'.format(round(mean_cos_dist, 3)))
    print('OneClassSVM BOW prediction: {}'.format(prediction_bow[i]))
    print('OneClassSVM TF-IDF prediction: {}\n'.format(prediction_tfidf[i]))
    print('Word2Vec Word Mover`s Distance: {}\n'.format(wmd_list[i]))
    i += 1

>>> Come on and kill Kenny!
Cosine similarity Bag-Of-Words: 0.267
Cosine similarity TF-IDF: 0.327
Mean Cosine Similarity: 0.297

OneClassSVM BOW prediction: 1
OneClassSVM TF-IDF prediction: -1

Word2Vec Word Mover`s Distance: 5.643882510278225

>>> Make America great again!
Cosine similarity Bag-Of-Words: 1.0
Cosine similarity TF-IDF: 1.0
Mean Cosine Similarity: 1.0

OneClassSVM BOW prediction: 1
OneClassSVM TF-IDF prediction: 1

Word2Vec Word Mover`s Distance: 5.8476670864133835

>>> Beer... Beeeeer... Beeeeeeeeer... WOO-HOO!
Cosine similarity Bag-Of-Words: 0.0
Cosine similarity TF-IDF: 0.0
Mean Cosine Similarity: 0.0

OneClassSVM BOW prediction: 1
OneClassSVM TF-IDF prediction: 1

Word2Vec Word Mover`s Distance: inf



### Saving models

In [65]:
from sklearn.externals import joblib

joblib.dump(df_train_tokenized, 'df_train_tokenized.pkl')

joblib.dump(vect_bow, 'vect_bow.pkl')
joblib.dump(trump_bow, 'trump_bow.pkl')

joblib.dump(vect_tfidf, 'vect_tfidf.pkl')
joblib.dump(trump_tfidf, 'trump_tfidf.pkl')

joblib.dump(ocsvm, 'ocsvm.pkl')

joblib.dump(w2v_model, 'w2v.pkl')

['w2v.pkl']