In [1]:
import pandas as pd
import numpy as np
import re
import scipy

### Creating dataframe from csv

In [2]:
df = pd.read_csv('tweet_csvs/realDonaldTrump_tweets.csv', index_col = None, header = 0, 
                     parse_dates=['created_at'], infer_datetime_format = True, dayfirst = True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3318 entries, 0 to 3317
Data columns (total 3 columns):
id            3318 non-null int64
created_at    3318 non-null datetime64[ns]
text          3318 non-null object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 77.8+ KB


In [4]:
df.head(2)

Unnamed: 0,id,created_at,text
0,820251730407473153,2017-01-14 12:50:26,Congressman John Lewis should spend more time ...
1,820255947956383744,2017-01-14 13:07:12,mention crime infested) rather than falsely co...


In [5]:
df.tail(2)

Unnamed: 0,id,created_at,text
3316,1004100003185426432,2018-06-05 20:37:50,https://t.co/4OjDqTMEIx
3317,1004092260475162624,2018-06-05 20:07:04,Imagine how much wasteful spending we’d save i...


### Text Pre-processing

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(1, 1), stop_words = 'english')

# Pulls all of trumps tweet text's into one giant string
summaries = "".join(df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

Counter(ngrams_summaries).most_common(20)

[('https', 1310),
 ('great', 679),
 ('amp', 569),
 ('people', 317),
 ('news', 277),
 ('trump', 242),
 ('fake', 241),
 ('president', 238),
 ('just', 229),
 ('country', 228),
 ('america', 227),
 ('rt', 222),
 ('big', 211),
 ('thank', 195),
 ('tax', 190),
 ('today', 184),
 ('american', 176),
 ('time', 172),
 ('jobs', 171),
 ('democrats', 156)]

In [7]:
df_train = df['text'].tolist()

In [8]:
trump_tfidf = vect.fit_transform(df_train)

In [9]:
#test sentence
#sentence = 'To all the little girls watching...never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world.'
test = ['Come on and kill Kenny!', 
        'Make America great again!', 
        'Beer... Beeeeer... Beeeeeeeeer... WOO-HOO!']

In [10]:
test_tfidf = vect.transform(test)

In [11]:
test_tfidf.shape

(3, 8161)

In [12]:
test_len = test_tfidf.shape[0]
test_len

3

### OneClassSVM

In [13]:
from sklearn import svm

In [14]:
ocsvm = svm.OneClassSVM(nu = 0.5, kernel = 'rbf', gamma = 0.1)

In [15]:
y_true = [1 for i in range(trump_tfidf.shape[0])]

In [16]:
ocsvm.fit(trump_tfidf, y = y_true)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
      max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [17]:
prediction = ocsvm.predict(test_tfidf)

### Cosine Similarity

In [18]:
from sklearn.metrics.pairwise import cosine_similarity 

In [19]:
i = 0

for sentence in test:
    sentence_tfidf = vect.transform([sentence])
    cos_dists = cosine_similarity(trump_tfidf, sentence_tfidf)
    print('>>> {}'.format(sentence))
    print('Cosine similarity: {}'.format(round(np.max(cos_dists), 2)))
    print('OneClassSVM prediction: {}\n'.format(prediction[i]))
    i += 1

>>> Come on and kill Kenny!
Cosine similarity: 0.33
OneClassSVM prediction: -1

>>> Make America great again!
Cosine similarity: 1.0
OneClassSVM prediction: 1

>>> Beer... Beeeeer... Beeeeeeeeer... WOO-HOO!
Cosine similarity: 0.0
OneClassSVM prediction: 1

