In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

In [2]:
df = pd.read_csv('tweets.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,date,id,link,retweet,text,author
0,Oct 7,784609194234306560,/realDonaldTrump/status/784609194234306560,False,Here is my statement.pic.twitter.com/WAZiGoQqMQ,DonaldTrump
1,Oct 10,785608815962099712,/realDonaldTrump/status/785608815962099712,False,Is this really America? Terrible!pic.twitter.c...,DonaldTrump
2,Oct 8,784840992734064640,/realDonaldTrump/status/784840992734064641,False,The media and establishment want me out of the...,DonaldTrump
3,Oct 8,784767399442653184,/realDonaldTrump/status/784767399442653184,False,Certainly has been an interesting 24 hours!,DonaldTrump
4,Oct 10,785561269571026944,/realDonaldTrump/status/785561269571026946,False,Debate polls look great - thank you!\r\n#MAGA ...,DonaldTrump


In [4]:
df.shape

(17216, 6)

In [5]:
corpus_text = '\n'.join(df['text'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]
for i in range(5):
    print(sentences[i])

['here', 'is', 'my', 'statement.pic.twitter.com/wazigoqqmq']
['is', 'this', 'really', 'america?', 'terrible!pic.twitter.com/wiwc61pifu']
['the', 'media', 'and', 'establishment', 'want', 'me', 'out', 'of', 'the', 'race', 'so', 'badly', '-', '', 'i', 'will', 'never', 'drop', 'out', 'of', 'the', 'race,', 'will', 'never', 'let', 'my', 'supporters', 'down!', '#maga']
['certainly', 'has', 'been', 'an', 'interesting', '24', 'hours!']
['debate', 'polls', 'look', 'great', '-', 'thank', 'you!\r']


In [6]:
def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]
for i in range(5):
    print(sentences[i])

['here', 'is', 'my', 'statement.pic.twitter.com/wazigoqqmq']
['is', 'this', 'really', 'america', 'terrible!pic.twitter.com/wiwc61pifu']
['the', 'media', 'and', 'establishment', 'want', 'me', 'out', 'of', 'the', 'race', 'so', 'badly', '-', '', 'i', 'will', 'never', 'drop', 'out', 'of', 'the', 'race', 'will', 'never', 'let', 'my', 'supporters', 'down', '#maga']
['certainly', 'has', 'been', 'an', 'interesting', '24', 'hours']
['debate', 'polls', 'look', 'great', '-', 'thank', 'you!\r']


In [7]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=40, window=2)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
new_sentence = "i'm going to France for my hollidays".split(" ")  
result = model.docvecs.most_similar(positive=[model.infer_vector(new_sentence)],topn=5)

In [9]:
for tag, score in result:
    print("Sentence : ", documents[tag].words," | Score :", score )

Sentence :  ['my', 'parents:\r']  | Score : 0.8574124574661255
Sentence :  ['making', 'his', 'case', 'in', 'a', 'nice', 'and', 'articulate', 'manner']  | Score : 0.806242823600769
Sentence :  ['here', 'is', 'my', 'statement.pic.twitter.com/wazigoqqmq']  | Score : 0.8033185005187988
Sentence :  ['@neneleakes', 'seeks', 'my', 'advice', 'on', 'prenups', 'tonight', 'at', '9', 'pm', 'on', 'bravo']  | Score : 0.7755637168884277
Sentence :  ['thank', 'you', 'for', 'your', 'support!http://www.usnews.com/news/blogs/run-2016/2016/01/28/with-his-absence-trump-dominates-another-debate\xa0…']  | Score : 0.7738782167434692


In [10]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('documents.pkl', 'wb') as f:
    pickle.dump(documents, f)