In [1]:
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
import numpy as np
import re

In [2]:
links = re.compile("http[s]?://[a-zA-z./\d]*")
punc = re.compile("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~￥%……&*（）]+")
punc1 = re.compile("[【】╮╯▽╰╭★→「」]+")
punc2 = re.compile("！，❤。～《》：（）【】「」？”“；：、")
fourdots = re.compile("\.*")

def clean(line):
    line = links.sub(' ',line)
    line = punc.sub(' ',line)
    line = punc1.sub(' ',line)
    line = punc2.sub(' ',line)
    line = fourdots.sub('',line)
    return(line)    

In [3]:
with open('./Data/pos_tweets.txt', 'r') as infile:
    pos_tweets = []
    for line in infile:
        pos_tweets.append(clean(line))

with open('./Data/neg_tweets.txt', 'r') as infile:
    neg_tweets = []
    for line in infile:
        neg_tweets.append(clean(line))

y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets)))).astype(int).astype(str)

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)

train = []
trainonly = []
for i, (xline,yline) in enumerate(zip(x_train,y_train)):
    train.append(TaggedDocument(xline.lower().split(),[i,yline]))
    trainonly.append(TaggedDocument(xline.lower().split(),[i]))

modeltags = Doc2Vec(vector_size=100, min_count=3, epochs= 200, workers= 4)
modeltags.build_vocab(train)
modeltags.train(train,total_examples=modeltags.corpus_count,epochs=modeltags.epochs)

modelonly = Doc2Vec(vector_size=100, min_count=3, epochs= 200, workers= 4)
modelonly.build_vocab(trainonly)
modelonly.train(trainonly,total_examples=modelonly.corpus_count,epochs=modelonly.epochs)

In [4]:
pos_vec = modeltags.docvecs['1']
pos_vec = pos_vec/np.linalg.norm(pos_vec)
neg_vec = modeltags.docvecs['0']
neg_vec = neg_vec/np.linalg.norm(neg_vec)

In [5]:
inner_true = []
inner_vecs = []
for i in train:
    vec = modeltags.infer_vector(i[0])
    vec = vec/np.linalg.norm(vec)
    inner_true.append(i[1][1])
    inner_vecs.append(vec)


inner_modelonly_vecs = []
for i in trainonly:
    vec = modelonly.infer_vector(i[0])
    vec = vec/np.linalg.norm(vec)
    inner_modelonly_vecs.append(vec)

In [6]:
test = []
testonly = []
for i, (xline,yline) in enumerate(zip(x_test,y_test)):
    test.append(TaggedDocument(xline.split(),[i,yline]))
    testonly.append(TaggedDocument(xline.split(),[i]))



outer_diff = [] 
outer_true = []
outer_vecs = []
for i in test:
    vec = modeltags.infer_vector(i[0])
    vec = vec/np.linalg.norm(vec)
    outer_diff.append(np.dot(vec,pos_vec)-np.dot(vec,neg_vec))
    outer_true.append(i[1][1])
    outer_vecs.append(vec)
    

outer_modelonly_vecs = []
for i in testonly:
    vec = modelonly.infer_vector(i[0])
    vec = vec/np.linalg.norm(vec)
    outer_modelonly_vecs.append(vec)


clf = LogisticRegression()
clf.fit(inner_vecs,inner_true)
outer_prob = clf.predict_proba(outer_vecs)[:,1]

clf1 = LogisticRegression()
clf1.fit(inner_modelonly_vecs,inner_true)
outer_modelonly_prob = clf1.predict_proba(outer_modelonly_vecs)[:,1]

In [7]:
from gensim.models.word2vec import Word2Vec
train_words = [i.words for i in train]

modelwords = Word2Vec(size=100, min_count=3, workers= 4)
modelwords.build_vocab(train_words)
modelwords.train(train_words,total_examples=modelwords.corpus_count,epochs=modelwords.epochs)

(67621, 116150)

In [8]:
innertwitter_vecs = []
for tweet in train_words:
    vec = np.zeros(shape=(100,))
    for word in tweet:
        counter = 0
        if word in modelwords.wv:
            vec += modelwords.wv[word]
            counter += 1
    if counter > 1:
        vec = vec/counter
        vec = vec/np.linalg.norm(vec)      
    innertwitter_vecs.append(vec)

test_words = [i.words for i in test]

outertwitter_vecs = []
for tweet in test_words:
    vec = np.zeros(shape=(100,))
    for word in tweet:
        counter = 0
        if word in modelwords.wv:
            vec += modelwords.wv[word]
            counter += 1
    if counter > 1:
        vec = vec/counter
        vec = vec/np.linalg.norm(vec)      
    outertwitter_vecs.append(vec)

In [9]:
clf2 = LogisticRegression(max_iter=10000)
clf2.fit(innertwitter_vecs,inner_true)
outertwitter_prob = clf2.predict_proba(outertwitter_vecs)[:,1]

In [10]:
print("TagsDotProduct: ",average_precision_score(outer_true,outer_diff,pos_label='1'))
print("TagsLR:", average_precision_score(outer_true,outer_prob,pos_label='1'))
print("TaglessLR: ", average_precision_score(outer_true,outer_modelonly_prob,pos_label='1'))
print("VectorMean: ", average_precision_score(outer_true,outertwitter_prob,pos_label='1'))

tags:  0.6624093539394535
tagsLR: 0.6783857926837931
taglessLR:  0.6315850012170042
meanvect:  0.4373201141185392
