# Detecting Stance in Tweets

**Reading the dataset and pre-processing**

In [1]:
from Model import Model as DataModel

dataset = []
for filename in ["../dataset_raw/semeval2016-task6-trialdata.txt", "../dataset_raw/semeval2016-task6-trainingdata.txt"]:
    f = open(filename, 'r')
    f.readline() # Skip the first line which contains the title

    for line in f.readlines():
        items = line.strip().split('\t')
        t = DataModel(items[0] , items[1], items[2], items[3])
        dataset.append(t)

targets = list(set(map(lambda model:model.target, dataset)))
stances = list(set(map(lambda model: model.stance, dataset)))

print "Stances = ", stances
print "Targets = ", targets

Stances =  ['FAVOR', 'NONE', 'AGAINST']
Targets =  ['Atheism', 'Climate Change is a Real Concern', 'Feminist Movement', 'Legalization of Abortion', 'Hillary Clinton']


**Reading the Stanford GloVe word embeddings pre-trained on Twitter data**

In [2]:
import numpy as np

# Path to the pre-trained GloVe Twitter embedding file, relative to the notebook.
# The file name suggests the 200-dimensional vectors of the "27B" release.
glove_word_vec_file = "../glove.twitter.27B/glove.twitter.27B.200d.txt"

def readGloveData(glove_word_vec_file):
    """Load a GloVe text file into a dictionary of word vectors.

    Each non-empty line is expected to be whitespace-separated: the first
    field is the word and the remaining fields are its vector components.

    Parameters
    ----------
    glove_word_vec_file : str
        Path of the embedding file to read.

    Returns
    -------
    dict
        Maps each word to a 1-D float numpy array. If a word occurs on
        several lines, the last occurrence wins.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be converted to float.
    """
    word_vec_dict = {}
    # Stream the file line by line: readlines() would keep the entire
    # embedding file in memory next to the dictionary being built. The
    # context manager also guarantees the handle is closed.
    with open(glove_word_vec_file, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            word_vec_dict[fields[0]] = np.array(fields[1:], dtype=float)

    return word_vec_dict
            
# Load all embeddings once; getWordVector and getSumVectors read this
# module-level dictionary.
word_vec_dict = readGloveData(glove_word_vec_file)

def getWordVector(word):
    """Return the embedding stored for `word` in `word_vec_dict`.

    A word missing from the dictionary yields an all-zero vector with the
    same shape and dtype as the entry stored under 'hi'.
    """
    try:
        return word_vec_dict[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to a zero vector.
        return np.zeros_like(word_vec_dict['hi'])

def getSumVectors(tweetData):
    """Return the mean embedding of the in-vocabulary words of a tweet.

    Despite the name, the summed vectors are divided by the number of words
    found in `word_vec_dict`, so the result is an average.

    Parameters
    ----------
    tweetData : iterable of str
        Tokens of one tweet.

    Returns
    -------
    numpy.ndarray
        The average of the embeddings of the known tokens, or an all-zero
        vector (shaped like the entry for 'hi') when no token is known or
        the tweet is empty.
    """
    vector = np.zeros_like(word_vec_dict['hi'])
    numKnown = 0

    for word in tweetData:
        # Count a word by vocabulary membership. Testing `vec.sum() != 0`
        # would miss a known word whose components happen to cancel out,
        # which would skew (or entirely skip) the division below.
        if word in word_vec_dict:
            vector = vector + word_vec_dict[word]
            numKnown += 1

    if numKnown:
        vector = vector / numKnown

    return vector

**Preprocess tweets according to various heuristics**

In [93]:
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain, imap

target = targets[3]
print "Target = ", target
currentDataset = [d for d in dataset if d.target == target or d.stance == "NONE"]

mainStance = stances[1]
print "Stance = ", mainStance

le = LabelEncoder()
target = le.fit([model.stance == mainStance for model in currentDataset])

tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
stopwords = stopwords.words("English")
stopwords.extend(['#', ','])

def processHashTags(hashtag):
    """Return a one-element list with the text following the first '#'.

    The element is the empty string when `hashtag` contains no '#' or
    nothing follows it.
    """
    return [hashtag.partition('#')[2]]

def transformTweetData(tweet):
    """Turn one tweet record into a list of filtered tokens.

    The tweet's `tweet_content` is decoded to unicode (undecodable bytes are
    dropped) and split on whitespace. For every word starting with '#', the
    result of processHashTags is appended as extra words. The combined text
    is tokenised with `tknzr`, and tokens found in `stopwords` are removed.
    """
    raw_words = unicode(tweet.tweet_content, errors='ignore').strip().split()

    # Extra words derived from the hashtags, in order of appearance.
    hashtag_words = [
        extra
        for raw in raw_words
        if raw.startswith('#')
        for extra in processHashTags(raw)
    ]

    joined = " ".join(raw_words + hashtag_words)
    return [tok for tok in tknzr.tokenize(joined) if tok not in stopwords]
    
data = list(map(transformTweetData, currentDataset))
classes = le.transform([model.stance == mainStance for model in currentDataset])
print set(classes)

Target =  Legalization of Abortion
Stance =  NONE
set([0, 1])


In [94]:
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from itertools import chain

#featureEncoder = MultiLabelBinarizer(classes = set(chain.from_iterable(data)))
#feature_array = np.asarray([reduce(sum, map(featureEncoder.transform, d)) for d in data])
feature_array = np.asarray([getSumVectors(d) for d in data])

#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,)
#feature_array = vectorizer.fit_transform([" ".join(d) for d in data])
    
class_array = np.asarray(classes)

print feature_array.shape, class_array.shape
print "Stance = ", mainStance, le.classes_

clf = SVC(kernel='linear')
skf = cross_validation.StratifiedShuffleSplit(class_array, n_iter=7, test_size=0.3)

idx = 1
for train, test in skf:
    X_train = feature_array[np.array(train)]
    Y_train = class_array[np.array(train)] 
    
    X_test = feature_array[np.array(test)]
    Y_test = class_array[np.array(test)]
    
    clf.fit(X_train, Y_train)
    predict = clf.predict(X_test)
    
    print "Fold /", idx, ", Training Set /", Y_train.shape, " ", np.sum(Y_train), ", Test Set /", Y_test.shape, " ", np.sum(Y_test)
    print metrics.confusion_matrix(Y_test, predict), metrics.accuracy_score(Y_test, predict), metrics.f1_score(Y_test, predict)
    idx += 1


(1242, 200) (1242,)
Stance =  NONE [False  True]
Fold / 1 , Training Set / (869,)   536 , Test Set / (373,)   230
[[121  22]
 [ 30 200]] 0.860589812332 0.884955752212
Fold / 2 , Training Set / (869,)   536 , Test Set / (373,)   230
[[117  26]
 [ 25 205]] 0.86327077748 0.889370932755
Fold / 3 , Training Set / (869,)   536 , Test Set / (373,)   230
[[113  30]
 [ 29 201]] 0.8418230563 0.872017353579
Fold / 4 , Training Set / (869,)   536 , Test Set / (373,)   230
[[116  27]
 [ 36 194]] 0.83109919571 0.860310421286
Fold / 5 , Training Set / (869,)   536 , Test Set / (373,)   230
[[105  38]
 [ 23 207]] 0.836461126005 0.871578947368
Fold / 6 , Training Set / (869,)   536 , Test Set / (373,)   230
[[116  27]
 [ 27 203]] 0.855227882038 0.882608695652
Fold / 7 , Training Set / (869,)   536 , Test Set / (373,)   230
[[113  30]
 [ 25 205]] 0.85254691689 0.881720430108


In [118]:
# Dot product between the fifth-from-last support vector of the fitted SVM
# and the embedding of the word "climate".
print np.dot(clf.support_vectors_[-5], getWordVector("climate"))

9.04856098309


(239, 200)

In [113]:
# Decision-function value the fitted SVM assigns to its own last support vector.
print clf.decision_function([clf.support_vectors_[-1]])

[ 1.00033653]
