# Detecting Stance in Tweets

**Reading the dataset and pre-processing**

In [1]:
from Model import Model as DataModel

dataset = []

targets_small_hack = ['atheism', 'climate change is a real concern', 'feminist movement', 'legalization of abortion', 'hillary clinton']
targets_skip_index = [1, 6, 3, 4, 3]
targets_real =  ['Atheism', 'Climate Change is a Real Concern', 'Feminist Movement', 'Legalization of Abortion', 'Hillary Clinton']

f = open("../final.txt", 'r')
for line in f.readlines():
    items = line.strip().split('\t')
    
    for t in xrange(5):
        if items[0].startswith(targets_small_hack[t]):
            model = DataModel(None, targets_real[t], items[0][targets_skip_index[t]:], items[-1])
            dataset.append(model)
            
targets = list(set(map(lambda model:model.target, dataset)))
stances = list(set(map(lambda model: model.stance, dataset)))

print "Stances = ", stances
print "Targets = ", targets

Stances =  ['FAVOR', 'NONE', 'AGAINST']
Targets =  ['Atheism', 'Legalization of Abortion', 'Feminist Movement', 'Climate Change is a Real Concern', 'Hillary Clinton']


**Reading the Stanford GloVe word embeddings trained on Twitter data**

In [2]:
import numpy as np

# Relative path to the GloVe Twitter embeddings text file (the 200d variant,
# going by the filename); it is passed to readGloveData below.
glove_word_vec_file = "../glove.twitter.27B/glove.twitter.27B.200d.txt"

def readGloveData(glove_word_vec_file):
    """Load word vectors from a whitespace-separated embeddings text file.

    Every non-blank line is expected to contain a token followed by its
    vector components, all separated by whitespace.

    Parameters
    ----------
    glove_word_vec_file : str
        Path of the embeddings text file.

    Returns
    -------
    dict
        Maps each token (first field of a line) to a 1-D numpy float array
        built from the remaining fields. If a token appears on several
        lines, the last one wins.

    Raises
    ------
    IOError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_vec_dict = {}

    # The context manager closes the handle even when parsing raises, and
    # iterating over it streams the file line by line instead of keeping
    # every raw line in memory next to the parsed vectors.
    with open(glove_word_vec_file, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word_vec_dict[fields[0]] = np.array(fields[1:], dtype=float)

    return word_vec_dict
            
# Load all embeddings once into a module-level dict; getWordVector and
# getSumVectors below read this global directly.
word_vec_dict = readGloveData(glove_word_vec_file)

def getWordVector(word):
    """Return the embedding stored for ``word`` in the global ``word_vec_dict``.

    When the word has no entry, an all-zero array is returned instead; its
    shape and dtype are copied from the vector stored under the key 'hi'.
    """
    try:
        return word_vec_dict[word]
    except KeyError:
        # Unknown word: fall back to zeros shaped like a known vector.
        return np.zeros_like(word_vec_dict['hi'])

def getSumVectors(tweetData):
    """Average the embeddings of the words in ``tweetData``.

    Despite the name, the result is a mean, not a plain sum: the vectors
    returned by ``getWordVector`` are added up and the total is divided by
    the number of words whose vector is non-zero. Words that resolve to an
    all-zero vector therefore do not dilute the average.

    Parameters
    ----------
    tweetData : iterable
        Words (tokens) to look up via ``getWordVector``.

    Returns
    -------
    numpy.ndarray
        The averaged vector, or an all-zero vector (shaped like the entry
        stored under 'hi' in ``word_vec_dict``) when no word contributed a
        non-zero vector.
    """
    numNonZero = 0
    vector = np.zeros_like(word_vec_dict['hi'])

    for word in tweetData:
        vec = getWordVector(word)
        vector = vector + vec
        # Test for any non-zero component rather than a non-zero sum: a real
        # embedding whose components cancel out (e.g. [1, -1]) must still be
        # counted, otherwise the divisor below is too small.
        if vec.any():
            numNonZero += 1

    if numNonZero:
        vector = vector / numNonZero

    return vector

**Preprocessing tweets: hashtag expansion, tokenization and stopword removal**

In [3]:
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain, imap

# Tokenizer configured to strip @handles and lower-case tokens
# (strip_handles=True, preserve_case=False).
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)

# NLTK stores its stopword lists under lowercase file ids; "English" only
# resolves on case-insensitive filesystems, so the lowercase id is used.
# NOTE: this rebinds `stopwords` from the imported corpus reader to a plain
# list, so these lines cannot be re-run without re-running the import above.
stopwords = stopwords.words("english")
# Also drop a few punctuation tokens.
stopwords.extend(['#', ',', '+'])

def processHashTags(hashtag):
    """Return a one-element list holding the text after the first '#'.

    If ``hashtag`` contains no '#', the single element is an empty string.
    """
    marker = hashtag.find('#')
    # With no '#', slice from the end so the result is an empty string.
    start = marker + 1 if marker >= 0 else len(hashtag)
    return [hashtag[start:]]

def transformTweetData(tweet):
    """Turn a tweet object into a list of filtered tokens.

    The tweet's ``tweet_content`` is converted with ``unicode(...,
    errors='ignore')`` and split on whitespace. Every word starting with '#'
    is run through ``processHashTags`` and the results are appended after the
    original words. The joined text is tokenized with the global ``tknzr``,
    and tokens found in the global ``stopwords`` are dropped.

    Returns the remaining tokens in order.
    """
    raw_text = unicode(tweet.tweet_content, errors='ignore')
    pieces = raw_text.strip().split()

    # Hashtags stay in place; their '#'-stripped forms are added at the end.
    hashtag_words = [
        extra
        for piece in pieces
        if piece.startswith('#')
        for extra in processHashTags(piece)
    ]

    joined = " ".join(pieces + hashtag_words)
    return [tok for tok in tknzr.tokenize(joined) if tok not in stopwords]

In [19]:
%load_ext autoreload
%autoreload 2

from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.externals import joblib

from itertools import chain
from sentimentAnalysis import *

for target in targets:
    print "Target = ", target
    currentDataset = [d for d in dataset if d.target == target and d.stance != "NONE"]

    part_2_le = LabelEncoder()
    part_2_le.fit(["FAVOR", "AGAINST"])
    
    data = list(map(transformTweetData, currentDataset))

    classes = part_2_le.transform([model.stance for model in currentDataset])
    
    feature_array = np.asarray(getFeatures(data, word_vec_dict))
    class_array = np.asarray(classes)

    print feature_array.shape, class_array.shape

    clf = SVC(kernel='linear')
    predict = cross_validation.cross_val_predict(clf, feature_array, class_array)
    
    print metrics.accuracy_score(class_array, predict)
    print metrics.f1_score(class_array, predict)
    print metrics.confusion_matrix(class_array, predict)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Target =  Atheism
(396, 602) (396,)
0.777777777778
0.52688172043
[[259  45]
 [ 43  49]]
Target =  Legalization of Abortion
(476, 602) (476,)
0.720588235294
0.494296577947
[[278  77]
 [ 56  65]]
Target =  Feminist Movement
(538, 602) (538,)
0.620817843866
0.529953917051
[[219 109]
 [ 95 115]]
Target =  Climate Change is a Real Concern
(227, 602) (227,)
0.916299559471
0.955916473318
[[  2  13]
 [  6 206]]
Target =  Hillary Clinton
(511, 602) (511,)
0.686888454012
0.416058394161
[[294  99]
 [ 61  57]]
