# Detecting Stance in Tweets

**Reading the dataset and pre-processing**

In [3]:
from Model import Model as DataModel

dataset = []
# for filename in ["../dataset_raw/semeval2016-task6-trialdata.txt", "../dataset_raw/semeval2016-task6-trainingdata.txt"]:
#     f = open(filename, 'r')
#     f.readline() # Skip the first line which contains the title

#     for line in f.readlines():
#         items = line.strip().split('\t')
#         t = DataModel(items[0] , items[1], items[2], items[3])
#         dataset.append(t)

targets_small_hack = ['atheism', 'climate change is a real concern', 'feminist movement', 'legalization of abortion', 'hillary clinton']
targets_skip_index = [1, 6, 3, 4, 3]
targets_real =  ['Atheism', 'Climate Change is a Real Concern', 'Feminist Movement', 'Legalization of Abortion', 'Hillary Clinton']

f = open("../final.txt", 'r')
for line in f.readlines():
    items = line.strip().split('\t')
    
    for t in xrange(5):
        if items[0].startswith(targets_small_hack[t]):
            model = DataModel(None, targets_real[t], items[0][targets_skip_index[t]:], items[-1])
            dataset.append(model)
            
targets = list(set(map(lambda model:model.target, dataset)))
stances = list(set(map(lambda model: model.stance, dataset)))

print "Stances = ", stances
print "Targets = ", targets

Stances =  ['FAVOR', 'NONE', 'AGAINST']
Targets =  ['Atheism', 'Legalization of Abortion', 'Feminist Movement', 'Climate Change is a Real Concern', 'Hillary Clinton']


**Reading the Stanford GloVe word embeddings pre-trained on Twitter data**

In [4]:
import numpy as np

# Path of the GloVe Twitter embeddings text file passed to readGloveData below
# (relative path; the "200d" in the file name presumably means 200-dimensional vectors).
glove_word_vec_file = "../glove.twitter.27B/glove.twitter.27B.200d.txt"

def readGloveData(glove_word_vec_file):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is read as a token followed by its vector components.

    Parameters
    ----------
    glove_word_vec_file : str
        Path of the embeddings file.

    Returns
    -------
    dict
        Maps each token to a 1-D float numpy array holding its components.
        If a token appears more than once, the last occurrence wins.

    Raises
    ------
    IOError
        If the file cannot be opened.
    ValueError
        If a component cannot be converted to float.
    """
    word_vec_dict = {}
    # Stream the file rather than calling readlines(): the raw lines are never
    # all held in memory at once, and `with` closes the handle on any exit path.
    with open(glove_word_vec_file, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            word_vec_dict[fields[0]] = np.array(fields[1:], dtype=float)

    return word_vec_dict
            
# Load the embeddings once into a module-level dict; getWordVector and
# getSumVectors below read this global directly.
word_vec_dict = readGloveData(glove_word_vec_file)

def getWordVector(word):
    """Return the embedding stored for `word` in the global `word_vec_dict`.

    Words missing from the dict yield a zero vector with the same shape and
    dtype as the entry for 'hi'.
    """
    try:
        return word_vec_dict[word]
    except KeyError:
        # Unknown word: fall back to zeros shaped like a known entry.
        return np.zeros_like(word_vec_dict['hi'])

def getSumVectors(tweetData):
    """Average the embeddings of the tokens in a tweet.

    Parameters
    ----------
    tweetData : iterable of str
        Tokens of one tweet.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors returned by getWordVector over the tokens that have
        a non-zero vector. If no token has one, the result is the zero vector.
        The shape matches the entry for 'hi' in `word_vec_dict`.
    """
    numNonZero = 0
    vector = np.zeros_like(word_vec_dict['hi'])

    for word in tweetData:
        vec = getWordVector(word)
        # Use any() rather than sum() != 0: the components of a genuine
        # embedding can cancel to a zero sum, which would wrongly leave the
        # word out of the averaging denominator.
        if vec.any():
            vector = vector + vec
            numNonZero += 1

    if numNonZero:
        # Divide by the number of contributing words, not the tweet length.
        vector = vector / numNonZero

    return vector

**Preprocess tweets according to various heuristics**

In [6]:
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain, imap

# Tokenizer configured to drop @handles and lowercase the text.
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)
# The NLTK stopword file id is lowercase "english"; "English" only resolves on
# case-insensitive filesystems.
# NOTE: this rebinds the imported `stopwords` module name to a plain list of words.
stopwords = stopwords.words("english")
# Also filter these symbol tokens.
stopwords.extend(['#', ',', '+'])

def processHashTags(hashtag):
    """Return the text after the first '#' in `hashtag` as a one-element list.

    A token that contains no '#' yields [''].
    """
    return [hashtag.partition('#')[2]]

def transformTweetData(tweet):
    """Turn a tweet record into a list of filtered tokens.

    The text in `tweet.tweet_content` is converted with
    `unicode(..., errors='ignore')` and split on whitespace. For every word that
    starts with '#', the output of processHashTags is appended to the word list.
    The joined text is then tokenized with `tknzr`, and tokens found in
    `stopwords` are dropped.
    """
    raw_words = unicode(tweet.tweet_content, errors='ignore').strip().split()

    # Hashtag text is appended after the original words, so it appears twice
    # in the text handed to the tokenizer.
    hashtag_words = [
        extra
        for candidate in raw_words
        if candidate.startswith('#')
        for extra in processHashTags(candidate)
    ]

    augmented_text = " ".join(raw_words + hashtag_words)
    return [tok for tok in tknzr.tokenize(augmented_text) if tok not in stopwords]

In [15]:
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.externals import joblib

from itertools import chain

for target in targets:
    print "Target = ", target
    currentDataset = [d for d in dataset if d.target == target or d.stance == "NONE"]

    le = LabelEncoder()
    mapping = {"NONE": "NONE", "FAVOR": "RELEVANT", "AGAINST": "RELEVANT"}
    le.fit([mapping[model.stance] for model in currentDataset])

    data = list(map(transformTweetData, currentDataset))
    classes = le.transform([mapping[model.stance] for model in currentDataset])
    print set(classes)

    feature_array = np.asarray([getSumVectors(d) for d in data])
    
    class_array = np.asarray(classes)

    print feature_array.shape, class_array.shape
    print le.classes_

    skf = cross_validation.StratifiedShuffleSplit(class_array, n_iter=5, test_size=0.3)

    idx = 0
    average_accuracy = np.zeros(5)
    average_f1 = np.zeros(5)
    
    for train, test in skf:
        clf = SVC(kernel='linear')
        X_train = feature_array[np.array(train)]
        Y_train = class_array[np.array(train)] 
    
        X_test = feature_array[np.array(test)]
        Y_test = class_array[np.array(test)]
    
        clf.fit(X_train, Y_train)
        predict = clf.predict(X_test)
    
        print "Fold /", idx + 1, ", Training Set /", Y_train.shape, " ", np.sum(Y_train), ", Test Set /", Y_test.shape, " ", np.sum(Y_test)
        print metrics.confusion_matrix(Y_test, predict), metrics.accuracy_score(Y_test, predict), metrics.f1_score(Y_test, predict)
        
        average_accuracy[idx] = metrics.accuracy_score(Y_test, predict)
        average_f1[idx] = metrics.f1_score(Y_test, predict)
        
        idx += 1
    
    
    print "Average Accuracy =", np.mean(average_accuracy)
    print "Average F-1 =", np.mean(average_f1)
    target_short = target.split()[0]
    joblib.dump(le, "Part1_VectorSum_Clf/Part1_VectorSumLabelEncoder_Target_" + target_short + ".pkl")
    joblib.dump(clf, "Part1_VectorSum_Clf/Part1_VectorSum_Target_" + target_short + ".pkl")

Target =  Atheism
Stance =  NONE
set([0, 1])
(1162, 200) (1162,)
Stance =  NONE [False  True]
Fold / 1 , Training Set / (813,)   536 , Test Set / (349,)   230
[[104  15]
 [  8 222]] 0.934097421203 0.950749464668
Fold / 2 , Training Set / (813,)   536 , Test Set / (349,)   230
[[115   4]
 [ 17 213]] 0.939828080229 0.953020134228
Fold / 3 , Training Set / (813,)   536 , Test Set / (349,)   230
[[107  12]
 [ 19 211]] 0.9111747851 0.931567328918
Fold / 4 , Training Set / (813,)   536 , Test Set / (349,)   230
[[111   8]
 [ 23 207]] 0.9111747851 0.930337078652
Fold / 5 , Training Set / (813,)   536 , Test Set / (349,)   230
[[108  11]
 [ 15 215]] 0.925501432665 0.94298245614
Average Accuracy = 0.92435530086
Average F-1 = 0.941731292521


AttributeError: 'module' object has no attribute 'dumps'

In [16]:
from sklearn.externals import joblib
# Bare expression so the notebook displays the object: confirms that joblib
# exposes a `dump` function.
joblib.dump

<function sklearn.externals.joblib.numpy_pickle.dump>