# Detecting Stance in Tweets

**Reading the dataset and pre-processing**

In [None]:
from Model import Model as DataModel

dataset = []
# for filename in ["../dataset_raw/semeval2016-task6-trialdata.txt", "../dataset_raw/semeval2016-task6-trainingdata.txt"]:
#     f = open(filename, 'r')
#     f.readline() # Skip the first line which contains the title

#     for line in f.readlines():
#         items = line.strip().split('\t')
#         t = DataModel(items[0] , items[1], items[2], items[3])
#         dataset.append(t)

targets_small_hack = ['atheism', 'climate change is a real concern', 'feminist movement', 'legalization of abortion', 'hillary clinton']
targets_skip_index = [1, 6, 3, 4, 3]
targets_real =  ['Atheism', 'Climate Change is a Real Concern', 'Feminist Movement', 'Legalization of Abortion', 'Hillary Clinton']

f = open("../final.txt", 'r')
for line in f.readlines():
    items = line.strip().split('\t')
    
    for t in xrange(5):
        if items[0].startswith(targets_small_hack[t]):
            model = DataModel(None, targets_real[t], items[0][targets_skip_index[t]:], items[-1])
            dataset.append(model)
            
targets = list(set(map(lambda model:model.target, dataset)))
stances = list(set(map(lambda model: model.stance, dataset)))

print "Stances = ", stances
print "Targets = ", targets

**Preprocess tweets according to various heuristics**

In [None]:
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain, imap

# Shared tokenizer used by transformTweetData: strips @handles and lowercases tokens.
tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)

# NLTK's stopword lists are addressed by lowercase file id ("english"); the capitalised
# spelling only resolves on case-insensitive filesystems.
# Note: this rebinds `stopwords` from the imported corpus reader to a plain list, so the
# import above must be re-run before this line is executed a second time.
stopwords = stopwords.words("english")
# Also treat these punctuation tokens as stopwords.
stopwords.extend(['#', ',', '+'])

def processHashTags(hashtag):
    """Return a one-element list holding the text that follows the first '#'.

    If `hashtag` contains no '#', the single element is the empty string.
    Any further '#' characters after the first are kept as part of the result.
    """
    # partition() yields (before, separator, after); index 2 is the "after" part.
    return [hashtag.partition('#')[2]]

def transformTweetData(tweet):
    """Turn a tweet object into a list of stopword-free tokens.

    Reads `tweet.tweet_content`, decodes it to unicode (undecodable bytes are
    dropped), appends the bare word of every whitespace-separated item that
    starts with '#', tokenizes the result with the module-level `tknzr`, and
    removes every token found in the module-level `stopwords` list.
    """
    raw_words = unicode(tweet.tweet_content, errors='ignore').strip().split()

    # Hashtag words are added in addition to the original '#word' items,
    # so the word itself appears as an extra feature.
    hashtag_words = [
        extra
        for word in raw_words
        if word.startswith('#')
        for extra in processHashTags(word)
    ]

    augmented = " ".join(raw_words + hashtag_words)
    return [tok for tok in tknzr.tokenize(augmented) if tok not in stopwords]

In [None]:
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib

import numpy as np
from itertools import chain

for target in targets:
    print "Target = ", target
    currentDataset = [d for d in dataset if d.target == target or d.stance == "NONE"]

    le = LabelEncoder()
    mapping = {"NONE": "NONE", "FAVOR": "RELEVANT", "AGAINST": "RELEVANT"}
    le.fit([mapping[model.stance] for model in currentDataset])

    data = list(map(transformTweetData, currentDataset))
    classes = le.transform([mapping[model.stance] for model in currentDataset])
    print set(classes)

    # Stopwords are already cleaned from the corpus
    cvect = CountVectorizer(stop_words=None, binary=True, min_df=3, 
                            max_df=9000, analyzer='word', 
                            tokenizer=None)
    
    feature_array = cvect.fit_transform([" ".join(d) for d in data]).toarray()
    
    class_array = np.asarray(classes)

    print feature_array.shape, class_array.shape
    print le.classes_

    skf = cross_validation.StratifiedShuffleSplit(class_array, n_iter=5, test_size=0.3)

    idx = 0
    average_accuracy = np.zeros(5)
    average_f1 = np.zeros(5)
    
    for train, test in skf:
        clf = SVC(kernel='linear')
        X_train = feature_array[np.array(train)]
        Y_train = class_array[np.array(train)] 
    
        X_test = feature_array[np.array(test)]
        Y_test = class_array[np.array(test)]
    
        clf.fit(X_train, Y_train)
        predict = clf.predict(X_test)
    
        print "Fold /", idx + 1, ", Training Set /", Y_train.shape, " ", np.sum(Y_train), ", Test Set /", Y_test.shape, " ", np.sum(Y_test)
        print metrics.confusion_matrix(Y_test, predict), metrics.accuracy_score(Y_test, predict), metrics.f1_score(Y_test, predict)
        
        average_accuracy[idx] = metrics.accuracy_score(Y_test, predict)
        average_f1[idx] = metrics.f1_score(Y_test, predict)
        
        idx += 1
    
    print "Average Accuracy =", np.mean(average_accuracy)
    print "Average F-1 =", np.mean(average_f1)
    target_short = target.split()[0]
    joblib.dump(le, "Part1_OneHot_Dump/Part1_OneHotLabelEncoder_Target_" + target_short + ".pkl")
    joblib.dump(clf, "Part1_OneHot_Dump/Part1_OneHot_Target_" + target_short + ".pkl")
    joblib.dump(cvect, "Part1_OneHot_Dump/Part1_OneHotVectorizer_Target_" + target_short + ".pkl")