In [1]:
#Imports
import json
import nltk
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC, SVR, LinearSVR
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from cleantext import *


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annafernandezrajal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Load models Sentiment
nltk.download('vader_lexicon')
nltk.download('punkt')
sentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/annafernandezrajal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/annafernandezrajal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Settings
pd.set_option('display.width', 1000)

In [3]:
#Load data
twitter_data = pd.read_csv('dataset/train.csv', encoding="unicode_escape", lineterminator='\r')

twitter_test = pd.read_csv('dataset/test.csv', encoding="unicode_escape", lineterminator='\r')
twitter_data.head(10)

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos
5,Hillary can not win. Here's hoping the Dems of...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
6,Respect FOR the law and respect BY the law Yes...,Hillary Clinton,NONE,2. The tweet does NOT expresses opinion about ...,pos
7,I don't want to be appointed to an Ambassador ...,Hillary Clinton,NONE,2. The tweet does NOT expresses opinion about ...,neg
8,#StopHillary2016 @HillaryClinton if there was ...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
9,@HillaryClinton End lawless #ClintonFoundation...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg


In [4]:
#Calculate Sentiment
def calculateSentiment(text):
    return sentimentIntensityAnalyzer.polarity_scores(text)['compound']


def sentiment(dataframe ,results):
    result = []
    for value in results:
        if value > 0.61:
            result.append("pos")
        else:
            result.append("neg")
    dataframe['sentimentVader'] = result

results = twitter_data['Tweet'].apply(calculateSentiment)
twitter_data['sentimentVader'] = results
# sentiment(twitter_data, results)


# print("Accuracy score: ", accuracy_score(twitter_data['Sentiment'] , twitter_data['sentimentVader'] ))

display(twitter_data)

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment,sentimentVader
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,0.4019
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos,0.8126
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,0.3612
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,-0.5267
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos,0.3612
...,...,...,...,...,...,...
2909,"There's a law protecting unborn eagles, but no...",Legalization of Abortion,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,0.1139
2910,I am 1 in 3... I have had an abortion #Abortio...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,other,0.0000
2911,How dare you say my sexual preference is a cho...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg,0.0000
2912,"Equal rights for those 'born that way', no rig...",Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg,-0.2960


In [5]:
# Calculate Stance Detection

training_pre = pd.read_csv('dataset/train.csv', encoding="unicode_escape", lineterminator='\r')
test_pre = pd.read_csv('dataset/test.csv', encoding="unicode_escape", lineterminator='\r')


def tfidf(training, test):
    training['text_clean'] = clean_text(training, 'Tweet')
    test['text_clean'] = clean_text(test, 'Tweet')

    all_text = training['text_clean'].values.tolist() + test['text_clean'].values.tolist()
    vocab = flatten_words(all_text, get_unique=True)
    tfidf = TfidfVectorizer(stop_words='english', vocabulary=vocab)
    training_matrix = tfidf.fit_transform(training.text_clean)
    test_matrix = tfidf.fit_transform(test.text_clean)
    training.drop('Target', inplace=True, axis=1)
    label_train = training["Stance"] 
    training.drop('Stance', inplace=True, axis=1)
    training.drop('Opinion Towards', inplace=True, axis=1)
    training.drop('Sentiment', inplace=True, axis=1)
    test.drop('Target', inplace=True, axis=1)
    label_test = test["Stance"] 
    test.drop('Stance', inplace=True, axis=1)
    test.drop('Opinion Towards', inplace=True, axis=1)
    test.drop('Sentiment', inplace=True, axis=1)

    training = features(training)
    training = pd.concat([training, pd.DataFrame(training_matrix.todense())], axis=1)

    test = features(test)
    test = pd.concat([test, pd.DataFrame(test_matrix.todense())], axis=1)

    return training, test, label_train, label_test


#Pre-processinf TF-IDF

training, test, label_train, label_test = tfidf(training_pre, test_pre)


#svm= SVC(kernel = "poly", degree=4)
svm = LinearSVC(dual=False, max_iter=5000)
svm_reg = LinearSVR(dual=True, max_iter=5000)
features = training.columns[2:]
X = training[features].values

def enconde(label_train):
    label_encoder = LabelEncoder()
    label_encoded = label_encoder.fit_transform(label_train)
    return label_encoded

svm.fit(X, label_train)
label_encoded = enconde(label_train)
svm_reg.fit(X, label_encoded)

features_tr = training[features].values

print(features_tr)

predicted_svm = svm.predict(features_tr)
predicted_reg = svm_reg.predict(features_tr)
print("Accuracy score SVM: ", accuracy_score(label_train, predicted_svm))
#print("Accuracy score SVR: ", accuracy_score(label_encoded, predicted_reg))


twitter_data["StanceSVM"] = predicted_reg
twitter_data.head()




[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 2. ... 0. 0. 0.]]
Accuracy score SVM:  0.9979409746053535


Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment,sentimentVader,StanceSVM
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,0.4019,-0.004432
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos,0.8126,0.998178
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,0.3612,0.115772
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,-0.5267,-0.004232
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos,0.3612,1.999076


In [6]:
## TESTING

#Sentiment
results = twitter_test['Tweet'].apply(calculateSentiment)
results_regression_sent = results.copy()
sentiment(twitter_test, results)

#Stance
features_dev = test[features].values
dev_predicted = svm.predict(features_dev)

dev_predicted_reg = svm_reg.predict(features_dev)

twitter_test["StanceSVM"] = dev_predicted


sentiment_score = accuracy_score(twitter_test["Sentiment"], twitter_test["sentimentVader"])
print("Sentiment Accuracy on test:", sentiment_score)

stance_score = accuracy_score(twitter_test["Stance"], twitter_test["StanceSVM"])
print("Stance Accuracy on test:", stance_score)

Sentiment Accuracy on test: 0.6958077709611452
Stance Accuracy on test: 0.523517382413088


In [7]:
twitter_test.head()
twitter_test.to_csv('output_test_classification.csv')
print("saved")

saved


In [8]:
twitter_data.to_csv('output_train_regression.csv')
print("saved")

saved


In [10]:
#Sentiment
twitter_test["sentimentVader"] = results_regression_sent

#Stance
twitter_test["StanceSVM"] = dev_predicted_reg

twitter_test.head()
twitter_test.to_csv('output_test_regression.csv')
print("saved")


saved
