In [1]:
import numpy as np
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
from collections import Counter
import codecs
from nltk.corpus import stopwords
import nltk
import csv
import random
import time
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
raw_dataframe = pd.read_csv("../data/Labelled_tweets_v1.csv")
raw_dataframe.head()

Unnamed: 0,Tweet,UserHandle,Party,Issue,Stance
0,a no of people approach me daily worried abt t...,ArvindKejriwal,AAP,GST,Disagreement
1,its now revealed that our fms silence on the p...,ArvindKejriwal,AAP,PNB Scam,Neutral
2,pnb scam started in is going on till today the...,ArvindKejriwal,AAP,PNB Scam,Neutral
3,would bjp confirm this if true what transpired...,ArvindKejriwal,AAP,PNB Scam,Neutral
4,bjp insiders telling that niravmodi been a reg...,ArvindKejriwal,AAP,PNB Scam,Neutral


In [3]:
# Get index on classes
stance_df = raw_dataframe.Stance
stances = np.unique(stance_df)
print(stances)
stance_idx = {}
for i in range(len(stances)):
    stance_idx[stances[i]] = i
print(stance_idx)
raw_dataframe.Stance.replace(to_replace = stance_idx, inplace = True)

issue_df = raw_dataframe['Issue']
issues = np.unique(issue_df)
print(issues)
issue_idx = {}
for i in range(len(issues)):
    issue_idx[issues[i]] = i
print(issue_idx)
raw_dataframe.Issue.replace(to_replace=issue_idx, inplace=True)
raw_dataframe.head(10)

['Agreement' 'Disagreement' 'Neutral']
{'Agreement': 0, 'Disagreement': 1, 'Neutral': 2}
['Aadhar linking' 'Beef Ban' 'Cauvery SC Verdict' 'Demonetisation'
 'EVM tampering' 'FDIPolicy' 'Fodder scam' 'GDP growth' 'GST'
 'Inflation control' 'Jallikattu ban' 'PNB Scam'
 'Padmavati film screening' 'Ram Mandir' 'RightToPrivacy SC Verdict'
 'Rohingyas' 'Swacch Bharat' 'Triple Talaq SC verdict' 'TripleTalaqBill'
 'acchedin' 'hike in oil prices' 'lgp price hike' 'nsc and ppf rate cuts'
 'reservation']
{'nsc and ppf rate cuts': 22, 'Triple Talaq SC verdict': 17, 'GDP growth': 7, 'Cauvery SC Verdict': 2, 'EVM tampering': 4, 'Demonetisation': 3, 'hike in oil prices': 20, 'FDIPolicy': 5, 'lgp price hike': 21, 'Swacch Bharat': 16, 'PNB Scam': 11, 'RightToPrivacy SC Verdict': 14, 'reservation': 23, 'Aadhar linking': 0, 'Padmavati film screening': 12, 'Ram Mandir': 13, 'Inflation control': 9, 'Beef Ban': 1, 'GST': 8, 'TripleTalaqBill': 18, 'Fodder scam': 6, 'Jallikattu ban': 10, 'Rohingyas': 15, 'acc

Unnamed: 0,Tweet,UserHandle,Party,Issue,Stance
0,a no of people approach me daily worried abt t...,ArvindKejriwal,AAP,8,1
1,its now revealed that our fms silence on the p...,ArvindKejriwal,AAP,11,2
2,pnb scam started in is going on till today the...,ArvindKejriwal,AAP,11,2
3,would bjp confirm this if true what transpired...,ArvindKejriwal,AAP,11,2
4,bjp insiders telling that niravmodi been a reg...,ArvindKejriwal,AAP,11,2
5,is it possible to believe that he or vijay mal...,ArvindKejriwal,AAP,11,2
6,someone told me today sealing bjp wants to rui...,ArvindKejriwal,AAP,5,2
7,if all state govts central govt and sc togethe...,ArvindKejriwal,AAP,5,2
8,three killings on merchants in one year first ...,ArvindKejriwal,AAP,8,1
9,there is no need to increase house tax in mcd ...,ArvindKejriwal,AAP,8,1


In [4]:
# Remove selected stopwords from the dataset
stop_words = ['the','of','in','and','a','is','on','this','all','it','will','for','to','be','with',
              'at','are','u','has','that','by','from', 'as','was','have','its','an','if','been','be','also','should','which']
for count, row in raw_dataframe.iterrows():
    tweet = row['Tweet']
    new_tweet = []
    for word in tweet.split():
        if word not in stop_words:
            new_tweet.append(word)
    raw_dataframe.iloc[count, 0] = ' '.join(new_tweet)

In [5]:
# Split dataset into train, test, validation
raw_dataframe = raw_dataframe.sample(frac=1).reset_index(drop=True)
X = raw_dataframe['Tweet'].values
Y1 = raw_dataframe['Issue'].values
Y2 = raw_dataframe['Stance'].values

num = Y1.shape[0]
pc_80 = int(0.8 * num)
pc_90 = int(0.9 * num)

# training dataset
X_train = X[:pc_80]
Y1_train = Y1[:pc_80]
Y2_train = Y2[:pc_80]
print(X_train.shape, Y1_train.shape, Y2_train.shape)


# validation dataset
X_val = X[pc_80:pc_90]
Y1_val = Y1[pc_80:pc_90]
Y2_val = Y2[pc_80:pc_90]
print(X_val.shape, Y1_val.shape, Y2_val.shape)

# Test dataset
X_test = X[pc_90:]
Y1_test = Y1[pc_90:]
Y2_test = Y2[pc_90:]
print(X_test.shape, Y1_test.shape, Y2_test.shape)



(5753,) (5753,) (5753,)
(719,) (719,) (719,)
(720,) (720,) (720,)


In [6]:
# Get tfidf vectors for train, val and test
tfidf_vec = TfidfVectorizer()
X_train_tf = tfidf_vec.fit_transform(X_train).toarray()
print(X_train_tf.shape)

X_val_tf = tfidf_vec.transform(X_val).toarray()
print(X_val_tf.shape)

X_test_tf = tfidf_vec.transform(X_test).toarray()
print(X_test_tf.shape)

(5753, 15103)
(719, 15103)
(720, 15103)


In [7]:
# Apply Gaussian Naive Bayes directly for test dataset
from sklearn.naive_bayes import GaussianNB
gnb1 = GaussianNB()
y1_pred = gnb1.fit(X_train_tf, Y1_train).predict(X_test_tf)

gnb2 = GaussianNB()
y2_pred = gnb2.fit(X_train_tf, Y2_train).predict(X_test_tf)

print('Issue classification')
print('Accuracy: ', accuracy_score(Y1_test, y1_pred))
print('Weighted F1 score: ', f1_score(Y1_test, y1_pred, average='weighted'))

print('Stance classification')
print('Accuracy: ', accuracy_score(Y2_test, y2_pred))
print('Weighted F1 score: ', f1_score(Y2_test, y2_pred, average='weighted'))


Issue classification
Accuracy:  0.5583333333333333
Weighted F1 score:  0.5296565213662253
Stance classification
Accuracy:  0.625
Weighted F1 score:  0.6279284417660259


  'precision', 'predicted', average, warn_for)


In [10]:
# Apply Multinomial Naive Bayes directly for test dataset
from sklearn.naive_bayes import MultinomialNB
mnb1 = MultinomialNB()
y1_pred = mnb1.fit(X_train_tf, Y1_train).predict(X_test_tf)

mnb2 = MultinomialNB()
y2_pred = mnb2.fit(X_train_tf, Y2_train).predict(X_test_tf)

print('Issue classification')
print('Accuracy: ', accuracy_score(Y1_test, y1_pred))
print('Weighted F1 score: ', f1_score(Y1_test, y1_pred, average='weighted'))

print('Stance classification')
print('Accuracy: ', accuracy_score(Y2_test, y2_pred))
print('Weighted F1 score: ', f1_score(Y2_test, y2_pred, average='weighted'))

Issue classification
Accuracy:  0.44305555555555554
Weighted F1 score:  0.332278575770555
Stance classification
Accuracy:  0.6930555555555555
Weighted F1 score:  0.6410084003257671


  'precision', 'predicted', average, warn_for)


In [9]:
# Classify using majority vote in training dataset
issue_counts = np.unique(Y1_train, return_counts=True)
max_issue = max(issue_counts[1])
print(np.where(issue_counts[1]==max_issue))

stance_counts = np.unique(Y2_train, return_counts=True)
max_stance =  max(stance_counts[1])
print(np.where(stance_counts[1]==max_stance))

y1_pred = np.array([8]*len(Y1_test))
y2_pred = np.array([0]*len(Y2_test))

print('Issue classification')
print('Accuracy: ', accuracy_score(Y1_test, y1_pred))
print('Weighted F1 score: ', f1_score(Y1_test, y1_pred, average='weighted'))

print('Stance classification')
print('Accuracy: ', accuracy_score(Y2_test, y2_pred))
print('Weighted F1 score: ', f1_score(Y2_test, y2_pred, average='weighted'))

(array([8]),)
(array([0]),)
Issue classification
Accuracy:  0.3472222222222222
Weighted F1 score:  0.17898052691867122
Stance classification
Accuracy:  0.46111111111111114
Weighted F1 score:  0.2910435149978876


  'precision', 'predicted', average, warn_for)


In [10]:
# Classifiy using SVM classifier
from sklearn.svm import SVC
clf1 = SVC(gamma='auto', kernel='linear')
clf1.fit(X_train_tf, Y1_train)
y1_pred = clf1.predict(X_test_tf)

clf2 = SVC(gamma='auto', kernel='linear')
clf2.fit(X_train_tf, Y2_train)
y2_pred = clf2.predict(X_test_tf)
print('Issue classification')
print('Accuracy: ', accuracy_score(Y1_test, y1_pred))
print('Weighted F1 score: ', f1_score(Y1_test, y1_pred, average='weighted'))

print('Stance classification')
print('Accuracy: ', accuracy_score(Y2_test, y2_pred))
print('Weighted F1 score: ', f1_score(Y2_test, y2_pred, average='weighted'))

Issue classification
Accuracy:  0.9541666666666667
Weighted F1 score:  0.9530755966329344
Stance classification
Accuracy:  0.7416666666666667
Weighted F1 score:  0.7324225317398608


In [9]:
# Classify using Logistic regression
from sklearn.linear_model import LogisticRegression
log1 = LogisticRegression()
y1_pred = log1.fit(X_train_tf, Y1_train).predict(X_test_tf)

log2 = LogisticRegression()
y2_pred = log2.fit(X_train_tf, Y2_train).predict(X_test_tf)

print('Issue classification')
print('Accuracy: ', accuracy_score(Y1_test, y1_pred))
print('Weighted F1 score: ', f1_score(Y1_test, y1_pred, average='weighted'))

print('Stance classification')
print('Accuracy: ', accuracy_score(Y2_test, y2_pred))
print('Weighted F1 score: ', f1_score(Y2_test, y2_pred, average='weighted'))

Issue classification
Accuracy:  0.8875
Weighted F1 score:  0.8797122502523665
Stance classification
Accuracy:  0.7263888888888889
Weighted F1 score:  0.7023813743644104


  'precision', 'predicted', average, warn_for)
