In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.metrics.pairwise import manhattan_distances
from scipy import spatial

data=pd.read_json('./../data/essay_corpus.json')
train_test_id=pd.read_csv('./../data/train-test-split.csv',sep=";")
train_id = train_test_id[train_test_id.SET == 'TRAIN'].index
test_id = train_test_id[train_test_id.SET == 'TEST'].index
train=data.loc[data['id'].isin(train_id+1)]
test=data.loc[data['id'].isin(test_id+1)]
train.to_json('./../data/train_essay.json',orient = 'records')
test.to_json('./../data/test_essay.json',orient = 'records')

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [3]:
import pandas as pd

from textblob import TextBlob


def getDF(input_json):
    df_output = pd.DataFrame(columns=['Essay_id',
                                      'majAndclaimSimilarity',
                                      'minClaimSimilarity',
                                      'PremiseSent_Polarity', 
                                      'weightedSimilarity',
                                      'on_the_other_hand',
                                      'however',
                                      'but', 
                                      'nor', 
                                      'not_only',
                                      'moreover',
                                      'nevertheless',
                                      'though',
                                      'yet',
                                      'either',
                                      'therefore',
                                      'consequently',
                                      'admittedly',
                                      'argue_that',
                                      'ACTUAL_CONF_BIAS'])
    for row in input_json.iterrows():
        overallpolarity = 0
        on_the_other_hand = False
        however = False
        but = False
        nor = False
        not_only = False
        moreover = False
        nevertheless = False
        though = False
        yet = False
        either = False
        therefore = False
        consequently = False
        admittedly = False
        argue_that = False
        normalizingCount = 0
        found = False
        for premise in row[1]['premises']:
            blob = TextBlob(premise['text'].lower())
            for sentence in blob.sentences:
                if(sentence.sentiment.polarity!=0):
                    normalizingCount = normalizingCount + 1
                    overallpolarity =  overallpolarity + (sentence.sentiment[0])
        
        overallpolarity = overallpolarity/normalizingCount
    
        majorClaimText = ""
        for majorClaim in row[1]['major_claim']:
            majorClaimText = majorClaimText +" " + majorClaim['text'].lower()
        
        claimTextList = list()
        claimText = ""
        for claim in row[1]['claims']:
            claimTextList.append(claim['text'].lower())
            claimText = claimText + " "+ claim['text'].lower()
        
        embeddings = embed(claimTextList)
        
        corr = np.inner(embeddings,embeddings)
        minClaimSimilarity = np.min(corr)
        
        embeddings = embed([majorClaimText,claimText])

        vA = embeddings[0]
        vB = embeddings[1]

        A=np.array(vA)
        B=np.array(vB)

        majAndclaimSimilarity = 1 - spatial.distance.cosine(A, B)
    
        weightedSimilarity = (minClaimSimilarity) / (100*overallpolarity)
            
        for paragraph in row[1]['paragraphs']:
            if(paragraph['text'].lower().count('on the other hand')>0):
                on_the_other_hand = True
            if(paragraph['text'].lower().count('however')>0):
                however = True
            if(paragraph['text'].lower().count('but')>0):
                but = True
            if(paragraph['text'].lower().count('nor')>0):
                nor = True
            if(paragraph['text'].lower().count('not only')>0):
                not_only = True
            if(paragraph['text'].lower().count('moreover')>0):
                moreover = True
            if(paragraph['text'].lower().count('nevertheless')>0):
                nevertheless = True
            if(paragraph['text'].lower().count('though')>0):
                though = True
            if(paragraph['text'].lower().count('either')>0):
                either = True
            if(paragraph['text'].lower().count('yet')>0):
                yet = True
            if(paragraph['text'].lower().count('therefore')>0):
                therefore = True
            if(paragraph['text'].lower().count('consequently')>0):
                consequently = True
            if(paragraph['text'].lower().count('admittedly')>0):
                admittedly = True
            if(paragraph['text'].lower().count('argue that')>0):
                argue_that = True
            

        data = {'Essay_id':row[1]['id'],
                'majAndclaimSimilarity': majAndclaimSimilarity,
                'minClaimSimilarity': minClaimSimilarity,
                'PremiseSent_Polarity':overallpolarity,
                'weightedSimilarity': weightedSimilarity,
                'on_the_other_hand': on_the_other_hand,
                'however': however,
                'but': but,
                'nor': nor,
                'not_only': not_only,
                'moreover': moreover,
                'nevertheless': nevertheless,
                'though': though,
                'either': either,
                'yet': yet,
                'therefore': therefore,
                'consequently': consequently,
                'admittedly': admittedly,
                'argue_that': argue_that,
                'ACTUAL_CONF_BIAS': row[1]['confirmation_bias']}
        df_output = df_output.append(data, ignore_index=True)

    return df_output


trainFile = './../data/train_essay.json';
testFile = './../data/test_essay.json'
jsonFileInput = pd.read_json(trainFile, 'r')
trainDF = getDF(jsonFileInput)
jsonFileInput = pd.read_json(testFile, 'r')
testDF = getDF(jsonFileInput)

trainDF.to_csv('./../data/raw_polarity_train.txt', sep=',', encoding='utf-8',index=False,header=True)
testDF.to_csv('./../data/raw_polarity_test.txt', sep=',', encoding='utf-8',index=False,header=True)

# Feature table for train data

In [4]:
trainDF.head()

Unnamed: 0,Essay_id,majAndclaimSimilarity,minClaimSimilarity,PremiseSent_Polarity,weightedSimilarity,on_the_other_hand,however,but,nor,not_only,moreover,nevertheless,though,yet,either,therefore,consequently,admittedly,argue_that,ACTUAL_CONF_BIAS
0,365,0.337829,0.034408,0.275694,0.001248,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False
1,134,0.258606,0.110308,0.181019,0.006094,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True
2,131,0.564156,0.05606,0.094975,0.005903,False,True,False,False,False,False,False,True,False,False,False,False,True,False,True
3,198,0.565267,0.021634,0.12037,0.001797,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
4,330,0.640624,0.245322,-0.011905,-0.20607,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False


# Feature table for test data

In [5]:
testDF.head()

Unnamed: 0,Essay_id,majAndclaimSimilarity,minClaimSimilarity,PremiseSent_Polarity,weightedSimilarity,on_the_other_hand,however,but,nor,not_only,moreover,nevertheless,though,yet,either,therefore,consequently,admittedly,argue_that,ACTUAL_CONF_BIAS
0,373,0.532058,0.176427,0.074451,0.023697,False,False,True,True,False,True,True,False,False,True,False,False,False,False,False
1,61,0.448137,0.151546,-0.080981,-0.018714,False,True,True,False,False,True,False,False,False,False,False,False,False,False,True
2,180,0.655844,0.1483,0.105258,0.014089,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False
3,211,0.320307,0.169262,0.120438,0.014054,False,True,True,False,True,True,False,False,False,False,False,False,False,True,True
4,229,0.721168,0.010891,0.341667,0.000319,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.model_selection import train_test_split

# Train Data:

In [7]:
# Dropped essay id as not needed for training and prepares the label and data of test set
X_train=trainDF.iloc[:,:-1]
X_train=X_train.drop(columns=['Essay_id'])
y_train=np.array(trainDF.iloc[:,-1:]).astype(int)

# X_Train input for the SVM

In [8]:
X_train.head()

Unnamed: 0,majAndclaimSimilarity,minClaimSimilarity,PremiseSent_Polarity,weightedSimilarity,on_the_other_hand,however,but,nor,not_only,moreover,nevertheless,though,yet,either,therefore,consequently,admittedly,argue_that
0,0.337829,0.034408,0.275694,0.001248,False,False,True,True,False,False,False,False,False,False,False,False,False,False
1,0.258606,0.110308,0.181019,0.006094,True,False,True,False,False,False,False,False,False,False,False,False,False,False
2,0.564156,0.05606,0.094975,0.005903,False,True,False,False,False,False,False,True,False,False,False,False,True,False
3,0.565267,0.021634,0.12037,0.001797,False,False,True,False,False,False,False,False,False,False,True,False,False,False
4,0.640624,0.245322,-0.011905,-0.20607,False,True,True,False,False,False,False,True,False,False,False,False,False,False


# Y_train input for the SVM

In [9]:
# 0 indicates conf_bias false, 1 indicates conf_bias true
y_train[0:5]

array([[0],
       [1],
       [1],
       [0],
       [0]])

# Test data:

In [10]:
# Dropped essay id as not needed for testing and prepares the label and data of test set
X_test=testDF.iloc[:,:-1]
X_test=X_test.drop(columns=['Essay_id'])
y_test=np.array(testDF.iloc[:,-1:]).astype(int)

# X_test of SVM

In [11]:
X_test.head()

Unnamed: 0,majAndclaimSimilarity,minClaimSimilarity,PremiseSent_Polarity,weightedSimilarity,on_the_other_hand,however,but,nor,not_only,moreover,nevertheless,though,yet,either,therefore,consequently,admittedly,argue_that
0,0.532058,0.176427,0.074451,0.023697,False,False,True,True,False,True,True,False,False,True,False,False,False,False
1,0.448137,0.151546,-0.080981,-0.018714,False,True,True,False,False,True,False,False,False,False,False,False,False,False
2,0.655844,0.1483,0.105258,0.014089,False,False,True,False,True,False,False,False,False,False,False,False,False,False
3,0.320307,0.169262,0.120438,0.014054,False,True,True,False,True,True,False,False,False,False,False,False,False,True
4,0.721168,0.010891,0.341667,0.000319,False,True,True,False,False,False,False,False,False,False,False,False,False,False


# Test data: Y_test of SVM

In [12]:
# 0 indicates conf_bias false, 1 indicates conf_bias true
y_test[0:5]

array([[0],
       [1],
       [0],
       [1],
       [0]])

# SVM

In [13]:
model = svm.SVC(kernel="rbf",C= 1000, gamma=1)

In [14]:
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [15]:
training_score=model.fit(X_train, y_train).score(X_train, y_train)
print(training_score)

0.9813664596273292


  y = column_or_1d(y, warn=True)


In [16]:
predicted_test = model.predict(X_test)
predicted_train = model.predict(X_train)

In [17]:
predDF = pd.DataFrame(columns=['id','confirmation_bias'])
predDF['id'] = testDF['Essay_id']
predDF['confirmation_bias'] = predicted_test

# The output of the predictions is written to "predictions.json" file

In [18]:
with open('./../data/predictions.json', 'w', encoding='utf-8') as file:
    predDF.to_json(file, force_ascii=False,orient='records')

In [19]:
from sklearn.metrics import classification_report
target_class= ['FALSE','TRUE']
print(classification_report(y_test, predicted_test, target_names=target_class))

              precision    recall  f1-score   support

       FALSE       0.84      0.82      0.83        51
        TRUE       0.70      0.72      0.71        29

    accuracy                           0.79        80
   macro avg       0.77      0.77      0.77        80
weighted avg       0.79      0.79      0.79        80



# Grid search SVM (Hyperparameter-tuning)

In [20]:
# Uses the combinations of the below parameters for our SVM model tuning
from sklearn.model_selection import GridSearchCV
param_grid = {
    'kernel': ['linear','rbf', 'poly'],
    'C': [1,10,100,1000,10000],
    'gamma': [1,1e-1,1e-2,1e-3, 1e-4],
}

In [21]:
# Outputs the best training score achievable after trying model parameter combinations listed above
grid_model = GridSearchCV(estimator = model, param_grid = param_grid,n_jobs = -1,scoring="f1_micro")
grid_model.fit(X_train, y_train)
grid_scores = grid_model.best_score_
print(grid_scores)



0.7639751552795031


  y = column_or_1d(y, warn=True)


In [22]:
# Outputs the best parameter with which SVM model can be trained to max score in previous cell 
print(grid_model.best_params_)

{'C': 100, 'gamma': 1, 'kernel': 'linear'}
