In [1]:
import spacy as sp
import re
import pandas as pd
nlp = sp.load("en_core_web_sm")
df_json = pd.read_json('./comp_arg_hackers.json','r')

In [2]:
countTokens=0
countSentence = 0
countEssays=0
for essay in df_json['text']:
    stringList=essay.split('\n')
    tokens = nlp(' '.join(stringList[1:]))
    countTokens += len(tokens)
    sentences = list(tokens.sents)
    countSentence += len(sentences)
    countEssays += 1
    
countPara=0
for para in df_json['paragraphs']:
    countPara += len(para)    

##  Task 1: count of sentences, paragraphs, essays and tokens among all the essays

In [3]:
print('number of essays: ', countEssays)
print('number of tokens: ', countTokens)
print('number of sentences: ', countSentence)
print('number of paragraphs: ', countPara)

number of essays:  322
number of tokens:  115883
number of sentences:  5464
number of paragraphs:  820


## Task 2: Number of major claims, claims, premises.

In [4]:
countMajor=0
countClaims = 0
countPremise = 0
for majorClaim, claim, premise in zip(df_json['major_claim'],df_json['claims'],df_json['premises']):
    countMajor += len(majorClaim)
    countClaims += len(claim)
    countPremise += len(premise)

In [5]:
print('number of major claims in all essays: ',countMajor)
print('number of claims in all essays: ', countClaims)
print('number of premises in all essays: ', countPremise)

number of major claims in all essays:  598
number of claims in all essays:  1202
number of premises in all essays:  3023


## Task 3: Number of essays with confirmation bias and no confirmation bias

In [6]:
print('number of essays with confirmation bias: ', len(df_json[df_json.confirmation_bias == True]))
print('number of essays with no confirmation bias: ', len(df_json[df_json.confirmation_bias == False]))

number of essays with confirmation bias:  122
number of essays with no confirmation bias:  200


## Task 4: Number of sufficient and insufficient paragraphs

In [7]:
countSuff =0
countNotSuff = 0
for paraList in df_json['paragraphs']:
    for para in paraList:
        if para['sufficient'] == True:
            countSuff += 1
        else: countNotSuff += 1


print('number of sufficient paragraphs: ', countSuff)
print('number of insufficient paragraphs: ', countNotSuff)

number of sufficient paragraphs:  538
number of insufficient paragraphs:  282


In [8]:
## this will remove stopwords and spaces and lemmatise the words. 
def preProcess(text):
    tokens = nlp(text)
    processedList = list()
    for token in tokens:
        if (not token.is_punct and  
                not token.is_space and  
                not token.is_stop):
            
            processedList.append(token.lemma_)
    return len(tokens), processedList


In [9]:
countTokensMajor = 0
countTokensClaim=0
countTokensPremise=0
majClaimText = ""
claimText = ""
premiseText = ""


for majorclaims, claims, premises in zip(df_json['major_claim'],df_json['claims'],df_json['premises']):
    for major in majorclaims:
        majClaimText = major['text'] + " " + majClaimText
    
    for claim in claims:
        claimText = claim['text'] + " " + claimText
    
    for premise in premises:
        premiseText = premise['text'] + " " + premiseText
    
countTokensMajor, majorClaimLemmaList = preProcess(majClaimText.strip())
countTokensClaim, claimLemmaList = preProcess(claimText.strip())
countTokensPremise, premiseLemmaList = preProcess(premiseText.strip())

### Task 5: Average number of tokens in major claims, claims, and premises.

In [10]:
print ('total tokens in major claims: ', countTokensMajor)        
print ('average tokens in major claims: ', countTokensMajor/countMajor)

print ('total tokens in claims: ', countTokensClaim)        
print ('average tokens in claims: ', countTokensClaim/countClaims)

print ('total tokens in premises: ', countTokensPremise)        
print ('average tokens in premises: ', countTokensPremise/countPremise)

majClaimLemmaJoined = " ".join(majorClaimLemmaList)
claimLemmaJoined = " ".join(claimLemmaList)
premiseLemmaJoined = " ".join(premiseLemmaList)

total tokens in major claims:  8788
average tokens in major claims:  14.695652173913043
total tokens in claims:  18139
average tokens in claims:  15.090682196339435
total tokens in premises:  53211
average tokens in premises:  17.60205094277208


## Task 6: specific words

#### Use the definition of TFIDF

In [11]:
import math
import operator

## combined List of docs
docs = [majClaimLemmaJoined, claimLemmaJoined, premiseLemmaJoined]

## term frequency word
def termFreq(text):
    countDict = dict()
    tokens = text.split(' ')
    for word in tokens:
        if word in countDict:
            countDict[word] += 1
        else:
            countDict[word] = 1

    return countDict
    
## inverse Document
def inversedf(word):
    
    count=0
    for doc in docs:
        if word in doc:
            count +=1
        
    
    return math.log(3/count)
    
def tfidf(docFor):
    freqDict = termFreq(docFor)
    dic = dict()
    for word,count in freqDict.items():
        dic[word] = inversedf(word) * count
    
    return dic
    
    
    
majorDic = tfidf(majClaimLemmaJoined)

sortedMajorDict = dict(sorted(majorDic.items(), key=operator.itemgetter(1),reverse=True))

claimDic = tfidf(claimLemmaJoined)
sortedClaimDict = dict(sorted(claimDic.items(), key=operator.itemgetter(1),reverse=True))

premiseDic = tfidf(premiseLemmaJoined)
sortedPremiseDict = dict(sorted(premiseDic.items(), key=operator.itemgetter(1),reverse=True))


In [12]:
## collecting specific words For major Claims which are not in others
tfListMajor =list()
tfListMajor_word = list()
for word,valueMajor in sortedMajorDict.items():
    
    if word in sortedClaimDict:
        valueClaim = sortedClaimDict[word]
        if valueClaim >= valueMajor:
            continue
    elif word in sortedPremiseDict:
        valuePremise = sortedPremiseDict[word]
        if valuePremise >= valueMajor:
            continue
        
    tfListMajor.append([word,valueMajor])
    tfListMajor_word.append(word)

    
## collecting specific words For Premise which are not in others
tfListPremise =list()
tfListPremise_word = list()
for word,valuePremise in sortedPremiseDict.items():
    
    if word in sortedClaimDict:
        valueClaim = sortedClaimDict[word]
        if valueClaim >= valuePremise:
            continue
    elif word in sortedMajorDict:
        valueMajor = sortedMajorDict[word]
        if valueMajor >= valuePremise:
            continue
        
    tfListPremise.append([word,valuePremise])
    tfListPremise_word.append(word)

## collecting specific words For Claims which are not in others
tfListClaim =list()
tfListClaim_word= list()
for word,valueClaim in sortedClaimDict.items():

    ##compare against major
    if word in sortedMajorDict:
        valueMajor = sortedMajorDict[word]
        if valueMajor >= valueClaim:
            continue
            
    ## compare against premises
    elif word in sortedPremiseDict:
        valuePremise = sortedPremiseDict[word]
        if valuePremise >= valueClaim:
            continue
        
    tfListClaim.append([word,valueClaim])
    tfListClaim_word.append(word)

### Following are the most specific words

In [13]:
list1 = [tfListMajor_word[:10], tfListClaim_word[:10], tfListPremise_word[:10]]
pd.DataFrame(list(zip(tfListMajor_word[:10],tfListPremise_word[:10],tfListClaim_word[:10])), columns = ['majorClaim', 'Premise','Claim'])

Unnamed: 0,majorClaim,Premise,Claim
0,disagree,water,effectiveness
1,scale,assignment,convict
2,allot,local,enable
3,compulsory,send,intellectual
4,fatherhood,favorite,means
5,socioeconomic,crime,oppose
6,kidvid,report,broaden
7,entirely,painting,dynamic
8,maximum,pressure,passionate
9,overemphasize,disease,mirror
