In [1]:
import os
import sys

# Make the ../gispy directory importable; this must run before the
# `from gist import ...` line below. os.path.join is used instead of
# concatenating a hard-coded '/' so the path is built with the platform's
# own separator.
sys.path.insert(0, os.path.join(os.path.abspath('..'), 'gispy'))

import pandas as pd
from scipy.stats import zscore
from gist import GIS, GIST

In [2]:
# Compute scores for the documents under ../data/documents. Later cells read
# the result as a DataFrame with (at least) 'd_id' and 'gis' columns.
# NOTE(review): the warning printed by this cell shows that it loads
# allenai/longformer-base-4096, so it is presumably slow -- consider caching
# the result if the notebook is re-run often.
df = GIST(docs_path='../data/documents').compute_scores()

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Compare the GIS of Discussion vs. Methods sections of a collection of documents.
# Discussion sections are supposed to be more "gisty" and have a higher GIS value.

# A document id containing 'd_' marks a Discussion section; every other row is
# treated as a Methods section. regex=False keeps this a plain substring test,
# the same check the `in` operator performs.
is_discussion = df['d_id'].str.contains('d_', regex=False)

# Plain Python lists (in original row order) so later cells can display them.
d_scores = df.loc[is_discussion, 'gis'].tolist()
m_scores = df.loc[~is_discussion, 'gis'].tolist()


def _mean(scores):
    """Return the arithmetic mean of ``scores``, or NaN when ``scores`` is empty.

    Returning NaN avoids a ZeroDivisionError when one of the two groups has
    no documents.
    """
    return sum(scores) / len(scores) if scores else float('nan')


print('avg Discussion score: {}'.format(_mean(d_scores)))
print('avg Methods score: {}'.format(_mean(m_scores)))

avg Discussion score: -0.42077650534042677
avg Methods score: -0.8962201397460875


In [4]:
# Display the individual GIS values collected for the Discussion sections.
d_scores

[-0.06058802651077155,
 0.29735865074149975,
 0.3101267764204952,
 -0.03684963736757654,
 -1.3527090187408657,
 -1.108048844740789,
 0.04677100922477614,
 -0.20589683947924864,
 0.3822816512695802,
 0.4022476035161172,
 -0.932002313005785,
 -0.5411744808468777,
 -1.256969604203269,
 0.36445196273805797,
 -1.3285238408902789,
 -0.4905923084156008,
 -1.3799398217231242,
 0.3848307750148894,
 -0.4674665661009897,
 -1.3094462188192864,
 -0.8788335909801227,
 0.0963860277046385,
 0.4153981251718588,
 -1.693152944837753,
 -0.17707115865024492]

In [5]:
# Display the individual GIS values collected for the Methods sections.
m_scores

[-1.6661935332979003,
 -1.5443487647006469,
 0.6155688406752425,
 -0.46650483520646524,
 0.3714363115557503,
 0.4632686534239081,
 -2.0612465952992007,
 -1.3947090853496404,
 -0.5852376654674096,
 -0.8816533801245402,
 -1.0410119355133314,
 -0.7848731762916936,
 -2.0735825879615186,
 -0.963556613109763,
 -1.172114176588038,
 -1.086528240975115,
 -0.3128915326877023,
 -1.7319259928699524,
 -0.8556515238047698,
 -0.5877697497922018,
 -0.13447988593955507,
 -1.2878481985176673,
 -0.03869155108741815,
 -1.6301100855547215,
 -1.5548481891678363]

### Computing GIS using pre-computed Coh-Metrix indexes

In [4]:
# Load the pre-computed Coh-Metrix indexes.
# Note: this rebinds `df`, replacing the GIST scores computed earlier in the
# notebook; cells that need those scores must run before this one.
df = pd.read_csv("../data/mturk_all.csv")

In [5]:
# Score the loaded Coh-Metrix table with GIS. Judging by the preview shown
# below, the returned frame carries z-scored columns (e.g. zSMCAUSlsa,
# zWRDHYPnv) and a final 'GIS' column.
# NOTE(review): `wolfe=True` presumably selects the Wolfe et al. variant of the
# GIS formula -- confirm against gist.py.
a = GIS().score(df, wolfe=True)

In [6]:
# Preview the first five scored rows.
a.head(5)

Unnamed: 0,TextID,DESPC,DESSC,DESWC,DESPL,DESPLd,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,zSMCAUSlsa,zSMCAUSwn,zWRDIMGc,zWRDHYPnv,GIS
0,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,8,14,355,1.75,1.389,25.5,14.325,1.614,0.924,...,1.842,2.275,44.553001,13.344,6.526,-0.975,-0.229166,1.242778,1.661539,-4.25215
1,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,11,25,425,2.273,1.191,17.24,10.978,1.532,0.827,...,1.693,2.405,59.973,9.118,12.206,-0.675,0.8125,0.817916,2.161538,-6.053954
2,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,4,17,313,4.25,2.63,18.471001,9.677,1.431,0.826,...,1.817,1.643,67.084,8.476,21.061001,-0.15,-0.208333,-1.270545,-0.769231,4.363109
3,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,18,26,744,1.444,0.616,29.0,11.631,1.777,1.03,...,1.643,2.383,27.457001,16.538,8.316,-1.375,-0.604167,0.801352,2.076923,-5.268108
4,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,5,16,272,3.2,1.789,17.25,9.936,1.346,0.702,...,1.467,1.603,75.708,6.923,20.931,-1.45,0.15625,-0.696967,-0.923077,1.551794


In [5]:
# Persist the scored table as mturk_all.csv in the current working directory.
# index=False keeps the row index out of the file: the loaded table already
# carries an 'Unnamed: 0' column, which looks like the leftover of an earlier
# save that included the index, and writing the index again would add another
# such column on the next load.
a.to_csv('mturk_all.csv', index=False)