In [1]:
import os
import sys

# Make the local `gispy` package importable: prepend <parent dir>/gispy to the
# module search path. os.path.join is used instead of concatenating a literal
# '/' so the separator is always the platform's own.
sys.path.insert(0, os.path.join(os.path.abspath('..'), 'gispy'))

import pandas as pd
from scipy.stats import zscore
from gist import GIS, GIST

In [2]:
# Build the GIST pipeline from the document folder and its JSON config, then
# run the scoring step; whatever compute_scores() returns is kept in `df`.
gist_pipeline = GIST(
    docs_path='../data/documents',
    config_path='../gispy/gist_config.json',
)
df = gist_pipeline.compute_scores()

loading parameters and models...
reading input text files...
------------------------------
number of documents: 50
document batch size: 10
document(s) in each batch: 5
------------------------------
processing batch #1
processing batch #2
processing batch #3
processing batch #4
processing batch #5
processing batch #6
processing batch #7
processing batch #8
processing batch #9
processing batch #10
normalizing values of indices...
computing the final GIS...
computing GIS for all documents is done. results are saved at /results.csv


In [5]:
# Load the saved scores from disk; this rebinds `df`, replacing the value
# returned by compute_scores() above.
# NOTE(review): the preview of this frame shows 'Unnamed: 0.1' / 'Unnamed: 0'
# columns, i.e. the CSV carries leftover index columns — consider writing it
# with index=False or dropping those columns after loading.
# NOTE(review): the scoring log reports '/results.csv' while this reads the
# relative path 'results.csv' — confirm both refer to the same file.
df = pd.read_csv('results.csv')

In [6]:
# Preview the first three rows of the loaded scores.
df.head(3)

Unnamed: 0.1,Unnamed: 0,d_id,text,gis,gis_zscore
0,0,m_15.txt,The purpose of the current study was to unders...,-1.245031,-2.870977
1,1,d_19.txt,Early reports have shown a relationship betwee...,-0.951089,-1.706171
2,2,d_0.txt,A third feature of our data that may constrain...,-0.57202,0.411909


In [7]:
# Compare the GIS of Discussion vs. Methods sections of a collection of documents.
# Discussion sections are expected to be more gist-like and therefore to have a
# higher GIS value.

# A document counts as Discussion when its file name *starts with* 'd_'; every
# other document is treated as Methods. A prefix test is used on purpose: a
# substring test would also match names that merely contain 'd_' somewhere
# (e.g. 'method_1.txt'). Missing ids (NaN) fall into the Methods group.
is_discussion = df['d_id'].str.startswith('d_', na=False)

# Keep plain lists, in row order, under the same names as before.
d_scores = df.loc[is_discussion, 'gis'].tolist()
m_scores = df.loc[~is_discussion, 'gis'].tolist()

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: -0.5528727796894336
avg Methods score: -0.6167237579259587


In [4]:
# Same Discussion vs. Methods comparison as for 'gis', but on the
# 'gis_zscore' column.

# A document counts as Discussion when its file name *starts with* 'd_'; every
# other document is treated as Methods. A prefix test is used on purpose: a
# substring test would also match names that merely contain 'd_' somewhere
# (e.g. 'method_1.txt'). Missing ids (NaN) fall into the Methods group.
is_discussion = df['d_id'].str.startswith('d_', na=False)

# Keep plain lists, in row order, under the same names as before.
d_scores = df.loc[is_discussion, 'gis_zscore'].tolist()
m_scores = df.loc[~is_discussion, 'gis_zscore'].tolist()

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: 0.9173776526499102
avg Methods score: -0.9173776526499157


In [5]:
# Show the per-document Discussion values gathered by the preceding cell.
d_scores

[2.481513262979047,
 4.045891841853867,
 4.084544637824087,
 2.5325762776914225,
 -3.8993600301014633,
 -1.8116966538787387,
 2.3924491203188065,
 1.9895099760060586,
 4.553517527041633,
 4.3103540331593155,
 -1.5235501274704293,
 0.40492380802422745,
 -2.5700330390352755,
 4.2204796609477215,
 -2.7216773055791172,
 0.3749046173597872,
 -2.9547439779045535,
 4.175170218331477,
 1.1317371918190169,
 -2.3338092563758384,
 -1.0720484696696242,
 2.910735633892791,
 4.5687102067871415,
 -4.690485584248583,
 2.3348277464749847]

In [6]:
# Show the per-document Methods values gathered by the comparison cell.
m_scores

[-4.1653659697933065,
 -4.487997847419378,
 6.810410202145173,
 0.6472453038310696,
 4.741292829842804,
 5.693929204925222,
 -6.40391523398347,
 -3.0440334569281937,
 -0.3906367277858034,
 -0.6469670153861461,
 -0.9793255659549243,
 -0.5609932768849368,
 -6.806474250578566,
 -1.6381566615506884,
 -2.4952855107105236,
 -1.7898208250165994,
 1.7467713815999613,
 -4.8657395094238725,
 0.21641606075087405,
 0.5862738665280044,
 2.8710377003823613,
 -3.166934448543712,
 2.7556083428859903,
 -3.991949070210797,
 -3.5698308389684286]

### Computing GIS using pre-computed Coh-Metrix indexes

In [4]:
# Load the table of pre-computed Coh-Metrix indices.
# NOTE(review): this rebinds `df`, which held the GIST results in earlier cells.
df = pd.read_csv("../data/mturk_all.csv")

In [5]:
# Score the loaded table with GIS. The preview of the result shows z-scored
# columns (zSMCAUSlsa, zSMCAUSwn, zWRDIMGc, zWRDHYPnv) followed by a GIS column.
# NOTE(review): wolfe=True presumably selects the Wolfe et al. variant of the
# formula — confirm against the GIS class in gist.py.
a = GIS().score(df, wolfe=True)

In [6]:
# Preview the first five scored rows.
a.head(5)

Unnamed: 0,TextID,DESPC,DESSC,DESWC,DESPL,DESPLd,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,zSMCAUSlsa,zSMCAUSwn,zWRDIMGc,zWRDHYPnv,GIS
0,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,8,14,355,1.75,1.389,25.5,14.325,1.614,0.924,...,1.842,2.275,44.553001,13.344,6.526,-0.975,-0.229166,1.242778,1.661539,-4.25215
1,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,11,25,425,2.273,1.191,17.24,10.978,1.532,0.827,...,1.693,2.405,59.973,9.118,12.206,-0.675,0.8125,0.817916,2.161538,-6.053954
2,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,4,17,313,4.25,2.63,18.471001,9.677,1.431,0.826,...,1.817,1.643,67.084,8.476,21.061001,-0.15,-0.208333,-1.270545,-0.769231,4.363109
3,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,18,26,744,1.444,0.616,29.0,11.631,1.777,1.03,...,1.643,2.383,27.457001,16.538,8.316,-1.375,-0.604167,0.801352,2.076923,-5.268108
4,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,5,16,272,3.2,1.789,17.25,9.936,1.346,0.702,...,1.467,1.603,75.708,6.923,20.931,-1.45,0.15625,-0.696967,-0.923077,1.551794


In [5]:
# Persist the scored table to the working directory.
# index=False keeps the row index out of the file; without it every
# save/reload round trip adds another 'Unnamed: 0'-style column (the loaded
# input already carries one).
a.to_csv('mturk_all.csv', index=False)