In [None]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

In [None]:
# Download the Stanford CoreNLP package with Stanza's installation command.
# This can take several minutes, depending on the network speed.
import os

# Resolve the install directory to an absolute path so that CORENLP_HOME keeps
# pointing at the same folder even if the notebook's working directory changes
# later (a relative './corenlp' would silently start pointing somewhere else).
corenlp_dir = os.path.abspath('./corenlp')
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
os.environ["CORENLP_HOME"] = corenlp_dir



In [None]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!ls $CORENLP_HOME

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of
# 4GB, and a server endpoint on port 4015 (the same URL the NLTK CoreNLPParser
# cell further down connects to).
client = CoreNLPClient(
    annotators=['tokenize', 'ssplit', 'pos', 'lemma'],
    # JVM heap size; matches the 4GB stated above.
    memory='4G',
    # Timeout in milliseconds (one hour). Kept as a plain integer.
    # NOTE(review): float values such as 1e32 render as "1e+32" when the
    # server command line is built, which is not an integer - confirm against
    # the Stanza client documentation before raising this further.
    timeout=3600000,
    endpoint='http://localhost:4015',
    # Maximum request size in characters; also a plain integer.
    max_char_length=10000000,
    # The keyword is `threads`; a `thread=` keyword is not a client parameter
    # and therefore has no effect on the server.
    threads=8,
    be_quiet=True)
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time
time.sleep(10)

In [None]:
# Print background processes and look for java
# You should be able to see a StanfordCoreNLPServer java process running in the background
!ps -o pid,cmd | grep java

In [None]:
from google.colab import drive
import csv
drive.mount('/content/drive')
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read one recording name per line from the index file.
filenames = []
# `with` closes the handle even if reading fails part-way through.
with open('/content/drive/MyDrive/Colab Notebooks/term project/ted_info.txt') as f:
    for line in f:
        # Remove only the line terminator: a last line without a trailing
        # newline must keep its final character.
        name = line.rstrip('\n')
        # Blank lines carry no file name, so they are skipped.
        if name:
            filenames.append(name)
print(filenames)

['AaronHuey_2010X.sph', 'AdamGrosser_2007.sph', 'AdamSadowsky_2010X.sph', 'AdamSavage_2008P.sph', 'AditiShankardass_2009I.sph', 'AdoraSvitak_2010.sph', 'AimeeMullins_1998.sph', 'AimeeMullins_2009U.sph', 'AJJacobs_2007P.sph', 'AlaindeBotton_2009G.sph', 'AlanKay_2007.sph', 'AlanRussell_2006.sph', 'AlanSiegel_2010.sph', 'AlexisOhanian_2009I.sph', 'AlexTabarrok_2009.sph', 'AlGore_2006.sph', 'AlGore_2008.sph', 'AliCarrChellman_2010X.sph', 'AlisonJackson_2005G.sph', 'AllisonHunt_2007.sph', 'AlSeckel_2004.sph', 'AndersYnnerman_2010X.sph', 'AndrewMwenda_2007G.sph', 'AnilGupta_2009I.sph', 'AnnaDeavereSmith_2005.sph', 'AnthonyAtala_2009P.sph', 'AnupamMishra_2009I.sph', 'AriannaHuffington_2010W.sph', 'ArthurBenjamin_2005.sph', 'ArthurBenjamin_2009.sph', 'ArthurGanson_2004.sph', 'ArthurPottsDawson_2010G.sph', 'AsherHasan_2009I.sph', 'AshrafGhani_2005G.sph', 'AubreydeGrey_2005G.sph', 'AuretvanHeerden_2010G.sph', 'BarbaraBlock_2010Z.sph', 'BarrySchuler_2008P.sph', 'BarrySchwartz_2009.sph', 'BarrySch

In [None]:
# Load the existing feature table: the first CSV row is the header
# (feature_list), every following row is one recording (old_data).
old_data = []
feature_list = []
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('/content/drive/MyDrive/Colab Notebooks/term project/after_syanalysis.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    # An empty file yields an empty header instead of raising StopIteration.
    feature_list = next(csv_reader, [])
    old_data.extend(csv_reader)
#add_feature=['mean_Yngve_depth','total_Yngve_depth','max_Yngve_depth','mean_left_branching','total_left_branching','max_left_branching']
# The batch loop appends three values per row, in this order: frazer_chunk,
# max_frazer_depth and the mean depth. The header needs the same three names,
# otherwise header and data columns no longer line up.
add_feature = ['frazer_chunk', 'max_frazer_depth', 'mean_frazer_depth']
feature_list.extend(add_feature)
print(feature_list)

['name', 'words_number', 'SPACE', 'ADV', 'VERB', 'ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT', 'INTJ', 'NUM', 'PRON', 'AUX', 'CCONJ', 'PART', 'PROPN', 'SCONJ', 'CONJ', 'Punctuation', 'hestitation_word', 'lemma_number', 'most_frequent', 'noun_chunk', 'person_singular_verbs', 'misspell', 'time_spec', 'spec', 'sentence', 'neg_word', 'content', 'function', 'SPACE_R', 'ADV_R', 'VERB_R', 'ADP_R', 'DET_R', 'NOUN_R', 'ADJ_R', 'PUNCT_R', 'INTJ_R', 'NUM_R', 'PRON_R', 'AUX_R', 'CCONJ_R', 'PART_R', 'PROPN_R', 'SCONJ_R', 'CONJ_R', 'Punctuation_R', 'hestitation_word_R', 'lemma_number_R', 'person_singular_verbs_R', 'misspell_R', 'time_spec_R', 'spec_R', 'neg_word_R', 'content_R', 'function_R', 'time_split', 'wordnum_t', 'SPACE_t', 'ADV_t', 'VERB_t', 'ADP_t', 'DET_t', 'NOUN_t', 'ADJ_t', 'PUNCT_t', 'INTJ_t', 'NUM_t', 'PRON_t', 'AUX_t', 'CCONJ_t', 'PART_t', 'PROPN_t', 'SCONJ_t', 'CONJ_t', 'Punctuation_t', 'hestitation_word_t', 'lemma_number_t', 'most_frequent_t', 'noun_chunk_t', 'person_singular_verbs_t', 'miss

In [None]:
import nltk
from nltk.tree import Tree
def _count_younger_sisters(node):
    """Sum, over every Tree nested anywhere inside `node`, its number of right-hand siblings."""
    total = 0
    last_index = len(node) - 1
    for index, child in enumerate(node):
        if isinstance(child, Tree):
            # Siblings positioned after this child under the same parent.
            total += last_index - index
            total += _count_younger_sisters(child)
    return total


def _frazer_depths_for_tree(tree):
    """Map the leaf text of each subtree of `tree` to its younger-sister count."""
    depths = {}
    for subtree in tree.subtrees():
        # NOTE: the key is the space-joined leaf text, so subtrees that span
        # exactly the same words share one entry (the later subtree wins).
        depths[" ".join(subtree.leaves())] = _count_younger_sisters(subtree)
    return depths


def syntactic_analysis(text):
    """Compute the "frazer depth" statistics of `text`, sentence by sentence.

    The text is split with nltk.sent_tokenize. Sentences longer than 200
    characters are skipped. Every other sentence is whitespace-tokenised and
    parsed with the module-level `parser`; only the first returned parse is
    used.

    Returns a tuple, in this order:
        max_frazer_depth: largest per-entry depth seen in any sentence, or -1
            when no sentence was analysed.
        frazer_chunk: total number of depth entries over all sentences.
        frazer_depth: list with the mean depth of each analysed sentence.

    A sentence that cannot be parsed is reported together with the error and
    skipped. Only `Exception` subclasses are caught, so interrupting the
    kernel still stops a long run.
    """
    max_frazer_depth = -1
    frazer_chunk = 0
    frazer_depth = []
    for sentence in nltk.sent_tokenize(text):
        if len(sentence) > 200:
            continue
        try:
            parses = list(parser.parse(sentence.split()))
            depths = list(_frazer_depths_for_tree(parses[0]).values())
        except Exception as exc:
            # Show the cause (an undefined `parser`, a server error, an empty
            # parse list, ...) instead of hiding it behind a generic message.
            print("fail on sentence!", repr(exc))
            continue
        if not depths:
            continue
        frazer_chunk += len(depths)
        max_frazer_depth = max(max_frazer_depth, max(depths))
        frazer_depth.append(sum(depths) / len(depths))
    return max_frazer_depth, frazer_chunk, frazer_depth
max_frazer_depth,frazer_chunk,frazer_depth=syntactic_analysis("I like to eat fish, and I also enjoy swimming.I like you as well.")

fail on sentence!


In [None]:
import nltk
from nltk.tree import Tree
from nltk.parse.corenlp import CoreNLPParser

def syntactic_analysis(text):
    """Return per-sentence tree depth and left-branching counts for `text`.

    The text is split with nltk.sent_tokenize; sentences longer than 180
    characters are skipped. Every other sentence is whitespace-tokenised and
    parsed with the module-level `parser`, and only the first parse is used.

    Returns (yngve_depth, left_branching): two lists with one entry per
    analysed sentence. The first holds the deepest nesting level of Tree
    nodes below the root; the second counts the Tree nodes that are not the
    last child of their parent.

    NOTE(review): this notebook also defines a `syntactic_analysis` in an
    earlier cell that returns three values. Whichever cell ran last wins, and
    the batch loop unpacks three values - confirm which one is intended.
    """
    def deepest_level(node, level=0):
        # Deepest level reached by any Tree below `node`.
        deepest = level
        for kid in node:
            if isinstance(kid, Tree):
                deepest = max(deepest, deepest_level(kid, level + 1))
        return deepest

    def non_final_subtrees(node, running=0):
        # Count Tree children that still have a sibling to their right.
        final_pos = len(node) - 1
        for pos, kid in enumerate(node):
            if not isinstance(kid, Tree):
                continue
            if pos < final_pos:
                running += 1
            running = non_final_subtrees(kid, running)
        return running

    yngve_depth, left_branching = [], []
    for sentence in nltk.sent_tokenize(text):
        if len(sentence) > 180:
            continue
        try:
            tree = list(parser.parse(sentence.split()))[0]
            yngve_depth.append(deepest_level(tree))
            left_branching.append(non_final_subtrees(tree))
        except Exception as e:
            print(e)
            continue
    return yngve_depth, left_branching


In [None]:
from nltk.tree import Tree
from nltk.parse.corenlp import CoreNLPParser

def calculate_t_unit_lengths(tree):
    """Return the leaf counts of the outermost clause nodes found in `tree`.

    Nodes are visited depth-first, left to right. A Tree labelled 'S', 'SINV',
    'SQ' or 'SBAR' contributes the number of its leaves and is not searched
    any further; any other Tree is searched through its children. Values that
    are not Tree instances are ignored.
    """
    clause_labels = ('S', 'SINV', 'SQ', 'SBAR')
    lengths = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original left-to-right order.
    pending = [tree]
    while pending:
        node = pending.pop()
        if not isinstance(node, Tree):
            continue
        if node.label() in clause_labels:
            lengths.append(len(node.leaves()))
        else:
            pending.extend(reversed(list(node)))
    return lengths

def cal_tunit(line1, url='http://localhost:4015'):
    """Return the mean T-unit length of every sentence in `line1`.

    Args:
        line1: text whose sentences are separated by '.'.
        url: address of the running CoreNLP server. The default matches the
            endpoint this notebook starts its CoreNLP server on.

    Returns:
        A list with one value per non-blank sentence: the mean of
        calculate_t_unit_lengths() on the sentence's first parse, or 0 when
        that function finds no clause.
    """
    # One parser object serves every sentence; there is no need to rebuild it
    # inside the loop.
    parser = CoreNLPParser(url=url)
    mean_t_unit = []

    for segment in line1.split('.'):
        sentence = segment.strip()
        # Skip the empty pieces produced by a trailing or doubled '.', but keep
        # a final sentence that has no closing period.
        if not sentence:
            continue
        print(sentence)
        result = list(parser.raw_parse(sentence))
        tree = result[0]

        t_unit_lengths = calculate_t_unit_lengths(tree)
        mean_t_unit_length = sum(t_unit_lengths) / len(t_unit_lengths) if t_unit_lengths else 0
        mean_t_unit.append(mean_t_unit_length)
        print("complete")
    return mean_t_unit
print(cal_tunit("I like to eat fish, and I also enjoy swimming."))


In [None]:
from nltk.tree import Tree
from nltk.parse.corenlp import CoreNLPParser
# Names of the recordings for which the batch loop below raised an error.
fail_list=[]
# Fetch the 'punkt' data package for NLTK. `nltk` itself is not imported in
# this cell; the name comes from the `import nltk` in an earlier cell.
nltk.download('punkt')
# Start the Stanford CoreNLP server first before running this script
# The URL is the same one the CoreNLPClient earlier in this notebook uses as
# its endpoint.
# NOTE(review): both syntactic_analysis definitions read this module-level
# `parser`, and the first one is already called in its own cell above, before
# this assignment has run - confirm the intended cell order.
parser = CoreNLPParser(url='http://localhost:4015')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# For every recording: read its transcript, compute the frazer-depth features
# and append them to the matching row of old_data.
# NOTE(review): this cell appends to old_data in place, so running it twice
# adds the three columns twice.
# NOTE(review): it needs the syntactic_analysis that returns three values. A
# later cell defines another function with the same name that returns two; if
# that one ran last, every file ends up in fail_list with an unpacking error.
for source_name in filenames:
    try:
        print(source_name)
        # Replace the four-character audio extension (e.g. '.sph') with '.txt'.
        source_file_path = '/content/drive/MyDrive/Colab Notebooks/term project/TED text/' + source_name[:-4] + '.txt'
        # Only the first line of the transcript is read.
        # NOTE(review): presumably each transcript is stored on one line - confirm.
        with open(source_file_path, "r") as f1:
            line1 = f1.readline()
        # syntactic_analysis returns (max depth, chunk count, per-sentence means)
        # in exactly this order.
        max_frazer_depth, frazer_chunk, frazer_depth = syntactic_analysis(line1)
        # np.mean([]) is nan but also emits a RuntimeWarning; produce the nan
        # directly when no sentence could be analysed.
        mean_frazer_depth = np.mean(frazer_depth) if frazer_depth else float('nan')
        #mean_Yngve_depth','total_Yngve_depth','max_Yngve_depth','mean_left_branching','total_left_branching','max_left_branching','frazer_chunk','max_frazer_depth','mean_frazer_depth']
        for row in old_data:
            if row[0] == source_name:
                row.append(frazer_chunk)
                row.append(max_frazer_depth)
                row.append(mean_frazer_depth)
                #print("complete",source_name)
                print("complete!!!!!")
                break
    except Exception as exc:
        # Record the failure with its cause. Catching Exception rather than
        # everything keeps a kernel interrupt able to stop the loop.
        fail_list.append(source_name)
        print("fail in", source_name, repr(exc))
        continue

In [None]:
# Inspect one transcript: print the number of space-separated tokens in each
# '.'-delimited piece of its first line, then the pieces themselves.
filename = 'AdamSavage_2008P.sph'
source_file_path = '/content/drive/MyDrive/Colab Notebooks/term project/TED text/AdamSavage_2008P.txt'
with open(source_file_path, "r") as f1:
    line1 = f1.readline()
line1 = line1.split('.')
for piece in line1:
    print(len(piece.split(' ')))
print(line1)

In [None]:
client.stop()

time.sleep(10)
!ps -o pid,cmd | grep java