In [1]:
# Installing and importing 
!pip install stanza
!pip install datasets

import stanza
import os
import pandas as pd
from datasets import load_dataset
from stanza.server import CoreNLPClient
from google.colab import files
from google.colab import output

# Download the Stanford CoreNLP package with Stanza's installation command
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
os.environ["CORENLP_HOME"] = corenlp_dir

# Get german models jar
!wget -P /content/corenlp https://nlp.stanford.edu/software/stanford-corenlp-4.5.0-models-german.jar

# Define pattern to search - NP means Noun-phrase
pattern = 'NP'

# Define function to extract Noun-phrase using CoreNLP
def findNP(corpus, language):
    """Extract the noun phrases of every text in ``corpus`` using CoreNLP.

    A CoreNLP server is started through ``CoreNLPClient`` for the duration of
    the call. Each text is matched against the module-level Tregex ``pattern``
    and the ``spanString`` of every match is collected.

    Args:
        corpus: Iterable of text strings. One text may contain several
            sentences.
        language: Language handed to CoreNLP as its server properties,
            e.g. ``'english'`` or ``'german'``.

    Returns:
        A list with one entry per text in ``corpus``. Each entry is the list
        of matched span strings for that text, sentence by sentence.

    Side effects:
        Prints ``language`` and then clears the notebook cell output.
    """
    # One list of noun phrases per text of the corpus
    text_NP = []

    # The language must be passed as ``properties``. ``language=`` is not a
    # documented CoreNLPClient option; it appears to be accepted and ignored,
    # so every text would be parsed with the default English pipeline.
    # NOTE(review): confirm against the installed stanza version.
    with CoreNLPClient(properties=language, timeout=30000, memory='16G',
                       endpoint='http://localhost:9030') as client:

        print(language)
        for text in corpus:

            # Matching noun-phrase using tregex
            matches = client.tregex(text, pattern)

            # Each sentence maps a match id to its match details; flatten
            # all sentences of this text into one list of span strings.
            text_NP.append([
                match['spanString']
                for sentence in matches['sentences']
                for match in sentence.values()
            ])

    output.clear()
    return text_NP

# Language name used in this notebook -> two-letter language code
language_dict = dict(english='en', german='de')

def createData(language, n_examples=25):
    """Build a dataframe of XNLI hypotheses and their noun phrases.

    Args:
        language: Key of ``language_dict`` (``'english'`` or ``'german'``).
            It selects the XNLI configuration and is passed on to ``findNP``.
        n_examples: Number of leading rows of the train split to use.
            Defaults to 25.

    Returns:
        A ``pandas.DataFrame`` with one row per example and two columns:
        ``Text`` (the hypothesis) and ``Noun_phrase`` (the list of noun
        phrases ``findNP`` returned for it).

    Raises:
        KeyError: If ``language`` is not a key of ``language_dict``.

    Side effects:
        Clears the notebook cell output before returning.
    """
    # Load dataset in language
    dataset = load_dataset("xnli", language_dict[language])

    # Creating a corpus with the first n_examples examples of the train split.
    # Slicing returns a dict of columns, so one slice replaces n_examples
    # separate row look-ups.
    corpus = dataset['train'][:n_examples]['hypothesis']

    # Find all noun phrases in corpus
    text_NP = findNP(corpus, language)

    # Storing texts and noun phrases side by side, built in a single step
    dataframe = pd.DataFrame({'Text': corpus, 'Noun_phrase': text_NP})

    output.clear()
    return dataframe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.4.0-py3-none-any.whl (574 kB)
[K     |████████████████████████████████| 574 kB 4.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 46.0 MB/s 
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 44.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 43.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.

2022-07-24 07:17:17 INFO: Installing CoreNLP package into ./corenlp


Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenlp-latest.zip:   0%|        …



--2022-07-24 07:17:46--  https://nlp.stanford.edu/software/stanford-corenlp-4.5.0-models-german.jar
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.0-models-german.jar [following]
--2022-07-24 07:17:47--  https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.0-models-german.jar
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 192183926 (183M) [application/java-archive]
Saving to: ‘/content/corenlp/stanford-corenlp-4.5.0-models-german.jar’


2022-07-24 07:18:21 (5.56 MB/s) - ‘/content/corenlp/stanford-corenlp-4.5.0-models-german.jar’ saved [192183926/192183926]



In [2]:
# Create dataframes
dataframe_en = createData('english')
dataframe_de = createData('german')

# ignore_index=True renumbers the rows 0..N-1. Without it both halves keep
# their own 0-based labels, so the combined frame and the exported CSV
# contain every index value twice.
dataframe = pd.concat([dataframe_en, dataframe_de], ignore_index=True)

# Clear output
output.clear()

# Download dataframe as csv
dataframe.to_csv('result.csv')
files.download("result.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
# Smoke test: run the extractor on a single two-sentence English text
sample_text = 'Albert Einstein was a German-born theoretical physicist. He developed the theory of relativity.'
corpus = [sample_text]
text_NP = findNP(corpus, 'english')

# Drop the server log output, then show the phrases found for the only text
output.clear()
print("Noun phrase:", text_NP[0])

Noun phrase: ['Albert Einstein', 'a German-born theoretical physicist', 'He', 'the theory of relativity', 'the theory', 'relativity']
