<a href="https://colab.research.google.com/github/ambrecht/PG-Test-Api/blob/main/nlp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import PyPDF2
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('german')



# Extract text from PDF file with PYPDF2
def extract_pdf_text(pdf_file_path):
  from google.colab import files
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)

  print (pdf.pages)

  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text() 
  return pdf_text



# Extract words and POS tags from text
def extract_words(pdf_text):
  pdf_words = word_tokenize(pdf_text, language='german')
  pdf_tagged_words = nltk.pos_tag(pdf_words, lang='de')

  substantive = []
  adverbien = []
  verben = []
  adjektive = []

  for word, tag in pdf_tagged_words:
    if tag.startswith("NN"):
        substantive.append(word)
    elif tag.startswith("RB"):
        adverbien.append(word)
    elif tag.startswith("VB"):
        verben.append(word)
    elif tag.startswith("JJ"):
        adjektive.append(word)
  return substantive, adverbien, verben, adjektive

# Create dataframes for each POS category
def create_dataframes(substantive, adverbien, verben, adjektive):
  fd_substantive = FreqDist(substantive)
  fd_adverbien = FreqDist(adverbien)
  fd_verben = FreqDist(verben)
  fd_adjektive = FreqDist(adjektive)

  df_substantive = pd.DataFrame(fd_substantive.items(), columns=['Wort', 'Häufigkeit'])
  df_adverbien = pd.DataFrame(fd_adverbien.items(), columns=['Wort', 'Häufigkeit'])
  df_verben = pd.DataFrame(fd_verben.items(), columns=['Wort', 'Häufigkeit'])
  df_adjektive = pd.DataFrame(fd_adjektive.items(), columns=['Wort', 'Häufigkeit'])

  sort_column = input("Sortieren nach (a)lphabetisch oder (h)äufigkeit: ")
  if sort_column == "a":
    df_substantive.sort_values("Wort", inplace=True)
    df_adverbien.sort_values("Wort", inplace=True)
    df_verben.sort_values("Wort", inplace=True)
    df_adjektive.sort_values("Wort", inplace=True)
    print("Tabelle alphabetisch sortiert")
  elif sort_column == "h":
    df_substantive.sort_values("Häufigkeit", inplace=True, ascending=False)
    df_adverbien.sort_values("Häufigkeit", inplace=True, ascending=False)
    df_verben.sort_values("Häufigkeit", inplace=True, ascending=False)
    df_adjektive.sort_values("Häufigkeit", inplace=True, ascending=False)
    print("Tabelle häufigkeitssortiert")

    print(df_substantive)
    print(df_adverbien)
    print(df_verben)
    print(df_adjektive)

  return df_substantive, df_adverbien, df_verben, df_adjektive

def save_dataframes_as_json(df_substantive, df_adverbien, df_verben, df_adjektive):
    df_substantive.to_json("substantive.json")
    df_adverbien.to_json("adverbien.json")
    df_verben.to_json("verben.json")
    df_adjektive.to_json("adjektive.json")



# Main function
def main():
  pdf_text = extract_pdf_text("path/to/pdf_file.pdf")
  substantive, adverbien, verben, adjektive = extract_words(pdf_text)
  df_substantive, df_adverbien, df_verben, df_adjektive = create_dataframes(substantive, adverbien, verben, adjektive)
  print(df_substantive)
  print(df_adverbien)
  print(df_verben)
  print(df_adjektive)
  save_dataframes_as_json(df_substantive, df_adverbien, df_verben, df_adjektive)

#Create a bar chart for each POS category
def create_bar_chart(df, title):
    df.plot.bar(x='Wort', y='Häufigkeit', rot=0, title=title)
    plt.show()

# Call main function
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Error loading german: Package 'german' not found in index


KeyboardInterrupt: ignored

In [None]:
!pip install PyPDF2
!pip install spacy
!pip install pandas
!pip install -U spacy[displacy]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[displacy]
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: 

In [None]:
!pip install spacy
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-01-31 07:46:54.463106: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.5.0/de_core_news_sm-3.5.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [None]:
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")
nlp.max_length = 10000000


# Extract text from PDF file with PYPDF2
def extract_pdf_text(pdf_file_path):
  from google.colab import files
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_file_name = list(uploaded.keys())[0]

  print(pdf.pages)

  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text


# Extract words and POS tags from text
def extract_words(pdf_text):
  pdf_doc = nlp(pdf_text)
  pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]

  substantive = []
  adverbien = []
  verben = []
  adjektive = []

  for word, tag in pdf_tagged_words:
    if tag == "NOUN":
        substantive.append(word)
    elif tag == "ADV":
        adverbien.append(word)
    elif tag == "VERB":
        verben.append(word)
    elif tag == "ADJ":
        adjektive.append(word)
  return substantive, adverbien, verben, adjektive

# Create dataframes for each POS category


def create_dataframes(substantive, adverbien, verben, adjektive):
    fd_substantive = Counter(substantive)
    fd_adverbien = Counter(adverbien)
    fd_verben = Counter(verben)
    fd_adjektive = Counter(adjektive)

    df_substantive = pd.DataFrame.from_dict(
        fd_substantive, orient='index', columns=['Häufigkeit'])
    df_adverbien = pd.DataFrame.from_dict(
        fd_adverbien, orient='index', columns=['Häufigkeit'])
    df_verben = pd.DataFrame.from_dict(
        fd_verben, orient='index', columns=['Häufigkeit'])
    df_adjektive = pd.DataFrame.from_dict(
        fd_adjektive, orient='index', columns=['Häufigkeit'])

    sort_column = input("Sortieren nach (a)lphabetisch oder (h)äufigkeit: ")
    if sort_column == "a":
        df_substantive.sort_index(inplace=True)
        df_adverbien.sort_index(inplace=True)
        df_verben.sort_index(inplace=True)
        df_adjektive.sort_index(inplace=True)
        print("Tabelle alphabetisch sortiert")
    elif sort_column == "h":
        df_substantive.sort_values("Häufigkeit", inplace=True, ascending=False)
        df_adverbien.sort_values("Häufigkeit", inplace=True, ascending=False)
        df_verben.sort_values("Häufigkeit", inplace=True, ascending=False)
        df_adjektive.sort_values("Häufigkeit", inplace=True, ascending=False)
        print("Tabelle häufigkeitssortiert")

       

    return df_substantive, df_adverbien, df_verben, df_adjektive

import json

def save_dataframes_as_json(pdf_file_name, df_substantive, df_adverbien, df_verben, df_adjektive):
    import os
    pdf_file_name = pdf_file_name.split(".")[0] # remove the file extension
    if not os.path.exists(pdf_file_name):
        os.makedirs(pdf_file_name)

    df_substantive.columns = ["Wort", "Häufigkeit"]
    df_adverbien.columns = ["Wort", "Häufigkeit"]
    df_verben.columns = ["Wort", "Häufigkeit"]
    df_adjektive.columns = ["Wort", "Häufigkeit"]

    data = {
        "substantive": df_substantive.to_dict("records"),
        "adverbien": df_adverbien.to_dict("records"),
        "verben": df_verben.to_dict("records"),
        "adjektive": df_adjektive.to_dict("records"),
    }
    with open(f"{pdf_file_name}/data.json", "w") as json_file:
        json.dump(data, json_file)



# Main function
def main():
  pdf_file_name = "kafka2.pdf"
  pdf_text = extract_pdf_text("path/to/pdf_file.pdf")
  pdf_file_path = pdf_file_name.split(".")[0] # remove the file extension
  substantive, adverbien, verben, adjektive = extract_words(pdf_text)
  df_substantive, df_adverbien, df_verben, df_adjektive = create_dataframes(substantive, adverbien, verben, adjektive)
  pdf_file_name = pdf_file_path.split(".")[0]
  print(df_substantive)
  print(df_adverbien)
  print(df_verben)
  print(df_adjektive)
  save_dataframes_as_json(pdf_file_name, df_substantive, df_adverbien, df_verben, df_adjektive)



# Call main function
if __name__ == "__main__":
    main()



Saving Der große Gatsby (F. Scott Fitzgerald, Bettina Abarnell) (z-lib.org).pdf to Der große Gatsby (F. Scott Fitzgerald, Bettina Abarnell) (z-lib.org).pdf
<PyPDF2._page._VirtualList object at 0x7f0dcb478cd0>
Sortieren nach (a)lphabetisch oder (h)äufigkeit: h
Tabelle häufigkeitssortiert
                  Häufigkeit
Tom                       90
Gatsby                    86
Mr.                       82
Mann                      73
Hand                      68
...                      ...
Verdeck                    1
Affäre                     1
Entrüstungssturm           1
Vorschlußrunde             1
Vergangenen                1

[3679 rows x 1 columns]
           Häufigkeit
so                197
noch              195
nur               159
dann              121
schon             118
...               ...
Darum               1
strikt              1
geradeaus           1
bedächtig           1
stetig              1

[1537 rows x 1 columns]
            Häufigkeit
sagte              222
sah 

ValueError: ignored

In [None]:
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
  print("Extracting text from PDF file...")
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

def extract_words(pdf_text):
  print("Extracting words from PDF text...")
  pdf_doc = nlp(pdf_text)
  pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]
  types = []
  for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
    words = [word for word, pos in pdf_tagged_words if pos == tag]
    word_count = dict(Counter(words))
    types.append({tag.lower(): word_count})
  return types

def save_to_json(types, json_file_path):
  print("Saving extracted data to JSON file...")
  with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")

#Additional code to download the JSON file
from google.colab import files
files.download('pos_count.json')

In [None]:
import spacy
import json
from collections import Counter
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
  print("Extracting text from PDF file...")
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

def extract_words(pdf_text):
  print("Extracting words from PDF text...")
  pdf_doc = nlp(pdf_text)
  pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]
  types = [
    {'nomen': []},
    {'verben': []},
    {'adjektive': []},
    {'adverbien': []},
    {'pronomen': []},
    {'präpositionen': []},
    {'konjunktionen': []},
    {'interjektionen': []}
    ]
  for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
    words = [word for word, pos in pdf_tagged_words if pos == tag]
    for word in words:
        if word not in types[tag.lower()]:
            types[tag.lower()].append(word)
  return types

def save_to_json(types, json_file_path):
  print("Saving extracted data to JSON file...")
  with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")

#Additional code to download the JSON file
from google.colab import files
files.download('pos_count.json')

In [None]:
import spacy
import json
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
  print("Extracting text from PDF file...")
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

def extract_words(pdf_text):
  print("Extracting words from PDF text...")
  pdf_doc = nlp(pdf_text)
  pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]
  types = {
    'NOUN': [],
    'VERB': [],
    'ADJ': [],
    'ADV': [],
    'PRON': [],
    'ADP': [],
    'CONJ': [],
    'INTJ': []
  }
  for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
    words = [word for word, pos in pdf_tagged_words if pos == tag]
    for word in words:
      if word not in types[tag]:
        types[tag].append(word)
  return types

def save_to_json(types, json_file_path):
  print("Saving extracted data to JSON file...")
  with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")

#Additional code to download the JSON file
from google.colab import files
files.download('pos_count.json')


In [None]:
import spacy
import json
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")


def extract_pdf_text(pdf_file_path):
    print("Extracting text from PDF file...")
    uploaded = files.upload()
    pdf_file = list(uploaded.keys())[0]
    pdf = PyPDF2.PdfReader(pdf_file)
    pdf_text = ""
    for page in range(len(pdf.pages)):
        pdf_text += pdf.pages[page].extract_text()
    return pdf_text


def extract_words(pdf_text):

    print("Extracting words from PDF text...")
    pdf_doc = nlp(pdf_text)
    pdf_tagged_words = [(token.text, token.pos_) for token in pdf_doc]
    types = {
        'NOUN': [],
        'VERB': [],
        'ADJ': [],
        'ADV': [],
        'PRON': [],
        'ADP': [],
        'CONJ': [],
        'INTJ': []
    }
    for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
        words = [token.text for token in pdf_doc if token.pos_ == tag]
        for word in words:
            if word not in types[tag]:
                types[tag].append(word)
    return types


def save_to_json(types, json_file_path):
    print("Saving extracted data to JSON file...")
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)


pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")

# Additional code to download the JSON file
files.download('pos_count.json')


Extracting text from PDF file...


In [None]:
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
    print("Extracting text from PDF file...")
    uploaded = files.upload()
    pdf_file = list(uploaded.keys())[0]
    pdf = PyPDF2.PdfReader(pdf_file)
    pdf_text = ""
    for page in range(len(pdf.pages)):
        pdf_text += pdf.pages[page].extract_text()
    return pdf_text

def extract_words(pdf_text):
    print("Extracting words from PDF text...")
    pdf_doc = nlp(pdf_text)
    pdf_words = [token.text for token in pdf_doc]
    types = {
        'NOUN': [],
        'VERB': [],
        'ADJ': [],
        'ADV': [],
        'PRON': [],
        'ADP': [],
        'CONJ': [],
        'INTJ': []
    }
    for tag in types.keys():
        words = [token.text for token in pdf_doc if token.pos_ == tag]
        word_counts = dict(Counter(words))
        types[tag] = [{'Wort': word, 'Häufigkeit': count}
                      for word, count in word_counts.items()]
    return types

def save_to_json(types, json_file_path):
    print("Saving extracted data to JSON file...")
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")

Extracting text from PDF file...


Saving Michael Kohlhaas (Heinrich von Kleist) (z-lib.org).pdf to Michael Kohlhaas (Heinrich von Kleist) (z-lib.org) (1).pdf
Extracting words from PDF text...
Saving extracted data to JSON file...


In [None]:
import json
from google.colab import files
import spacy
import json
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
  print("Extracting text from PDF file...")
  pdf = PyPDF2.PdfReader(pdf_file_path)
  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

def extract_words_and_add_to_json(json_file_path):
    pdf_file = files.upload()
    pdf_file = list(pdf_file.keys())[0]
    pdf_text = extract_pdf_text(pdf_file)
    pdf_doc = nlp(pdf_text)
    pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        types = json.load(json_file)

    for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
        words = [word for word, pos in pdf_tagged_words if pos == tag]
        for word in words:
            if word not in types[tag]:
                types[tag].append(word)

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)

json_file = files.upload()
json_file = list(json_file.keys())[0]
extract_words_and_add_to_json(json_file)


KeyboardInterrupt: ignored

In [None]:
!pip install spacy
!python -m spacy download de_core_news_sm


In [None]:
import json
from google.colab import files
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
  print("Extracting text from PDF file...")
  pdf = PyPDF2.PdfReader(pdf_file_path)
  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

def extract_words_and_add_to_json(json_file_path):
    pdf_file = files.upload()
    pdf_file = list(pdf_file.keys())[0]
    pdf_text = extract_pdf_text(pdf_file)
    pdf_doc = nlp(pdf_text)
    pdf_tagged_words = [(word.text, word.pos_) for word in pdf_doc]
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        types = json.load(json_file)

    for tag in ["NOUN", "ADV", "VERB", "ADJ", "PRON", "ADP", "CONJ", "INTJ"]:
        words = [word for word, pos in pdf_tagged_words if pos == tag]
        for word in words:
            if word not in types[tag]:
                types[tag].append(word)

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)

json_file = files.upload()
json_file = list(json_file.keys())[0]
extract_words_and_add_to_json(json_file)


Saving Paul Häberlin (auth.) - Philosophia Perennis_ Eine Zusammenfassung-Springer-Verlag Berlin Heidelberg (1952).pdf to Paul Häberlin (auth.) - Philosophia Perennis_ Eine Zusammenfassung-Springer-Verlag Berlin Heidelberg (1952).pdf


IndexError: ignored

In [None]:
!pip install PyPDF2
!pip install spacy
!pip install pandas
!pip install -U spacy[displacy]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[displacy]
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: 

In [None]:
!pip install PyPDF2
!pip install spacy
!pip install pandas
!pip install -U spacy[displacy]

In [None]:
pip install transformers

In [None]:
import PyPDF2
import spacy
from spacy import displacy
from collections import Counter
import pandas as pd
import transformers
from google.colab import files



# Extract text from PDF file with PYPDF2
def extract_pdf_text(pdf_file_path):
  from google.colab import files
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_file_name = list(uploaded.keys())[0]

  print(pdf.pages)

  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

 

def summarize_philosophical_pdf(pdf_path):
    # extract text from pdf
    text = extract_pdf_text("path/to/pdf_file.pdf");

    
    # load the pre-trained model
    model = transformers.PegasusForCausalLM.from_pretrained("google/pegasus-german-cased-generating")
    # generate summary
    summary = model.generate(text, 
                             max_length=150, 
                             min_length=30, 
                             length_penalty=2.0, 
                             num_beams=4, 
                             early_stopping=True)
    return summary.strip()


In [None]:
import PyPDF2
import transformers
from google.colab import files

def extract_pdf_text():
  uploaded = files.upload()
  pdf_file = list(uploaded.keys())[0]
  pdf = PyPDF2.PdfReader(pdf_file)
  pdf_file_name = list(uploaded.keys())[0]

  print(pdf.pages)

  pdf_text = ""
  for page in range(len(pdf.pages)):
      pdf_text += pdf.pages[page].extract_text()
  return pdf_text

text = extract_pdf_text()

def summarize_philosophical_pdf(t):
    # Extraktion des Texts aus der hochgeladenen PDF-Datei

    # Laden des vortrainierten Modells
    model = transformers.PegasusForCausalLM.from_pretrained("google/pegasus-german-cased-generating")
    # Generieren der Zusammenfassung
    summary = model.generate(text,
                             max_length=150,
                             min_length=30,
                             length_penalty=2.0,
                             num_beams=4,
                             early_stopping=True)
    return summary.strip()

print(summarize_philosophical_pdf(text))



Saving Volltext (PDF).pdf to Volltext (PDF) (2).pdf
<PyPDF2._page._VirtualList object at 0x7f01013f19d0>


OSError: ignored

In [None]:
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
    print("Extracting text from PDF file...")
    uploaded = files.upload()
    pdf_file = list(uploaded.keys())[0]
    pdf = PyPDF2.PdfReader(pdf_file)
    pdf_text = ""
    for page in range(len(pdf.pages)):
        pdf_text += pdf.pages[page].extract_text()
    return pdf_text


def extract_words(text):
    doc = nlp(text)
    phrasetree = {"document": []}
    paragraph = {"paragraph": [], "id": 1}
    sent_idx = 1
    word_idx = 1
    for sent in doc.sents:
        sentence = {"sentence": [], "id": sent_idx}
        for token in sent:
            sentence["sentence"].append({
                "text": token.text,
                "pos": token.pos_,
                "dep": token.dep_,
                "head": token.head.text,
                "id": word_idx
            })
            word_idx += 1
        paragraph["paragraph"].append(sentence)
        sent_idx += 1
        if sent.text.strip().endswith("\n"):
            phrasetree["document"].append(paragraph)
            paragraph = {"paragraph": [], "id": sent_idx}
    phrasetree["document"].append(paragraph)
    return phrasetree


def save_to_json(types, json_file_path):
    print("Saving extracted data to JSON file...")
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = extract_words(pdf_text)
save_to_json(types, "pos_count.json")



Extracting text from PDF file...


Saving Der große Gatsby (F. Scott Fitzgerald, Bettina Abarnell) (z-lib.org).pdf to Der große Gatsby (F. Scott Fitzgerald, Bettina Abarnell) (z-lib.org).pdf
Saving extracted data to JSON file...


In [None]:
import spacy
import json
from collections import Counter
from google.colab import files
import PyPDF2

nlp = spacy.load("de_core_news_sm")

def extract_pdf_text(pdf_file_path):
    pdf_file = "/content/Michael Kohlhaas (Heinrich von Kleist) (z-lib.org) (9).pdf"
    pdf = PyPDF2.PdfReader(pdf_file)
    pdf_text = ""
    for page in range(len(pdf.pages)):
        pdf_text += pdf.pages[page].extract_text()
    return pdf_text

def constituency_parsing(text):
    doc = nlp(text)
    parse_tree = []
    for sent in doc.sents:
        sent_tree = []
        for token in sent:
            sent_tree.append({"text": token.text, "pos": token.pos_, "dep": token.dep_, "head": token.head.text})
        parse_tree.append({"sentence": sent_tree})
    return {"document": parse_tree}

def split_paragraphs(pdf_text):
    nlp = spacy.load("de_core_news_sm")
    doc = nlp(pdf_text)
    paragraphs = [str(sent.text) for sent in doc.sents]
    print("PARA",paragraphs)
    return paragraphs



def save_to_json(types, json_file_path):
    print("Saving extracted data to JSON file...")
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(types, json_file, ensure_ascii=False)

pdf_text = extract_pdf_text("file.pdf")
types = split_paragraphs(pdf_text)
save_to_json(types, "kohl.json")


PARA [' \n \n ', 'Heinrich von Kleist  \nMichael Kohlhaas  \nrevised  by AnyBody  \n', 'Die Handlung spielt um die Mitte des 16. Jahrhunderts in Brandenburg und \nSachsen.', 'Der Pferdehändler Michael Kohlhaas gerät in einen Rechtsstreit \nmit dem Junker Wenzel von Tronka, der widerrechtlich  zwei Pferde von ihm \neinbehält und sie zu Grunde richtet.', 'Kohlhaas’ Anrufung der Gerichte bleibt \ninfolge der Intrigen von Tronkas Verwandten erfolglos.', 'Als letztes Mittel \nversucht er sein Recht durch Rebellion zu erlangen.', 'Nach seinem Überfall \nauf Wittenbe rg vermittelt Martin Luther einen Kompromiss mit dem \nKurfürsten von Sachsen, an den sich zwar Kohlhaas, aber nicht der Kurfürst \nhält.', 'Schließlich geht der Fall an Kohlhaas’ Landesherrn, den Kurfürsten \nvon Brandenburg, der einerseits der Klage gegen Tronka stattgibt, \nandererseits aber Kohlhaas wegen Aufruhr zum Tod verurteilt.', 'Kurz vor \nseiner Hinrichtung vernichtet der Pferdehändler vor den Augen des \nKurfürsten v

In [None]:
import spacy
import matplotlib.pyplot as plt
from spacy import displacy

nlp = spacy.load("de_core_news_sm")

def visualize_constituency_parsing(text):
    doc = nlp(text)
    displacy.render(doc, style="dep", jupyter=True, options={'distance': 90})
    plt.show()

text = " Wenn ich Joseph aus dem Brunnen rette, fliege ich mit ihm vom Fisch zum Mond « Der Wiedehopf sprach: »Oh, du bist von Schönheit und Feinheit und selbst im Sturz hundertfach stolz"
visualize_constituency_parsing(text)


In [None]:
!pip install spacypdfreader
import spacy


def extract_pdf_text(pdf_file_path):
    pdf_file = "/content/Michael Kohlhaas (Heinrich von Kleist) (z-lib.org) (9).pdf"
    pdf = PyPDF2.PdfReader(pdf_file)
    pdf_text = ""
    for page in range(len(pdf.pages)):
        pdf_text += pdf.pages[page].extract_text()
    return pdf_text

def split_paragraphs(pdf_text):
    nlp = spacy.load("de_core_news_sm")
    doc = nlp(text)
    paragraphs = [str(sent.text) for sent in doc.sents]
    return paragraphs


pdf_text = extract_pdf_text("/content/Michael Kohlhaas (Heinrich von Kleist) (z-lib.org) (9).pdf")
print(split_paragraphs(pdf_text))


[' ', 'Wenn ich Joseph aus dem Brunnen rette, fliege ich mit ihm vom Fisch zum Mond «', 'Der Wiedehopf sprach:', '»Oh, du bist von Schönheit und Feinheit und selbst im Sturz hundertfach stolz']


In [44]:
!pip install python-doctr
!pip install "python-doctr[tf]"
!pip install "python-doctr[torch]"

from doctr.io import DocumentFile
from doctr.models import ocr_predictormodel = ocr_predictor(det_arch = ’db_resnet50',    
                      reco_arch = ’crnn_vgg16_bn’, 
                      pretrained = True
                     )

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)
# PDF
doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
# Analyze
result = model(doc)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-doctr
  Using cached python_doctr-0.6.0-py3-none-any.whl (239 kB)
Installing collected packages: python-doctr
Successfully installed python-doctr-0.6.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons>=0.17.1
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tf2onnx>=1.9.2
  Downloading tf2onnx-1.13.0-py3-none-any.whl (442 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.3/442.3 KB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Collecting onnx>=1.4.1
  Downloading onnx-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
  Downloading onnx-1.12.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m48.5 MB/s[0m et

ModuleNotFoundError: ignored