### Dependencies

In [2]:
pip install lightning

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# file management
import sys
import shutil
import urllib
import tarfile
from pathlib import Path
import zipfile

# dataframe management
import pandas as pd

# data manipulation
import numpy as np

# for readability
from typing import Iterable
from tqdm import tqdm

### TASK 1: Corpus

Download the corpus

In [4]:
class DownloadProgressBar(tqdm):
    """tqdm progress bar whose `update_to` can be used as a `urlretrieve` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the position described by a reporthook callback.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of one block, in the bar's unit.
            tsize: Total size; when given it becomes the bar's total.
        """
        if tsize is not None:
            self.total = tsize
        # update() takes an increment, so pass the distance between the
        # absolute position reported by the hook and the bar's current count.
        transferred = b * bsize
        self.update(transferred - self.n)
        
def download_url(download_path: Path, url: str):
    """Download `url` to `download_path` while showing a progress bar.

    The content is first written to a sibling `<name>.part` file and moved
    onto `download_path` only once the transfer has finished, so an
    interrupted download never leaves a truncated file at the final location
    (callers use the existence of that file to decide whether to download).

    Args:
        download_path: Destination file for the downloaded content.
        url: Address of the resource; its last path segment labels the bar.

    Raises:
        Any exception raised by `urllib.request.urlretrieve` is propagated
        after the partial file has been removed.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so the attribute lookup below cannot fail on a fresh kernel.
    import urllib.request

    download_path = Path(download_path)
    partial_path = download_path.with_name(download_path.name + ".part")
    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(url, filename=partial_path, reporthook=t.update_to)
        # Only a complete transfer reaches this line.
        partial_path.replace(download_path)
    finally:
        # No-op after a successful move; removes the truncated file otherwise.
        if partial_path.exists():
            partial_path.unlink()

        
def download_dataset(download_path: Path, url: str):
    """Download the dataset archive from `url` into `download_path`.

    Prints a message before and after delegating to `download_url`.
    """
    print("Downloading dataset...")
    download_url(download_path, url)
    print("Download complete!")

def extract_dataset(download_path: Path, extract_path: Path):
    """Unpack the zip archive at `download_path` into `extract_path`.

    Args:
        download_path: Path of the zip archive to read.
        extract_path: Directory that receives the archive's contents.
    """
    print("Extracting dataset... (it may take a while...)")

    with zipfile.ZipFile(download_path, 'r') as archive:
        archive.extractall(path=extract_path)

    print("Extraction completed!")

In [5]:
# Source archive and the name of the folder it unpacks to.
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"
dataset_name = "dependency_treebank"

print(f"Current work directory: {Path.cwd()}")

# All data lives in a "Datasets" folder inside the current working directory.
dataset_folder = Path.cwd() / "Datasets"
# exist_ok makes this a no-op on re-runs, so no separate exists() check is needed.
dataset_folder.mkdir(parents=True, exist_ok=True)

# Both paths are derived from dataset_name so the archive name and the
# extracted folder name cannot drift apart.
dataset_zip_path = dataset_folder / f"{dataset_name}.zip"
dataset_path = dataset_folder / dataset_name

# Skip each step when a previous run already produced its result.
if not dataset_zip_path.exists():
    download_dataset(dataset_zip_path, url)

if not dataset_path.exists():
    extract_dataset(dataset_zip_path, dataset_folder)
  

Current work directory: c:\Users\Acer\Dropbox\PC\Desktop\NLP_project\A1


Encoding the corpus into a pandas.DataFrame object

The corpus contains 199 documents.

   * **Train**: Documents 1-100
   * **Validation**: Documents 101-150
   * **Test**: Documents 151-199

In [6]:
dataframe_rows = []  # one dict per document; converted to a DataFrame below

# Unzipped corpus folder, relative to the working directory. Built from path
# components so it works on every OS and avoids the invalid "\D" / "\d"
# escape sequences of a backslash-separated string literal.
dataset_folder = Path("Datasets") / "dependency_treebank"

# iterdir() yields entries in arbitrary order, and the split below depends on
# each document's position, so sort the files. Non-file entries are dropped
# first so they do not consume a document number.
document_paths = sorted(path for path in dataset_folder.iterdir() if path.is_file())

for doc_number, file_path in enumerate(document_paths, start=1):
    # assign the document to one of the three categories: train, validation, test
    if doc_number <= 100:
        split = 'train'
    elif doc_number <= 150:
        split = 'validation'
    else:
        split = 'test'

    # NOTE(review): only the first line of each document is used, so the frame
    # holds one token per document (199 rows). If the whole corpus is meant to
    # be encoded, every line has to be parsed here -- confirm the intent.
    with file_path.open(mode='r', encoding='utf-8') as text_file:
        first_line = text_file.readline()

    # Fields are tab-separated: token first, POS tag second.
    fields = first_line.strip().split('\t')
    if len(fields) < 2:
        # Empty or malformed first line: skip the document instead of reusing
        # the fields parsed for the previous one.
        continue

    dataframe_rows.append({
        "text": fields[0],
        "POS": fields[1],
        "split": split,
    })

# corpus DataFrame
corpus_df = pd.DataFrame(dataframe_rows)

Data inspection

In [7]:
# Preview the first five rows to check the columns and the split label.
corpus_df.head(n=5)

Unnamed: 0,text,POS,split
0,Pierre,NNP,train
1,Rudolph,NNP,train
2,A,DT,train
3,Yields,NNS,train
4,J.P.,NNP,train


In [8]:
# Plain-text dump of the frame followed by its row count; each section is
# terminated by a blank line.
print("Dataframe structure:")
print(corpus_df, end="\n\n")

row_count = len(corpus_df)
print("Total rows %d" % row_count, end="\n\n")

Dataframe structure:
        text  POS  split
0     Pierre  NNP  train
1    Rudolph  NNP  train
2          A   DT  train
3     Yields  NNS  train
4       J.P.  NNP  train
..       ...  ...    ...
194     John  NNP   test
195     Leon  NNP   test
196    David  NNP   test
197      Two   CD   test
198  Trinity  NNP   test

[199 rows x 3 columns]

Total rows 199



### TASK 2: Text encoding

Encode text into numerical format

In [9]:
pip install gensim


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
