In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells reach it through
# the relative path ./drive/..., which presumably relies on the working
# directory being /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m112.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import re
import glob
import json
import tqdm
import pickle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Unpack the corpus archive stored on Drive into ./corpus; per the captured
# output it contains batch-01.tar.gz and batch-02.tar.gz.
!unzip ./drive/MyDrive/agribrain/corpus/batch.zip -d ./corpus

Archive:  ./drive/MyDrive/agribrain/corpus/batch.zip
  inflating: ./corpus/batch-01.tar.gz  
  inflating: ./corpus/batch-02.tar.gz  


In [None]:
import tarfile

# Unpack both gzip-compressed batch tarballs into ./corpus/.
# The archives are extracted as-is, so they are assumed to be trusted.
for archive_path in ("./corpus/batch-01.tar.gz", "./corpus/batch-02.tar.gz"):
    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall("./corpus/")

In [None]:
class TextFilesParser:
    """Clean plain-text files and keep only prose-like paragraphs.

    Instantiating the class performs the work: when ``source_dir`` is given,
    every regular file in it whose extension is listed in ``extensions`` is
    read as UTF-8, passed through :meth:`paragraph_parser`, and - if any
    paragraph survives the filters - written under the same file name into
    ``output_dir``.

    Args:
        source_dir: Directory holding the raw text files. When falsy, only the
            output directory is created and nothing is parsed.
        output_dir: Destination directory. Defaults to
            ``"./txt-parser-ouputs/"``; whitespace runs in the path are
            replaced by ``"_"`` and the directory is created if missing.
        extensions: File extensions (with leading dot) that are processed.
    """

    # Regexes shared by the parsing helpers, compiled once per class.
    _WHITESPACE = re.compile(r"\s+")
    _URL = re.compile(r"www?(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
    _EMAIL = re.compile(r'^[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*(\.[a-zA-Z]{2,})$')
    _NON_ASCII = re.compile(r'[^\x00-\x7F]')

    # Punctuation whose density is used to reject list/table-like paragraphs.
    _STOP_SYMBOLS = (".", ",", ":", ";")

    def __init__(self, source_dir=None, output_dir=None, extensions=(".txt",)):
        output_dir = output_dir if output_dir else "./txt-parser-ouputs/"
        assert isinstance(output_dir, str)
        output_dir = re.sub(r"\s+", "_", output_dir)
        os.makedirs(output_dir, exist_ok=True)
        if not source_dir:
            return

        for file_name in os.listdir(source_dir):
            file_path = os.path.join(source_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            if os.path.splitext(file_name)[1] not in extensions:
                continue

            with open(file_path, mode="r", encoding="utf-8") as source_file:
                content = source_file.read()

            parsed_content = self.paragraph_parser(content)
            # paragraph_parser returns None when no paragraph passes the
            # filters; in that case no output file is created.
            if isinstance(parsed_content, str):
                output_file = os.path.join(output_dir, file_name)
                with open(output_file, mode="w", encoding="utf-8") as target_file:
                    target_file.write(parsed_content)

    def line_parser(self, line):
        """Normalise one line of text.

        Whitespace runs (tabs included) collapse to a single space, matches of
        the URL pattern become ``[URL_ADDRESS]`` and a line that consists of
        nothing but an e-mail address becomes ``[EMAIL_ADDRESS]``.

        Returns:
            The cleaned line, or ``None`` when the line still contains ``"@"``
            or is empty / a single space after cleaning.
        """
        line = self._WHITESPACE.sub(" ", line)
        line = self._URL.sub('[URL_ADDRESS]', line)
        line = self._EMAIL.sub('[EMAIL_ADDRESS]', line)
        if "@" in line:
            return None
        if line and line != " ":
            return line
        return None

    def paragraph_parser(self, corpus):
        """Filter ``corpus`` down to its prose-like paragraphs.

        Non-ASCII characters are removed, the text is split on blank lines and
        a paragraph is kept only when it has more than 8 usable lines, at
        least 8 words per line on average, and fewer than 3 occurrences per
        line of each stop symbol (``. , : ;``) on average.

        Returns:
            The kept paragraphs separated by a blank line, each with its lines
            joined by single spaces, or ``None`` when nothing is kept.
        """
        # Despite the original "non utf-8" wording this strips everything
        # outside the 7-bit ASCII range.
        corpus = self._NON_ASCII.sub("", corpus)

        kept_paragraphs = []
        for paragraph in corpus.split("\n\n"):
            paragraph = paragraph.strip()
            # Parse every line exactly once and drop the rejected ones.
            sents = [
                parsed
                for parsed in map(self.line_parser, paragraph.split("\n"))
                if parsed
            ]

            num_sents = len(sents)
            if num_sents <= 8:
                continue

            # Average number of words per line.
            average_words = np.mean([len(sent.split()) for sent in sents])
            if average_words < 8:
                continue

            # Average number of stop symbols per line. str.count gives the
            # real number of occurrences; the previous len(split(...)) was
            # always one too high for every symbol.
            symbols_per_line = (
                sum(paragraph.count(symbol) for symbol in self._STOP_SYMBOLS)
                / num_sents
            )
            if symbols_per_line / len(self._STOP_SYMBOLS) >= 3:
                continue

            # Join with a space so the last word of one line is not fused with
            # the first word of the next. The lines were already sanitised by
            # line_parser, so no further URL/e-mail pass is needed here.
            kept_paragraphs.append(" ".join(sent.strip() for sent in sents))

        if kept_paragraphs:
            return "\n\n".join(kept_paragraphs)
        return None

In [None]:
class TextFileClusters(KMeans):
    """Cluster the ``.txt`` files of a directory with TF-IDF + K-Means.

    Instantiating the class performs the whole pipeline: the files in
    ``source_dir`` are read, vectorised with a ``TfidfVectorizer`` (English
    stop words), clustered with ``KMeans`` and finally copied into
    ``<output_dir>/cluster-<label>/``.

    The fitted estimator and vectorizer are exposed as ``files_kmeans`` and
    ``files_text_vectorizer``. Although the class derives from ``KMeans``, the
    base-class initialiser is not invoked; use ``files_kmeans`` for
    predictions.

    Args:
        source_dir: Existing directory containing the ``.txt`` files.
        output_dir: Root directory for the cluster folders. Defaults to
            ``"./txt-cluster-output/"``; whitespace runs are replaced by ``"_"``.
        n_clusters: Number of clusters passed to ``KMeans``.
        random_state: Seed passed to ``KMeans``.
        max_iter: Iteration limit passed to ``KMeans``.

    Raises:
        AssertionError: If ``output_dir`` is not a string or ``source_dir``
            is missing / does not exist.
        ValueError: If ``source_dir`` contains no ``.txt`` files.
    """

    def __init__(self, source_dir=None, output_dir=None, n_clusters=5, random_state=42, max_iter=1000000):
        output_dir = output_dir if output_dir else "./txt-cluster-output/"
        assert isinstance(output_dir, str)
        self.output_dir = re.sub(r"\s+", "_", output_dir)

        self.n_clusters = n_clusters
        self.random_state = random_state
        self.max_iter = max_iter
        self.source_dir = source_dir

        # Guard against None first: os.path.exists(None) raises a TypeError
        # instead of producing the intended message.
        assert self.source_dir and os.path.exists(self.source_dir), "self.source_dir not found"

        self.files_kmeans = KMeans(n_clusters=self.n_clusters,
                                   random_state=self.random_state,
                                   n_init="auto",
                                   max_iter=self.max_iter)

        self.files_text_vectorizer = TfidfVectorizer(stop_words="english")

        data = self._get_text_data()
        if not data:
            raise ValueError("no .txt files found in {}".format(self.source_dir))
        lbls = self._kmeans_clustering(data)
        self._save_clusters(lbls)

    def save_components(self, save_dir: str, overwrite_dir=False):
        """Pickle the fitted K-Means model and the vectorizer into ``save_dir``.

        Writes ``kmeans-model.sav`` and ``text-vectorizer.sav``.

        Raises:
            Exception: If ``save_dir`` already exists and ``overwrite_dir`` is
                false.
        """
        if os.path.exists(save_dir) and not overwrite_dir:
            raise Exception("""
            K-model: {} exists. Select a different name or set to overwrite contents.
            """.format(save_dir))
        os.makedirs(save_dir, exist_ok=True)
        model_path = os.path.join(save_dir, "kmeans-model.sav")
        vectorizer_path = os.path.join(save_dir, "text-vectorizer.sav")
        # Context managers make sure both files are flushed and closed.
        with open(model_path, mode="wb") as model_file:
            pickle.dump(self.files_kmeans, model_file)
        with open(vectorizer_path, mode="wb") as vectorizer_file:
            pickle.dump(self.files_text_vectorizer, vectorizer_file)
        print("Model, {} saved successfully.".format(save_dir))

    def _list_source_files(self):
        """Return the ``.txt`` files of ``source_dir`` in a deterministic order."""
        return sorted(glob.glob(os.path.join(self.source_dir, "*.txt")))

    def _get_text_data(self):
        """Read every source file and return the list of their contents.

        The file list is remembered in ``self._source_files`` so that the
        cluster labels can later be matched to exactly the same files in
        exactly the same order.
        """
        self._source_files = self._list_source_files()
        textdata = []
        for filepath in self._source_files:
            with open(filepath, mode="r", encoding="utf-8") as context:
                textdata.append(context.read())
        return textdata

    def _kmeans_clustering(self, text_data):
        """Fit the vectorizer and K-Means on ``text_data``; return the labels."""
        inputs = self.files_text_vectorizer.fit_transform(text_data)
        self.files_kmeans.fit(inputs)
        return self.files_kmeans.labels_

    def _save_clusters(self, cluster_labels):
        """Copy each source file into the folder of its cluster label.

        A ``cluster-<i>`` folder is created for every ``i`` from 0 up to the
        largest label, even when no file received that label.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Reuse the list recorded while reading the data; re-globbing here
        # could return the files in a different order than the labels.
        source_files = getattr(self, "_source_files", None)
        if source_files is None:
            source_files = self._list_source_files()

        for label in range(int(max(cluster_labels)) + 1):
            os.makedirs(os.path.join(self.output_dir, f"cluster-{label}"), exist_ok=True)

        # Single pass over the files instead of one pass per cluster.
        for filename, label in zip(source_files, cluster_labels):
            outfile = os.path.join(self.output_dir, f"cluster-{label}", os.path.basename(filename))
            with open(filename, mode="r", encoding="utf-8") as readctx, \
                    open(outfile, mode="w", encoding="utf-8") as writectx:
                writectx.write(readctx.read())

In [None]:
# Preprocess both raw batches. Constructing TextFilesParser does all the work,
# so the instances are not kept; looping also avoids echoing a meaningless
# object repr as the cell output.
for batch_name in ("batch-01", "batch-02"):
    TextFilesParser(
        source_dir=f"./corpus/{batch_name}/",
        output_dir=f"./drive/MyDrive/agribrain/processed-corpus/{batch_name}/",
    )

<__main__.TextFilesParser at 0x7f73139b5850>

In [None]:
!mkdir ./drive/MyDrive/agribrain/processed-corpus-collection/
!cp ./drive/MyDrive/agribrain/processed-corpus/batch-01/*.txt ./drive/MyDrive/agribrain/processed-corpus-collection/
!cp ./drive/MyDrive/agribrain/processed-corpus/batch-02/*.txt ./drive/MyDrive/agribrain/processed-corpus-collection/

In [None]:
# Cluster the merged corpus: the constructor vectorises the files, fits
# K-Means and copies every file into its cluster folder.
kmodel = TextFileClusters(
    source_dir="./drive/MyDrive/agribrain/processed-corpus-collection/",
    output_dir="./drive/MyDrive/agribrain/clustered-corpus/",
    n_clusters=128,
    random_state=42,
    max_iter=10000000,
)

In [None]:
# Sanity check: look up the cluster assigned to a short free-text query.
tea_query_features = kmodel.files_text_vectorizer.transform(["Tea farming"])
kmodel.files_kmeans.predict(tea_query_features)

array([44], dtype=int32)

In [None]:
# Pickle the fitted K-Means model and the TF-IDF vectorizer to Drive.
# overwrite_dir=True lets the call succeed when the folder already exists.
kmodel.save_components("./drive/MyDrive/agribrain/kmodels/K-files-cluster/", overwrite_dir=True)

Model, ./drive/MyDrive/agribrain/kmodels/K-files-cluster/ saved successfully.


In [None]:
def load_kmodel(model_name):
    """Load the pickled K-Means model stored in the directory ``model_name``.

    Args:
        model_name: Directory that contains ``kmeans-model.sav``.

    Returns:
        The unpickled object.

    Raises:
        AssertionError: If ``model_name`` does not exist.
    """
    assert os.path.exists(model_name), "Model does not exist"
    model_path = os.path.join(model_name, "kmeans-model.sav")
    # SECURITY: unpickling executes arbitrary code - only load trusted files.
    # The context manager closes the handle the original left open.
    with open(model_path, mode="rb") as model_file:
        return pickle.load(model_file)

def load_kvec(model_name):
    """Load the pickled text vectorizer stored in the directory ``model_name``.

    Args:
        model_name: Directory that contains ``text-vectorizer.sav``.

    Returns:
        The unpickled object.

    Raises:
        AssertionError: If ``model_name`` does not exist.
    """
    assert os.path.exists(model_name), "Model does not exist"
    vectorizer_path = os.path.join(model_name, "text-vectorizer.sav")
    # SECURITY: unpickling executes arbitrary code - only load trusted files.
    # The context manager closes the handle the original left open.
    with open(vectorizer_path, mode="rb") as vectorizer_file:
        return pickle.load(vectorizer_file)

In [None]:
# Reload the two pickles written by save_components to check the round trip.
# NOTE(review): `model` is rebound to the BART summarizer in a later cell
# (BartForConditionalGeneration.from_pretrained); after that cell has run,
# `model` no longer refers to the K-Means estimator.
vectorizer = load_kvec("./drive/MyDrive/agribrain/kmodels/K-files-cluster/")
model = load_kmodel("./drive/MyDrive/agribrain/kmodels/K-files-cluster/")

In [None]:
# Same query as before, now through the reloaded objects; the captured output
# shows the same cluster id as the in-memory model.
model.predict(vectorizer.transform(["Tea farming"]))

array([44], dtype=int32)

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

In [None]:
# Load the tokenizer and the seq2seq model for the
# "philschmid/bart-large-cnn-samsum" checkpoint; generate_summary reads both
# names as globals.
# NOTE(review): this rebinds `model`, which an earlier cell used for the
# K-Means estimator returned by load_kmodel.
tokenizer = BartTokenizer.from_pretrained("philschmid/bart-large-cnn-samsum")
model = BartForConditionalGeneration.from_pretrained("philschmid/bart-large-cnn-samsum")

In [None]:
def generate_summary(input_text):
    """Summarize ``input_text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is encoded (truncated to at most 1024 tokens), a summary of
    10 to 480 tokens is generated with 4-beam search and early stopping, and
    the first generated sequence is decoded without special tokens.

    Args:
        input_text: Text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded_input = tokenizer.encode(
        input_text,
        return_tensors='pt',
        truncation=True,
        max_length=1024,
    )

    generation_options = dict(
        max_length=480,
        min_length=10,
        num_beams=4,
        early_stopping=True,
    )
    generated_ids = model.generate(encoded_input, **generation_options)

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
!mkdir ./drive/MyDrive/agribrain/processed-corpus-summary/

In [None]:
# Folder on Drive for the generated summaries. Presumably meant to be passed
# as `dest_dir` to mass_corpus_summarization; no such call is visible in this
# notebook. The function parameter of the same name shadows this global.
dest_dir = "./drive/MyDrive/agribrain/processed-corpus-summary/"

def mass_corpus_summarization(source_dir, dest_dir):
    """Summarize every ``.txt`` file of ``source_dir`` into ``dest_dir``.

    ``dest_dir`` is created when missing. Each ``.txt`` entry is read as
    UTF-8, passed to ``generate_summary`` and the result is written to a file
    of the same name in ``dest_dir``; entries with another extension are
    skipped.

    Args:
        source_dir: Directory containing the texts to summarize.
        dest_dir: Directory receiving one summary file per input file.
    """
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    for entry in os.listdir(source_dir):
        if os.path.splitext(entry)[1] != ".txt":
            continue

        with open(os.path.join(source_dir, entry), mode="r", encoding="utf-8") as source_file:
            text = source_file.read()

        with open(os.path.join(dest_dir, entry), mode="w", encoding="utf-8") as summary_out:
            summary_out.write(generate_summary(text))

In [None]:
# Smoke test: run generate_summary on a tiny hand-written text and print the
# result to confirm the tokenizer/model pair works end to end.
testing_text = """
  This is some sample text that I want to summarize. 
  It is not very long, but it should be enough to test the function.
"""
summary = generate_summary(testing_text)
print(summary)

The sample text is enough to test the function.
