In [1]:
import pandas as pd
import re
import os
from ensure import ensure_annotations
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from keras.layers import TextVectorization
import gc


In [2]:
def getData( path: Path) -> pd.DataFrame:
    data = []
    try:
        print(f"CWD ====={os.getcwd()}")
        # Collect all the training files from raw data
        for file_path in path.iterdir():
            df = pd.read_csv(file_path)
            data.append(df)
        df = pd.concat(data)
        return df
    except Exception as e:
        print(e)

In [3]:
def loadData( train: bool = False) -> pd.DataFrame:
    try:
        if train:
            train_df = getData(Path("c:/Users/ANOOP/Desktop/Python/ai-porfolio/llm-text-detection/src/llmTextDetection/data/train"))
            train_df["label_name"] = train_df["label"].map({0:"Human",1:"AI"})
            return train_df
        else:
            test_df = getData(Path("c:/Users/ANOOP/Desktop/Python/ai-porfolio/llm-text-detection/src/llmTextDetection/data/test"))
            return test_df
    except Exception as e:
        print(e)

In [4]:
train_df = loadData(train=True)

CWD =====c:\Users\ANOOP\Desktop\Python\ai-porfolio\llm-text-detection\src\llmTextDetection\notebooks


In [5]:
len(train_df)

44868

In [6]:
def getRegexExclusions(df: pd.DataFrame) -> tuple:
    # Regex pattern for excluding emojis
    exclude = re.compile(
        "["
        "\u000A"  # new-line
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese char
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        re.UNICODE,
    )
    regex_pattern = exclude.pattern

    # Regex pattern for excluding html tags
    html_exclude = re.compile(r"<.*?>")
    html_pattern = html_exclude.pattern

    return ( regex_pattern, html_pattern)

regex_pattern, html_pattern = getRegexExclusions(train_df)
def standardizeText( input_data):
    data = tf.strings.lower(input_data)
    data = tf.strings.regex_replace(data, regex_pattern, "")
    data = tf.strings.regex_replace(data, html_pattern, "")
    return data

In [7]:
regex_pattern,html_pattern
# tf.keras.utils.get_custom_objects()['standardizeText'] = standardizeText

('[\n😀-🙏🌀-🗿🚀-\U0001f6ff\U0001f1e0-🇿─-⯯✂-➰Ⓜ-🉑🤦-🤷𐀀-\U0010ffff♀-♂☀-⭕\u200d⏏⏩⌚️〰]+',
 '<.*?>')

In [12]:
def buildVectorizationLayer(texts: list, df: pd.DataFrame):
        # Get Regex patterns
        (
            regex_pattern,
            html_exclude,
        ) = getRegexExclusions(df)

        vectorization_layer = TextVectorization(
            # standardize=standardizeText,
            max_tokens=35000,
            output_mode="int",
            output_sequence_length=250,
            input_shape = (1,)
        )
        train_text = tf.data.Dataset.from_tensor_slices(texts)
        train_text = train_text.map(standardizeText)

        
        vectorization_layer.adapt(train_text)
        
        return vectorization_layer

In [13]:
texts = train_df["text"].to_list()
vectorizer = buildVectorizationLayer(texts=texts,df =train_df)


In [14]:
vectorizer("test")

<tf.Tensor: shape=(250,), dtype=int64, numpy=
array([616,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  

In [76]:
config = vectorizer.get_config()

In [125]:
dummy_model = tf.keras.models.Sequential([vectorizer])

In [126]:

dummy_model(tf.constant(["test"], dtype=tf.string))

<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[717,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   

In [127]:
dummy_model.summary()
dummy_model.compile()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  (None, 250)              0         
 ectorization)                                                   
                                                                 
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [116]:
dummy_model(tf.constant(["test"], dtype=tf.string))

<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[717,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   

In [79]:
dummy_model.save_weights("vectorization_layer_weights1")

In [128]:
dummy_model.save("model_save")

INFO:tensorflow:Assets written to: model_save\assets


In [129]:
del dummy_model
del vectorizer
gc.collect()

8557

In [87]:
reconstruct_vectorizer = TextVectorization.from_config(config=config)

In [88]:
reconstruct_vectorizer.build((1,))

In [89]:
reconstruct_model = tf.keras.models.Sequential([reconstruct_vectorizer])

In [90]:
reconstruct_model.load_weights("vectorization_layer_weights1")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x23efcc50c10>

In [91]:
reconstruct_model("testing")



FailedPreconditionError: Exception encountered when calling layer "string_lookup_9" "                 f"(type StringLookup).

{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2]

Call arguments received by layer "string_lookup_9" "                 f"(type StringLookup):
  • inputs=tf.Tensor(shape=(1,), dtype=string)

In [130]:
reloaded_model = tf.keras.models.load_model("model_save")

In [135]:
result = reloaded_model(tf.constant("test",dtype=tf.string))

In [136]:
result.shape

TensorShape([250])