In [2]:
%pip install --upgrade numexpr

Collecting numexpr
  Downloading numexpr-2.11.0.tar.gz (108 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: numexpr
  Building wheel for numexpr (pyproject.toml) ... [?25ldone
[?25h  Created wheel for numexpr: filename=numexpr-2.11.0-cp310-cp310-linux_x86_64.whl size=149340 sha256=647b1c27706dc2bfc16dd0aafac67a0b2d4314c37779867e898f72e8f7d3152c
  Stored in directory: /home/ec2-user/.cache/pip/wheels/a7/d0/17/e38daa1110f54ba5f7330d38440f592c063251a6456053e2ed
Successfully built numexpr
Installing collected packages: numexpr
  Attempting uninstall: numexpr
    Found existing installation: numexpr 2.7.3
    Uninstalling numexpr-2.7.3:
      Successfully uninstalled numexpr-2.7.3
Successfully installed numexpr-2.11.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import sagemaker
from sagemaker import get_execution_role

session = sagemaker.Session()
role = get_execution_role()
bucket_name = 'api-cdv-project'

df_fake = pd.read_parquet(f's3://{bucket_name}/data/processed/fake-news.snappy.parquet')
df_true = pd.read_parquet(f's3://{bucket_name}/data/processed/true-news.snappy.parquet')

df_fake['label'] = 'fake'
df_true['label'] = 'true'

df = pd.concat([df_fake, df_true], ignore_index=True)
df['bt_line'] = "__label__" + df['label'] + " " + df['text']

train_file = 'news-train.txt'
df.bt_line.to_csv(train_file, index=False, header=False)

session.upload_data(train_file, bucket=bucket_name, key_prefix='data/input')
print("Training data saved in S3")

Training data saved in S3


In [13]:
from collections import defaultdict
import random

def balance_data(input_file, output_file):
    data = []
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(line)

    class_data = defaultdict(list)
    for line in data:
        parts = line.split()
        if len(parts) == 0:
            continue
        label = parts[0]
        class_data[label].append(line)

    max_len = max(len(v) for v in class_data.values())

    balanced_data = []
    for label, lines in class_data.items():
        multiplier = max_len // len(lines)
        remainder = max_len % len(lines)
        balanced_data.extend(lines * multiplier)
        balanced_data.extend(random.sample(lines, remainder))

    random.shuffle(balanced_data)

    with open(output_file, "w") as f:
        for line in balanced_data:
            f.write(line + "\n")


balanced_train_file = "news-train-balanced.txt"
balance_data(train_file, balanced_train_file)
session.upload_data(balanced_train_file, bucket=bucket_name, key_prefix='data/input')
print("Balanced training data saved in S3")

Balanced training data saved in S3


In [10]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

bt_container = sagemaker.image_uris.retrieve(region=session.boto_region_name, framework='blazingtext')

bt = Estimator(
    image_uri=bt_container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size=30,
    max_run=3600,
    input_mode='File',
    output_path=f's3://{bucket_name}/data/output',
    sagemaker_session=session
)

bt.set_hyperparameters(
    mode='supervised',
    epochs=50,
    min_count=2,
    learning_rate=0.02,
    vector_dim=100,
    word_ngrams=2
)

balanced_train_file = "news-train-balanced.txt"
train_input = TrainingInput(f's3://{bucket_name}/data/input/{balanced_train_file}', content_type='text/plain')

bt.fit({'train': train_input})


bt_model = bt.create_model()

predictor = bt_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='check-news-4'
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: blazingtext-2025-07-07-20-04-32-922


2025-07-07 20:04:34 Starting - Starting the training job...
2025-07-07 20:04:49 Starting - Preparing the instances for training...
2025-07-07 20:05:10 Downloading - Downloading input data...
2025-07-07 20:06:06 Downloading - Downloading the training image...
2025-07-07 20:06:11 Training - Training image download completed. Training in progress...[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[07/07/2025 20:06:16 INFO 140129556813632] nvidia-smi took: 0.0252230167388916 secs to identify 0 gpus[0m
[34m[07/07/2025 20:06:16 INFO 140129556813632] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[07/07/2025 20:06:16 INFO 140129556813632] Processing /opt/ml/input/data/train/news-train-balanced.txt . File size: 206.85100078582764 MB[0m
[34mRead 10M words[0m
[34mRead 20M words[0m
[34mRead 30M words[0m
[34mRead 35M words[0m
[34mNumber of words:  195218[0m
[34m##

INFO:sagemaker:Creating model with name: blazingtext-2025-07-07-20-10-53-289


Training seconds: 325
Billable seconds: 325


INFO:sagemaker:Creating endpoint-config with name check-news-4
INFO:sagemaker:Creating endpoint with name check-news-4


-----!

In [18]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

title = "Breaking News"
text = "Barack Obama won president"
input_text = f"{title} {text}"


response = predictor.predict({"instances": [input_text]})
print(response)

[{'label': ['__label__fake'], 'prob': [1.0000100135803223]}]
