In [26]:
import os
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import openai
import time

import dotenv

In [None]:
dotenv.load_dotenv()

# openai.api_key = os.getenv('OPENAI_API_KEY')

### Data Import

The source data for this project is located here:

http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz


In [9]:
data_path = './../data/raw/20news-bydate/20news-bydate-train'

In [12]:
# get list of category directories
categories = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]

categories

['talk.politics.mideast',
 'rec.autos',
 'comp.sys.mac.hardware',
 'alt.atheism',
 'rec.sport.baseball',
 'comp.os.ms-windows.misc',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.med',
 'talk.politics.misc',
 'rec.motorcycles',
 'comp.windows.x',
 'comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'sci.electronics',
 'talk.politics.guns',
 'sci.space',
 'soc.religion.christian',
 'misc.forsale',
 'talk.religion.misc']

Create lists to store the texts and corresponding categories.

In [13]:
texts = []
labels = []

Loop through each category directory to read the text files and assign the appropriate label.

In [None]:
for category in categories:
    category_path = os.path.join(data_path, category)

    # import all text files in the current category folder
    file_paths = glob.glob(os.path.join(category_path, '*'))

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
            texts.append(text)
            labels.append(category)


Create data frame

In [16]:
df = pd.DataFrame({'text': texts, 'category': labels})

In [17]:
print(f"Imported {len(df)} documents.")

Imported 11314 documents.


Serialize records

In [22]:
df = df.reset_index(drop=True)
df['id'] = df.index + 1

In [23]:
df.head()

Unnamed: 0,text,category,id
0,From: hm@cs.brown.edu (Harry Mamaysky)\nSubjec...,talk.politics.mideast,1
1,From: waldo@cybernet.cse.fau.edu (Todd J. Dick...,talk.politics.mideast,2
2,From: C.L.Gannon@newcastle.ac.uk (Space Cadet)...,talk.politics.mideast,3
3,From: shaig@Think.COM (Shai Guday)\nSubject: B...,talk.politics.mideast,4
4,From: koc@rize.ECE.ORST.EDU (Cetin Kaya Koc)\n...,talk.politics.mideast,5


Export data frame

In [21]:
df.to_csv('./../data/processed/train.csv')

Examining the first text file's structure

In [20]:
print(df['text'][0])

From: hm@cs.brown.edu (Harry Mamaysky)
Subject: Heil Hernlem 
In-Reply-To: hernlem@chess.ncsu.edu's message of Wed, 14 Apr 1993 12:58:13 GMT
Organization: Dept. of Computer Science, Brown University
Lines: 24

In article <1993Apr14.125813.21737@ncsu.edu> hernlem@chess.ncsu.edu (Brad Hernlem) writes:

   Lebanese resistance forces detonated a bomb under an Israeli occupation
   patrol in Lebanese territory two days ago. Three soldiers were killed and
   two wounded. In "retaliation", Israeli and Israeli-backed forces wounded
   8 civilians by bombarding several Lebanese villages. Ironically, the Israeli
   government justifies its occupation in Lebanon by claiming that it is 
   necessary to prevent such bombardments of Israeli villages!!

   Congratulations to the brave men of the Lebanese resistance! With every
   Israeli son that you place in the grave you are underlining the moral
   bankruptcy of Israel's occupation and drawing attention to the Israeli
   government's policy of rec

### Generate Embeddings

In [30]:
client = openai.OpenAI(
    api_key=os.getenv('OPENAI_API_KEY')
)

In [45]:
def get_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-large")

    return response.data[0].embedding

Add embeddings to data frame

In [None]:
embeddings = []

for idx, text in enumerate(df['text']):
    try:
        embedding = get_embedding(text)
    except Exception as e:
        print(f"Error generating embedding for index {idx}: {e}")
    
    embeddings.append(embedding)

    # if necessary ... short delay to avoid API rate limits
    # time.sleep(0.1)

Error generating embedding for index 67: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 15213 tokens (15213 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error generating embedding for index 266: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 13926 tokens (13926 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error generating embedding for index 514: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 10805 tokens (10805 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


In [None]:
df['embedding'] = embeddings

Export data frame as pickle file

In [None]:
df.to_pickle('./../data/processed/train_with_embeddings.pkl')