# Natural Language Processing Challenge

## Model: Embeddings with OpenAI API

In [2]:
#import sys
#sys.path.append('/content/sample_data')

In [3]:
# import here
import importlib
import helper_data as myhelp
importlib.reload(myhelp)

import pandas as pd
import nltk


In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

## Load the data

The data will be loaded from a .csv file. The columns are separated with tabs.

In [5]:
# Load the labelled training data; '\t' is passed to the helper as the column separator.
# Alternative relative path (presumably for a local repository layout — confirm):
#data = myhelp.load_data("../dataset/training_data_lowercase.csv", '\t')
data = myhelp.load_data("training_data_lowercase.csv", '\t')

# Load the unlabelled test data.
# NOTE(review): no separator is passed here, so the helper's default is used — confirm it is a tab.
#data_out = myhelp.load_data("../dataset/testing_data_lowercase_nolabels.csv")
data_out = myhelp.load_data("testing_data_lowercase_nolabels.csv")

## Initial visualization of the data

Let's get familiar with the data by starting to visualize the content.

In [6]:
# Print the first rows, the info() summary and the shape of the training data.
myhelp.display_data(data)

Data head():
   0                                                  1
0  0  donald trump sends out embarrassing new year‚s...
1  0  drunk bragging trump staffer started russian c...
2  0  sheriff david clarke becomes an internet joke ...
3  0  trump is so obsessed he even has obama‚s name ...
4  0  pope francis just called out donald trump duri...

Data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       34152 non-null  int64 
 1   1       34152 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.8+ KB
None

Data shape:
(34152, 2)


In [7]:
# Print the first rows, the info() summary and the shape of the test data.
# NOTE(review): column 0 shows the value 2 in every displayed row — presumably a placeholder label; confirm.
myhelp.display_data(data_out)

Data head():
   0                                                  1
0  2  copycat muslim terrorist arrested with assault...
1  2  wow! chicago protester caught on camera admits...
2  2   germany's fdp look to fill schaeuble's big shoes
4  2  u.n. seeks 'massive' aid boost amid rohingya '...

Data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       9984 non-null   object
 1   1       9984 non-null   object
dtypes: object(2)
memory usage: 156.1+ KB
None

Data shape:
(9984, 2)


## Set names for the columns

In [8]:
# Both frames share the same two-column layout: the label first, the article text second.
COLUMN_NAMES = ['label', 'article']
for frame in (data, data_out):
    frame.columns = COLUMN_NAMES

## Data preprocessing: done by OpenAI embeddings

In [9]:
#!ls -a

from dotenv import load_dotenv
import os

from openai import OpenAI
import numpy as np

# Load environment variables through python-dotenv's load_dotenv().
load_dotenv()

# Look up the OpenAI key in the environment (None when it is not set)
# and hand it to the client used by the embedding helper.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def embed_batch(texts, batch_size=100, model="text-embedding-3-large"):
  """Embed texts with the OpenAI embeddings endpoint, one batch per request.

  Uses the notebook-level `client` object to issue the requests.

  Args:
    texts: Sliceable sequence of strings (e.g. a list) to embed.
    batch_size: Number of texts sent per request; must be at least 1.
    model: Name of the embedding model passed to the API.

  Returns:
    A NumPy array built from the returned embedding vectors, ordered batch by
    batch and, within each batch, by the `index` reported for each item.
    An empty `texts` yields an empty array without calling the API.

  Raises:
    ValueError: If `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    # A negative step would make range() empty and silently return no embeddings.
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

  all_embeddings = []
  for start in range(0, len(texts), batch_size):
    batch = texts[start:start + batch_size]
    response = client.embeddings.create(model=model, input=batch)
    # Order the returned items by their reported index so the rows stay
    # aligned with the inputs regardless of the order in the response.
    ordered = sorted(response.data, key=lambda item: item.index)
    all_embeddings.extend(item.embedding for item in ordered)

  return np.array(all_embeddings)


## Split the train dataset

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled articles for validation; the fixed seed
# makes the split repeatable across runs.
articles = data['article']
labels = data['label'].values
X_train_text, X_val_text, y_train, y_val = train_test_split(
    articles, labels, test_size=0.1, random_state=42
)

In [11]:
def _embed_with_cache(texts, name, batch_size=100):
  """Return embeddings for `texts`, reusing a saved .npy file when one exists.

  The cache file name contains a hash of the texts, so a changed dataset or
  split never picks up embeddings that belong to different rows.
  NOTE: the hash does not cover the embedding model; delete the .npy files
  after changing the model inside embed_batch.
  """
  import hashlib

  digest = hashlib.sha256("\x1f".join(map(str, texts)).encode("utf-8")).hexdigest()[:16]
  cache_path = f"{name}_{digest}.npy"
  if os.path.exists(cache_path):
    return np.load(cache_path)

  embeddings = embed_batch(texts, batch_size=batch_size)
  np.save(cache_path, embeddings)
  return embeddings


# Embedding ~44k articles in batches of 100 means hundreds of remote API
# requests, so each split is cached on disk and reused on later runs.
X_train = _embed_with_cache(X_train_text.tolist(), "X_train_embeddings")
X_val = _embed_with_cache(X_val_text.tolist(), "X_val_embeddings")
X_test = _embed_with_cache(data_out['article'].tolist(), "X_test_embeddings")

## Train a RandomForest model

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the classifier. n_jobs=-1 lets scikit-learn build and query the 300
# trees on all available CPU cores; random_state still fixes the seed.
# NOTE: the name `reg` is kept because the test-prediction cell refers to it,
# even though the model is a classifier.
reg = RandomForestClassifier(n_estimators=300, random_state=100, n_jobs=-1)

# Fit the model on the training embeddings.
reg.fit(X_train, y_train)

# Predict the held-out validation split.
y_val_pred = reg.predict(X_val)

# Report overall accuracy followed by per-class precision / recall / F1.
accuracy = accuracy_score(y_val, y_val_pred)
print("Accuracy:", accuracy)

print(classification_report(y_val, y_val_pred))

Accuracy: 0.9484777517564403
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1733
           1       0.94      0.95      0.95      1683

    accuracy                           0.95      3416
   macro avg       0.95      0.95      0.95      3416
weighted avg       0.95      0.95      0.95      3416



In [13]:
# Mount Google Drive only when the google.colab package can be imported;
# elsewhere the import fails and the notebook simply carries on without a mount.
try:
  from google.colab import drive
except ImportError:
  drive = None

if drive is not None:
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Predict labels for the test dataset

In [14]:
# Run the trained forest on the test embeddings and store the predictions
# in the 'label' column of the test frame.
y_test_pred = reg.predict(X_test)
data_out = data_out.assign(label=y_test_pred)


## Save the predictions to an output file

In [15]:
# Write label and article as tab-separated columns, without header row or index.
output_columns = ['label', 'article']
data_out[output_columns].to_csv(
    'testing_data_lowercase_labels.csv',
    sep='\t',
    index=False,
    header=False,
)
