In [31]:
import openai
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import os

df = pd.read_csv("./posts.csv")

openai.api_key = os.environ["OPENAI_API_KEY"]

# convert all Content to string
df["content"] = df["content"].astype(str)

# get first 10 rows
# df = df.head(50)

df["embedding"] = df["content"].apply(lambda x: openai.Embedding.create(
    input="".join(x.split("\n")[1:8]),
    model="text-embedding-ada-002"
)["data"][0]["embedding"])

In [45]:
# 20% test set
# shuffle data

df = df.sample(frac=1).reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(list(df["embedding"].values), df["category"], test_size=0.2)

In [47]:
model = RandomForestClassifier(n_estimators=100)

model.fit(list(X_train), y_train)

prediction = model.predict(list(X_test))

# get accuracy
print(model.score(list(X_test), y_test))

for i in range(len(prediction)):
    print(f"Predicted: {prediction[i]}, Actual: {y_test.iloc[i]}")

0.6842105263157895
Predicted: IndieWeb, Actual: Thermal Printer (Series)
Predicted: IndieWeb, Actual: IndieWeb
Predicted: IndieWeb, Actual: Blog Search Engine (Series)
Predicted: Coffee, Actual: Coffee
Predicted: Interviews, Actual: Interviews
Predicted: IndieWeb, Actual: IndieWeb
Predicted: IndieWeb, Actual: IndieWeb
Predicted: Interviews, Actual: Interviews
Predicted: Coffee, Actual: Coffee
Predicted: IndieWeb, Actual: IndieWeb
Predicted: Life, Actual: Life
Predicted: IndieWeb, Actual: Blog Search Engine (Series)
Predicted: Coffee, Actual: Coffee
Predicted: IndieWeb, Actual: IndieWeb
Predicted: Coffee, Actual: Coffee
Predicted: IndieWeb, Actual: Advent of Bloggers (Series)
Predicted: Coffee, Actual: Book Review
Predicted: Coffee, Actual: Coffee
Predicted: Coffee, Actual: Coffee
Predicted: Coffee, Actual: Coffee
Predicted: Coffee, Actual: Coffee
Predicted: Coffee, Actual: Coffee
Predicted: IndieWeb, Actual: IndieWeb
Predicted: Interviews, Actual: Coffee Interview
Predicted: Coffee, Ac

In [49]:
import eli5
import numpy as np

eli5.show_weights(model)

# show tree using explain prediction
# eli5.show_prediction(model, X_test[0], feature_names=list(df["embedding"].values))

Weight,Feature
0.0036  ± 0.0497,x4647
0.0036  ± 0.0345,x12097
0.0032  ± 0.0318,x6440
0.0028  ± 0.0301,x11010
0.0027  ± 0.0321,x164
0.0026  ± 0.0323,x11774
0.0025  ± 0.0344,x8367
0.0024  ± 0.0303,x12050
0.0024  ± 0.0327,x11732
0.0024  ± 0.0268,x96


In [110]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

vector = CountVectorizer()
regression = LogisticRegression()

df = pd.read_csv("./posts.csv")

df.head()

# remove null values
df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(df["content"], df["category"], test_size=0.2)

pipe = make_pipeline(vector, regression)

vectorizer = pipe.named_steps['countvectorizer']

pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [114]:
from sklearn import metrics

pipe.score(X_test, y_test)

categories = y_test
predictions = pipe.predict(X_test)

report = metrics.classification_report(y_test, predictions)

print(report)

                                  precision    recall  f1-score   support

     Advent of Bloggers (Series)       1.00      1.00      1.00         8
     Blog Search Engine (Series)       1.00      0.33      0.50         3
                     Board Games       0.00      0.00      0.00         1
                     Book Review       0.80      1.00      0.89         4
                          Coffee       0.78      1.00      0.88        21
                Coffee Resources       0.00      0.00      0.00         2
                   Coffee Review       1.00      0.67      0.80         6
                          Events       0.00      0.00      0.00         1
Guess the Scottish Cafe (Series)       1.00      1.00      1.00         1
                        IndieWeb       0.61      1.00      0.76        11
                      Interviews       1.00      1.00      1.00        10
                            Life       0.00      0.00      0.00         1
                          Photos     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
