### Classification based on text (no numbers)
**Task**: auto-assigning a product category based on the product's name  (via `TF-IDF Vectorizer`)

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

df = pd.read_csv('data/devices-products-small.csv')

df.shape

(87, 2)

In [39]:
df['Category'].value_counts()

Category
computers    36
tablets      29
phones       22
Name: count, dtype: int64

In [40]:
x = df['Product'].values
y = df['Category'].values

x_train, x_test, y_train, y_test = train_test_split(x, y)

# vectorize to convert strings in 'valuable numbers'
tfidf_vectorizer = TfidfVectorizer()
tfidf_x_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)


clf = RandomForestClassifier()
clf.fit(tfidf_x_train_vectors, y_train)

y_pred = clf.predict(tfidf_test_vectors)

y_pred

array(['tablets', 'tablets', 'tablets', 'tablets', 'tablets', 'computers',
       'phones', 'phones', 'tablets', 'computers', 'computers', 'tablets',
       'computers', 'phones', 'computers', 'tablets', 'tablets', 'phones',
       'phones', 'computers', 'computers', 'phones'], dtype=object)

In [41]:
df_compare = pd.DataFrame(
    data={
        'product': x_test,
        'predicted_category': y_pred,
        'real_category': y_test
    },
    columns=['product', 'predicted_category', 'real_category']
)

df_compare

Unnamed: 0,product,predicted_category,real_category
0,iPad Air 2,tablets,tablets
1,iPad Air,tablets,tablets
2,iPad Mini 2,tablets,tablets
3,"iPad Pro (11-inch, 2nd generation)",tablets,tablets
4,iPad (4th generation),tablets,tablets
5,MacBook Pro (2006),computers,computers
6,Samsung Galaxy S5 (2014),phones,phones
7,Samsung Galaxy S21 (2021),phones,phones
8,"iPad Pro (12.9-inch, 3rd generation)",tablets,tablets
9,MacBook Pro (Early 2008),computers,computers


In [42]:
accuracy_score(y_test, y_pred)

1.0

In [49]:
future_x_test = tfidf_vectorizer.transform(['Samsung Galaxy S39', 'iPad Future'])

future_y_pred = clf.predict(future_x_test)
future_y_pred

array(['phones', 'tablets'], dtype=object)