### Classification based on text (no numbers)
**Task**: auto-assigning a product category based on the product's name  (via `TF-IDF Vectorizer`)

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

df = pd.read_csv('data/devices-products-small.csv')

df.shape

(87, 2)

In [32]:
df['Category'].value_counts()

Category
computers    36
tablets      29
phones       22
Name: count, dtype: int64

In [33]:
x = df['Product'].values
y = df['Category'].values

x_train, x_test, y_train, y_test = train_test_split(x, y)

# vectorize to convert strings in 'valuable numbers'
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)


clf = RandomForestClassifier()
clf.fit(tfidf_train_vectors, y_train)

y_pred = clf.predict(tfidf_test_vectors)

y_pred

array(['tablets', 'computers', 'computers', 'phones', 'computers',
       'computers', 'computers', 'tablets', 'computers', 'tablets',
       'tablets', 'computers', 'computers', 'computers', 'tablets',
       'computers', 'phones', 'computers', 'computers', 'tablets',
       'phones', 'tablets'], dtype=object)

In [34]:
df_compare = pd.DataFrame(
    data={
        'product': x_test,
        'predicted_category': y_pred,
        'real_category': y_test
    },
    columns=['product', 'predicted_category', 'real_category']
)

df_compare

Unnamed: 0,product,predicted_category,real_category
0,"iPad Pro (12.9-inch, 1st generation)",tablets,tablets
1,"MacBook Pro (Retina, 13-inch, Mid 2014)",computers,computers
2,MacBook Pro (Mid 2010),computers,computers
3,Samsung Galaxy S21 Ultra (2021),phones,phones
4,"MacBook Pro (16-inch, 2019)",computers,computers
5,MacBook Pro (Mid 2009),computers,computers
6,"MacBook Pro (13-inch, Late 2016, Two Thunderbo...",computers,computers
7,iPad 2,tablets,tablets
8,"MacBook Pro (Retina, 15-inch, Late 2013)",computers,computers
9,iPad Mini (5th generation),tablets,tablets


In [35]:
accuracy_score(y_test, y_pred)

1.0