### Text classification on a bigger .csv (but ~90% accuracy score)


In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the device/product catalogue; the cells below use its
# 'Product' (text) and 'Category' (label) columns.
df = pd.read_csv(filepath_or_buffer='data/devices-products.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Product,Category
0,Apple 13-inch MacBook Air (M1 CPU) 256GB - 2020,Laptop
1,Apple 13-inch MacBook Air (M1 CPU) 512GB - 2020,Laptop
2,Apple 13-inch MacBook Air with Retina display ...,Laptop
3,Apple 13-inch MacBook Air with Retina display ...,Laptop
4,Apple 13-inch MacBook Pro (1.4GHz quad-core CP...,Laptop


In [16]:
# Dataset size as (number of rows, number of columns).
df.shape

(1226, 2)

In [17]:
# Number of products per category, most frequent first. Useful for spotting
# class imbalance before training a classifier on these labels.
df['Category'].value_counts()

Category
Laptop            452
Monitor           296
Desktop           259
Server             55
Smartphone         50
IoT                30
Tablet             22
Thin Client        16
Printer            11
Hard drive         11
Gaming              5
Workstation         4
Multimedia          4
Network             4
Entertainment       2
Converged Edge      2
Converged           2
SAN/NAS             1
Name: count, dtype: int64

In [18]:
# Features are the raw product names, labels are their categories.
x = df['Product'].values
y = df['Category'].values

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# vectorize
# The vectorizer is fitted on the training texts only and then reused, unchanged,
# to transform the test texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_x_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)

# fit model and predict
# random_state pins the forest's internal randomness so that re-running the
# notebook yields the same predictions and the same accuracy score.
clf_random_forest = RandomForestClassifier(random_state=0)
clf_random_forest.fit(tfidf_x_train_vectors, y_train)
y_pred = clf_random_forest.predict(tfidf_test_vectors)


In [19]:
# Side-by-side table: each test product with its predicted and its true category.
# The dict's insertion order determines the column order.
df_compare = pd.DataFrame({
    'product': x_test,
    'predicted_category': y_pred,
    'real_category': y_test,
})

df_compare

Unnamed: 0,product,predicted_category,real_category
0,Lenovo ThinkPad T440s,Laptop,Laptop
1,Apple 13-inch MacBook Pro (2.0GHz quad-core CP...,Laptop,Laptop
2,Lenovo IdeaPad Slim 7 15/Yoga Slim 7 15,Laptop,Laptop
3,Dell E2220H Monitor,Monitor,Monitor
4,Lexmark CX924dxe,Printer,Printer
...,...,...,...
241,Dell PowerEdge T130,Server,Server
242,Apple 14-inch MacBook Pro with 64GB,Laptop,Laptop
243,Seagate Makara HDD 8TB,Hard drive,Hard drive
244,HP E24i G4 HO,Monitor,Monitor


In [20]:
# Fraction of test products whose predicted category equals the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9146341463414634

### Comparing accuracy between different algorithms

In [22]:
# Importing libraries for other algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [25]:
# Reload the data and rebuild the train/test split and TF-IDF features that
# every classifier in this section is trained and evaluated on.
df = pd.read_csv('data/devices-products.csv')
x = df['Product'].values
y = df['Category'].values

# Fixed random_state: all algorithms below are compared on the same 80/20 split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Fit the vectorizer on the training texts only, then apply it to the test texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_x_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_x_test_vectors = tfidf_vectorizer.transform(x_test)

# 1st algorithm
# random_state pins the forest's internal randomness so the score reported
# here is reproducible from one run of the notebook to the next.
clf_random_forest = RandomForestClassifier(random_state=0)
clf_random_forest.fit(tfidf_x_train_vectors, y_train)
y_random_forest_pred = clf_random_forest.predict(tfidf_x_test_vectors)

accuracy_score(y_test, y_random_forest_pred)



0.9146341463414634

In [27]:
# 2nd algorithm
clf_knn = KNeighborsClassifier(n_neighbors=19)
clf_knn.fit(tfidf_x_train_vectors, y_train)
y_knn_pred = clf_knn.predict(tfidf_x_test_vectors)
accuracy_score(y_test, y_knn_pred)

0.8577235772357723

In [28]:
# 3rd algotithm
clf_nb = MultinomialNB()
clf_nb.fit(tfidf_x_train_vectors, y_train)
y_nb_pred = clf_nb.predict(tfidf_x_test_vectors)
accuracy_score(y_test, y_nb_pred)

0.8536585365853658

In [29]:
# 4th algorithm
clf_svc = LinearSVC(dual=True)
clf_svc.fit(tfidf_x_train_vectors, y_train)
y_svc_pred = clf_svc.predict(tfidf_x_test_vectors)
accuracy_score(y_test, y_svc_pred)

0.967479674796748

In [30]:
# 5th algorithm
clf_logreg = LogisticRegression()
clf_logreg.fit(tfidf_x_train_vectors, y_train)
y_logreg_pred = clf_logreg.predict(tfidf_x_test_vectors)
accuracy_score(y_test, y_logreg_pred)

0.9146341463414634

In [31]:
# the best algorithm by accuracy is..
# Each score is recomputed from that model's test-set predictions instead of
# being pasted in by hand, so this summary always matches the current run.
{
    'random_forest': accuracy_score(y_test, y_random_forest_pred),
    'k_nearest_neighbors': accuracy_score(y_test, y_knn_pred),
    'naive_baynes': accuracy_score(y_test, y_nb_pred),
    'support_vector_machines': accuracy_score(y_test, y_svc_pred),
    'logistic_regression': accuracy_score(y_test, y_logreg_pred),
}


{'random_forest': 0.9146341463414634,
 'k_nearest_neighbors': 0.8577235772357723,
 'naive_baynes': 0.8536585365853658,
 'support_vector_machines': 0.967479674796748,
 'logistic_regression': 0.9146341463414634}

### Compare Algorithm Speed Performance

In [33]:
# TODO: measure and compare the training and prediction time of each classifier above


