# TF-IDF vectorizer

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["They're playing the piano while flying in the plane",
 'The team members were hard to tell apart since they all wore their hair in a ponytail',
 'He put heat on the wound to see what would grow',
 'Pair your designer cowboy hat with scuba gear for a memorable occasion',
 'He appeared to be confusingly perplexed',
 'I caught my squirrel rustling through my gym bag',
 "The body piercing didn't go exactly as he expected",
 '']

In [10]:
vector = TfidfVectorizer()
vector.fit_transform(corpus)
print(vector.vocabulary_)

{'they': 49, 're': 39, 'playing': 36, 'the': 47, 'piano': 33, 'while': 54, 'flying': 14, 'in': 25, 'plane': 35, 'team': 45, 'members': 26, 'were': 52, 'hard': 21, 'to': 51, 'tell': 46, 'apart': 1, 'since': 43, 'all': 0, 'wore': 56, 'their': 48, 'hair': 20, 'ponytail': 37, 'he': 23, 'put': 38, 'heat': 24, 'on': 30, 'wound': 58, 'see': 42, 'what': 53, 'would': 57, 'grow': 18, 'pair': 31, 'your': 59, 'designer': 10, 'cowboy': 9, 'hat': 22, 'with': 55, 'scuba': 41, 'gear': 16, 'for': 15, 'memorable': 27, 'occasion': 29, 'appeared': 2, 'be': 5, 'confusingly': 8, 'perplexed': 32, 'caught': 7, 'my': 28, 'squirrel': 44, 'rustling': 40, 'through': 50, 'gym': 19, 'bag': 4, 'body': 6, 'piercing': 34, 'didn': 11, 'go': 17, 'exactly': 12, 'as': 3, 'expected': 13}


In [12]:
vector.get_feature_names_out()

array(['all', 'apart', 'appeared', 'as', 'bag', 'be', 'body', 'caught',
       'confusingly', 'cowboy', 'designer', 'didn', 'exactly', 'expected',
       'flying', 'for', 'gear', 'go', 'grow', 'gym', 'hair', 'hard',
       'hat', 'he', 'heat', 'in', 'members', 'memorable', 'my',
       'occasion', 'on', 'pair', 'perplexed', 'piano', 'piercing',
       'plane', 'playing', 'ponytail', 'put', 're', 'rustling', 'scuba',
       'see', 'since', 'squirrel', 'team', 'tell', 'the', 'their', 'they',
       'through', 'to', 'were', 'what', 'while', 'with', 'wore', 'would',
       'wound', 'your'], dtype=object)

## Ecommerce category classification

In [48]:
import pandas as pd

In [110]:
df = pd.read_csv('ecommerce_data.csv')
print(df.shape)

(50424, 2)


In [111]:
df.iloc[:,0].value_counts()

Household                 19312
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: Household, dtype: int64

In [112]:
df['label'] = df.iloc[:,0].values
df['text'] = df.iloc[:,1].values

In [113]:
from sklearn.preprocessing import LabelEncoder

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [115]:
encoder = LabelEncoder()
df['label_num'] = encoder.fit_transform(df.iloc[:,2])

In [116]:
df = df.iloc[:,2:]
df = df.dropna(subset=['text'])
x = df['text']
y = df['label_num']

In [117]:
x.head(2)

0    SAF 'Floral' Framed Painting (Wood, 30 inch x ...
1    SAF 'UV Textured Modern Art Print Framed' Pain...
Name: text, dtype: object

In [118]:
y.head(2)

0    3
1    3
Name: label_num, dtype: int64

In [119]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0,stratify=df['label_num'])

In [120]:
print(y_train.value_counts())

3    15449
0     9456
2     8497
1     6936
Name: label_num, dtype: int64


In [121]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [122]:
pipe = Pipeline([
    ('tf-idf',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier())
])

In [123]:
print(y_train)
print(x_train)

21147    0
3306     3
21476    0
3956     3
34013    1
        ..
45116    2
12592    3
7689     3
10184    3
9944     3
Name: label_num, Length: 40338, dtype: int64
21147    Inside of a Dog -- Young Readers Edition: What...
3306     Crompton ACGIH-IHL102 1000-Watt Immersion Wate...
21476    30 Practice Sets IBPS-VIII Bank Clerk Prelimin...
3956     Ajanta Plastic Wall Clock (20.5 cm x 20.5 cm x...
34013    KAEZRI Men's Genuine Leather Black Belt-2 Year...
                               ...                        
45116    iVoltaa 3.5mm Braided Aux (Auxiliary) Audio Ca...
12592    Black + Decker BXCM0401IN 4-Cup Espresso & Cap...
7689     CbeeSo 10 Racks Foldable Wardrobe CbeeSo, the ...
10184    InstaCuppa Vacuum Insulated Coffee, Tea Travel...
9944             Novicz Plastic 4-Layer Cutlery Rack, Blue
Name: text, Length: 40338, dtype: object


In [None]:
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
classification_report(y_test,y_pred)