In [82]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
corp = ["mind the protein sequence information but also discerns",
        "architecture together with a continuous bag-of-words (CBOW) model based on generated metapaths to study the PPI predictions",
        "large amounts of data for constructing primary",
        "Moreover, compared with the significant amount",
        "traditional machine learning algorithms such as",
        "these traditional machine learning techniques lack the capacity of discovering hidden associations and extracting",
        "In the unsupervised learning phase, we integrate the multi-modality features learned from Continuous Bag ofWord (CBOW)",
        "which can be divided into three phases including a protein sequence preprocessing phase, an unsupervised learning phase, and a supervised learning phase"]

In [3]:
TF_IDF_V = TfidfVectorizer()
vectorized_text = TF_IDF_V.fit_transform(corp)
print(vectorized_text)

  (0, 20)	0.3857648519373807
  (0, 1)	0.3857648519373807
  (0, 12)	0.3857648519373807
  (0, 31)	0.3857648519373807
  (0, 55)	0.3233010528565351
  (0, 54)	0.3233010528565351
  (0, 61)	0.21651848302712087
  (0, 40)	0.3857648519373807
  (1, 51)	0.2528000077905801
  (1, 50)	0.2528000077905801
  (1, 57)	0.2528000077905801
  (1, 64)	0.2528000077905801
  (1, 39)	0.2528000077905801
  (1, 27)	0.2528000077905801
  (1, 47)	0.2528000077905801
  (1, 10)	0.2528000077905801
  (1, 42)	0.2528000077905801
  (1, 15)	0.21186613624950398
  (1, 71)	0.2528000077905801
  (1, 45)	0.18282309219238047
  (1, 9)	0.21186613624950398
  (1, 18)	0.21186613624950398
  (1, 70)	0.21186613624950398
  (1, 65)	0.2528000077905801
  (1, 6)	0.2528000077905801
  :	:
  (6, 48)	0.2226064654176866
  (6, 67)	0.2226064654176866
  (6, 29)	0.26561543618066724
  (6, 37)	0.16842157039665318
  (6, 15)	0.2226064654176866
  (6, 9)	0.2226064654176866
  (6, 18)	0.2226064654176866
  (6, 61)	0.29816428853793314
  (7, 59)	0.2144762319783397
  (

In [4]:
print(TF_IDF_V.vocabulary_)

{'mind': 40, 'the': 61, 'protein': 54, 'sequence': 55, 'information': 31, 'but': 12, 'also': 1, 'discerns': 20, 'architecture': 6, 'together': 65, 'with': 70, 'continuous': 18, 'bag': 9, 'of': 45, 'words': 71, 'cbow': 15, 'model': 42, 'based': 10, 'on': 47, 'generated': 27, 'metapaths': 39, 'to': 64, 'study': 57, 'ppi': 50, 'predictions': 51, 'large': 35, 'amounts': 3, 'data': 19, 'for': 25, 'constructing': 17, 'primary': 53, 'moreover': 43, 'compared': 16, 'significant': 56, 'amount': 2, 'traditional': 66, 'machine': 38, 'learning': 37, 'algorithms': 0, 'such': 58, 'as': 7, 'these': 62, 'techniques': 60, 'lack': 34, 'capacity': 14, 'discovering': 21, 'hidden': 28, 'associations': 8, 'and': 5, 'extracting': 23, 'in': 29, 'unsupervised': 67, 'phase': 48, 'we': 68, 'integrate': 32, 'multi': 44, 'modality': 41, 'features': 24, 'learned': 36, 'from': 26, 'ofword': 46, 'which': 69, 'can': 13, 'be': 11, 'divided': 22, 'into': 33, 'three': 63, 'phases': 49, 'including': 30, 'preprocessing': 5

In [5]:
all_feature_names = TF_IDF_V.get_feature_names_out()
all_feature_names

array(['algorithms', 'also', 'amount', 'amounts', 'an', 'and',
       'architecture', 'as', 'associations', 'bag', 'based', 'be', 'but',
       'can', 'capacity', 'cbow', 'compared', 'constructing',
       'continuous', 'data', 'discerns', 'discovering', 'divided',
       'extracting', 'features', 'for', 'from', 'generated', 'hidden',
       'in', 'including', 'information', 'integrate', 'into', 'lack',
       'large', 'learned', 'learning', 'machine', 'metapaths', 'mind',
       'modality', 'model', 'moreover', 'multi', 'of', 'ofword', 'on',
       'phase', 'phases', 'ppi', 'predictions', 'preprocessing',
       'primary', 'protein', 'sequence', 'significant', 'study', 'such',
       'supervised', 'techniques', 'the', 'these', 'three', 'to',
       'together', 'traditional', 'unsupervised', 'we', 'which', 'with',
       'words'], dtype=object)

In [6]:
for word in all_feature_names:
  index = TF_IDF_V.vocabulary_.get(word)
  List = TF_IDF_V.idf_[index]
  print(List, " | ", word)

2.504077396776274  |  algorithms
2.504077396776274  |  also
2.504077396776274  |  amount
2.504077396776274  |  amounts
2.504077396776274  |  an
2.09861228866811  |  and
2.504077396776274  |  architecture
2.504077396776274  |  as
2.504077396776274  |  associations
2.09861228866811  |  bag
2.504077396776274  |  based
2.504077396776274  |  be
2.504077396776274  |  but
2.504077396776274  |  can
2.504077396776274  |  capacity
2.09861228866811  |  cbow
2.504077396776274  |  compared
2.504077396776274  |  constructing
2.09861228866811  |  continuous
2.504077396776274  |  data
2.504077396776274  |  discerns
2.504077396776274  |  discovering
2.504077396776274  |  divided
2.504077396776274  |  extracting
2.504077396776274  |  features
2.504077396776274  |  for
2.504077396776274  |  from
2.504077396776274  |  generated
2.504077396776274  |  hidden
2.504077396776274  |  in
2.504077396776274  |  including
2.504077396776274  |  information
2.504077396776274  |  integrate
2.504077396776274  |  into
2

In [7]:
List

2.504077396776274

In [6]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [7]:
! kaggle datasets download saurabhshahane/ecommerce-text-classification

Downloading ecommerce-text-classification.zip to /content
  0% 0.00/7.86M [00:00<?, ?B/s] 64% 5.00M/7.86M [00:00<00:00, 40.8MB/s]
100% 7.86M/7.86M [00:00<00:00, 59.4MB/s]


In [8]:
! unzip ecommerce-text-classification

Archive:  ecommerce-text-classification.zip
  inflating: ecommerceDataset.csv    


In [9]:
df = pd.read_csv('/content/ecommerceDataset.csv')
print(df.shape)
df.head()

(50424, 2)


Unnamed: 0,Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room."
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [10]:
df.Household.value_counts()

Household                 19312
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: Household, dtype: int64

In [23]:
df_Household=df[0:19311]
df_Books=df[19312:31131]
df_Electronics=df[31132:41752]
df_Clothing=df[41753:50424]

In [33]:
Min = min(len(df_Clothing), len(df_Electronics), len(df_Books), len(df_Household))
Min

8671

In [49]:
df_H = df_Household.sample(Min)
df_B = df_Books.sample(Min)
df_E = df_Electronics.sample(Min)
df_C = df_Clothing.sample(Min)
balanced_df = pd.concat([df_H, df_B, df_E, df_C], axis=0)

In [50]:
balanced_df.Household.value_counts()

Electronics               10268
Household                  8671
Books                      8671
Clothing & Accessories     7074
Name: Household, dtype: int64

In [52]:
balanced_df['label_num'] = balanced_df.Household.map({
    'Household' : 0,
    'Books' : 1,
    'Electronics' : 2,
    'Clothing & Accessories' : 3,
})
balanced_df

Unnamed: 0,Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room.",label_num
14974,Household,Whirlpool 265 L 3 Star Frost Free Double Door ...,0
13321,Household,"Cuizen CST-1412B Retro Hot Dog Steamer, Red CS...",0
18419,Household,RUATAM HANDICRAFTS Outdoor Knife with Cover fo...,0
13360,Household,Prestige PIC 20 1200 Watt Induction Cooktop wi...,0
4445,Household,Xtore 12pcs 3D Metallic Finish Home Decor Butt...,0
...,...,...,...
45116,Electronics,iVoltaa 3.5mm Braided Aux (Auxiliary) Audio Ca...,2
42251,Electronics,Casio MJ-120D Electronic Calculator Many diffe...,2
43584,Electronics,"HP 18.5 inch (46.9 cm) LED Monitor - HD, TN Pa...",2
46104,Electronics,Fuji Instax Mini 8 Value Cam Instant Camera - ...,2


In [53]:
df1 = balanced_df.sample(frac = 1)
df1

Unnamed: 0,Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room.",label_num
23219,Books,The Stranger: Includes an Interview with Exist...,1
27019,Books,Design Patterns CD: Elements of Reusable Objec...,1
36914,Clothing & Accessories,Mpitude Coral Extreme Micro Bikini Set Lingeri...,3
40588,Electronics,FARRAIGE 3 Port USB 2.0 Hub + All in One Combo...,2
32185,Clothing & Accessories,Kiddeo Girl's Cotton T-Shirts - Pack of 5 Kids...,3
...,...,...,...
26030,Books,Cost Accounting and Management Essentials You ...,1
2817,Household,Spacecrafts Wooden Folding Computer Table Mate...,0
32338,Clothing & Accessories,BODYCARE Pure Cotton Multi-Coloured Brief for ...,3
40592,Electronics,Cables Cloud™ 4-Port USB Stand Organizer Charg...,2


In [64]:
df1=df1.rename(columns={"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room.":"text"})

In [65]:
df1

Unnamed: 0,Household,text,label_num
23219,Books,The Stranger: Includes an Interview with Exist...,1
27019,Books,Design Patterns CD: Elements of Reusable Objec...,1
36914,Clothing & Accessories,Mpitude Coral Extreme Micro Bikini Set Lingeri...,3
40588,Electronics,FARRAIGE 3 Port USB 2.0 Hub + All in One Combo...,2
32185,Clothing & Accessories,Kiddeo Girl's Cotton T-Shirts - Pack of 5 Kids...,3
...,...,...,...
26030,Books,Cost Accounting and Management Essentials You ...,1
2817,Household,Spacecrafts Wooden Folding Computer Table Mate...,0
32338,Clothing & Accessories,BODYCARE Pure Cotton Multi-Coloured Brief for ...,3
40592,Electronics,Cables Cloud™ 4-Port USB Stand Organizer Charg...,2


In [66]:
x_train, x_test, y_train, y_test = train_test_split(df1.text, df1.label_num, test_size=0.2, random_state=1000, stratify=df1.label_num)

In [73]:
knn = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('KNN',KNeighborsClassifier()),
])

In [78]:
knn.fit(x_train, y_train)

Pipeline(steps=[('vectorizer_tfidf', TfidfVectorizer()),
                ('KNN', KNeighborsClassifier())])

In [79]:
y_pred = knn.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1734
           1       0.97      0.95      0.96      1734
           2       0.96      0.96      0.96      2054
           3       0.97      0.98      0.97      1415

    accuracy                           0.96      6937
   macro avg       0.96      0.96      0.96      6937
weighted avg       0.96      0.96      0.96      6937



In [83]:
RF = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('Random Forest',RandomForestClassifier()),
])

In [84]:
RF.fit(x_train,y_train)

Pipeline(steps=[('vectorizer_tfidf', TfidfVectorizer()),
                ('Random Forest', RandomForestClassifier())])

In [85]:
y_pred = RF.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1734
           1       0.96      0.97      0.96      1734
           2       0.98      0.96      0.97      2054
           3       0.98      0.97      0.98      1415

    accuracy                           0.96      6937
   macro avg       0.96      0.96      0.96      6937
weighted avg       0.96      0.96      0.96      6937

