<a href="https://colab.research.google.com/github/bognevivien/text-classification/blob/main/classy_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies

In [1]:
!pip install classy-classification onnx --no-use-pep517 -q


[K     |████████████████████████████████| 13.1 MB 4.7 MB/s 
[K     |████████████████████████████████| 135 kB 80.0 MB/s 
[K     |████████████████████████████████| 85 kB 5.0 MB/s 
[K     |████████████████████████████████| 5.5 MB 66.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 75.9 MB/s 
[K     |████████████████████████████████| 182 kB 83.2 MB/s 
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
[K     |████████████████████████████████| 4.7 MB 71.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 80.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 83.8 MB/s 
[K     |████████████████████████████████| 17.0 MB 67.7 MB/s 
[K     |████████████████████████████████| 308 kB 92.7 MB/s 
[K     |████████████████████████████████| 548 kB 81.4 MB/s 
[K     |████████████████████████████████| 68 kB 8.3 MB/s 
[K     |████████████████████████████████| 128 kB 65.8 MB/s 
[K     |████████████████████████████████| 296 kB 71.2 MB/s 
[K     |████████████████████

In [2]:
!pip install python-dotenv -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0


# Importing libraries


In [12]:
import spacy
import classy_classification

from dotenv import dotenv_values

from google.colab import drive


# Importing environment variables

In [5]:
config = dotenv_values(".env")  

In [8]:
config['dbname']

'mvppreproddb'

# Mounting drive

In [13]:
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Nov 23 13:45:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    49W / 400W |    658MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
from psutil import virtual_memory

# `.total` is the machine's total memory (the message below calls it "available");
# dividing by 1e9 expresses it in gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


# Collecting the data

## From database

### pandas read_sql stream_results function

In [18]:
import pandas as pd
from sqlalchemy import create_engine
from tqdm import tqdm
tqdm.pandas()


def getting_data_from_table(tablename="product", chuncksize=100000):
    """Load every row of a PostgreSQL table into a single DataFrame.

    The connection URL is built from the module-level ``config`` mapping
    (keys ``user``, ``pwd``, ``hostname`` and ``dbname``). The table is read
    in chunks on a connection opened with the ``stream_results=True``
    execution option, and a tqdm bar reports progress per chunk.

    Parameters
    ----------
    tablename : str
        Name of the table to read. It is interpolated into the SQL text
        unescaped, so it must come from trusted code, never from user input.
    chuncksize : int
        Number of rows requested per chunk.

    Returns
    -------
    pandas.DataFrame
        All rows of the table, with the placeholder strings
        ``''``, ``'nan'``, ``'NaN'``, ``'NA'`` and ``'null'`` replaced by ``None``.
        An empty DataFrame is returned when no chunk was produced.
    """
    engine = create_engine(
        f"postgresql://{config['user']}:{config['pwd']}@{config['hostname']}/{config['dbname']}"
    )
    try:
        # First round trip: the row count is only used to size the progress bar.
        with engine.connect() as count_conn:
            total_items_df = pd.read_sql(
                "select count(*) from {} ".format(tablename), count_conn)
        total_items = int(total_items_df['count'][0])
        # Ceiling division, so an exact multiple of the chunk size does not
        # add a phantom extra chunk to the progress bar.
        total = (total_items + chuncksize - 1) // chuncksize

        query = "SELECT * FROM {} ".format(tablename)
        # Collect the chunks and concatenate once at the end; concatenating
        # inside the loop re-copies the accumulated frame on every iteration.
        chunks = []
        with engine.connect() as conn:
            stream_conn = conn.execution_options(stream_results=True)
            for chunk_dataframe in tqdm(
                pd.read_sql(query, stream_conn, chunksize=chuncksize),
                desc="getting {} records from {} as dataframe using {} chunks".format(
                    total_items, tablename, total),
                total=total,
            ):
                chunks.append(chunk_dataframe)
    finally:
        # Release pooled connections even if a query failed midway.
        engine.dispose()

    dfs = pd.concat(chunks) if chunks else pd.DataFrame()
    return dfs.replace(['', 'nan', 'NaN', 'NA', 'null'], [None, None, None, None, None])
# if __name__ == '__main__':
#     process_sql_using_pandas()



### Getting products from db

In [19]:
products = getting_data_from_table(tablename="product")


getting 1693829 records from product as dataframe using 17 chunks: 100%|██████████| 17/17 [01:31<00:00,  5.40s/it]


In [20]:
## From BI

# Loading the data

In [None]:
# Label -> example sentences; this mapping is handed to the text categorizer
# as its `data` config entry.
data = dict(
    furniture=[
        "This text is about chairs.",
        "Couches, benches and televisions.",
        "I really need to get a new sofa.",
    ],
    kitchen=[
        "There also exist things like fridges.",
        "I hope to be getting a new stove today.",
        "Do you also have some ovens.",
    ],
)

In [3]:
import pickle

# NOTE(security): unpickling can execute arbitrary code. Only load files that
# were produced by a trusted source.
# The `with` block closes the file even when pickle.load raises; with a plain
# open()/close() pair the handle leaks on failure.
with open('/content/drive/MyDrive/data.pkl', 'rb') as f:
    data_db = pickle.load(f)

UnpicklingError: ignored

# Modeling

## Classification using bert-base

In [None]:
# Start from an empty English pipeline and attach the "text_categorizer"
# component, configured with the example sentences in `data` and the
# multilingual BERT checkpoint. The "cat_type": "zero" option is left disabled.
nlp = spacy.blank("en")
nlp.add_pipe(
    "text_categorizer",
    config=dict(
        data=data,
        model="bert-base-multilingual-cased",
        device="gpu",
    ),
)

# Print the scores stored on the doc's `cats` extension for a sample query.
print(nlp("I am looking for kitchen appliances.")._.cats)

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Ignore MatMul due to non constant B: /[MatMul_149]
Ignore MatMul due to non constant B: /[MatMul_154]
Ignore MatMul due to non constant B: /[MatMul_243]
Ignore MatMul due to non constant B: /[MatMul_248]
Ignore MatMul due to non constant B: /[MatMul_337]
Ignore MatMul due to non constant B: /[MatMul_342]
Ignore MatMul due to non constant B: /[MatMul_431]
Ignore MatMul due to non constant B: /[MatMul_436]
Ignore MatMul due to non constant B: /[MatMul_525]
Ignore MatMul due to non constant B: /[MatMul_530]
Ignore MatMul due to non constant B: /[MatMul_619]
Ignore MatMul due to non constant B: /[MatMul_624]
Ignore MatMul due to non constant B: /[MatMul_713]
Ignore MatMul due to non constant B: /[MatMul_718]
Ignore MatMul due to non constant B: /[MatMul_807]
Ignore MatMul due to non constant B: /[MatMul_812]
Ignore MatMul due to non constant B: /[MatMul_901]
Ignore MatMul due to non constant B: /[MatMul_906]
Ignore MatMul due to non constant B: /[MatMul_995]
Ignore MatMul due to non consta

## Classification using Spacy model

In [None]:
# Label -> example sentences used to configure the categorizer below.
data = {
    "furniture": [
        "This text is about chairs.",
        "Couches, benches and televisions.",
        "I really need to get a new sofa.",
    ],
    "kitchen": [
        "There also exist things like fridges.",
        "I hope to be getting a new stove today.",
        "Do you also have some ovens.",
    ],
}

# Empty English pipeline plus the "text_categorizer" component, this time
# backed by the "Sahajtomar/french_semantic" model.
nlp = spacy.blank("en")
nlp.add_pipe(
    "text_categorizer",
    config=dict(data=data, model="Sahajtomar/french_semantic", device="gpu"),
)

# Print the scores stored on the doc's `cats` extension for a sample query.
print(nlp("I am looking for kitchen appliances.")._.cats)

In [None]:
print(nlp("texts about dinner tables have multiple labels.")._.cats)

{'furniture': 0.5728549746113443, 'kitchen': 0.4271450253886556}


## Classification using sentence transformer

In [31]:
import pandas as pd 

bi_data = pd.read_csv('classification.csv', header=1)

In [32]:
bi_data

Unnamed: 0,Supplier Name,Procurement Type,Procurement Tool,PO Item Description,PO Text,Unnamed: 5,Unnamed: 6
0,SIEMENS AG,Punchout,Punchout,SINAMICS S120 Single Motor-Module Eingan,,929372.74,0.006495
1,SIEMENS AG,Punchout,Punchout,SIMATIC PCS 7 CPU 410 Redundancy Automat,,701391.00,0.004901
2,SIEMENS AG,Punchout,Punchout,"SIMATIC PCS 7, Software, Upgrade Package",,497655.39,0.003478
3,SIEMENS AG,Punchout,Punchout,Einschub-Leistungsschalter mit Einschubr,,479175.91,0.003348
4,Klaus Faber AG,Punchout,Punchout,NYY-O 01X240 SW,,466938.91,0.003263
...,...,...,...,...,...,...,...
312997,Technische Unie B.V.,Punchout,Punchout,JUNG 1731DD DIM UN LED,6442683,-5492.98,-0.000038
312998,Technische Unie B.V.,Punchout,Punchout,Credit,3ceb0349-b7b6-44f3-b3ed-07e620d23af0,-5741.77,-0.000040
312999,Technische Unie B.V.,Punchout,Punchout,GROH EUROECO/C T WASTKR K/W,4602769,-6144.13,-0.000043
313000,ZUID NEDERLANDSE BUIZEN BV,Punchout,Punchout,KORTING,,-21512.81,-0.000150


UnpicklingError: ignored

In [None]:
type(data)

dict

In [None]:
import pandas as pd
data_df = pd.DataFrame(data)
data_df.shape

(413043, 40)

In [None]:
data_df_clean = data_df.dropna()
data_df_clean.shape

(413009, 40)

In [None]:
data = data_df_clean.to_dict("list")

In [None]:
len(data)

40

In [None]:
# Empty French pipeline with the "text_categorizer" component, configured with
# the current `data` mapping and a multilingual sentence-transformers model.
# `add_pipe` stays the last expression so the cell still displays its return value.
nlp = spacy.blank("fr")
nlp.add_pipe(
    "text_categorizer",
    config=dict(
        data=data,
        model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        device="gpu",
    ),
)


Ignore MatMul due to non constant B: /[MatMul_149]
Ignore MatMul due to non constant B: /[MatMul_154]
Ignore MatMul due to non constant B: /[MatMul_243]
Ignore MatMul due to non constant B: /[MatMul_248]
Ignore MatMul due to non constant B: /[MatMul_337]
Ignore MatMul due to non constant B: /[MatMul_342]
Ignore MatMul due to non constant B: /[MatMul_431]
Ignore MatMul due to non constant B: /[MatMul_436]
Ignore MatMul due to non constant B: /[MatMul_525]
Ignore MatMul due to non constant B: /[MatMul_530]
Ignore MatMul due to non constant B: /[MatMul_619]
Ignore MatMul due to non constant B: /[MatMul_624]
Ignore MatMul due to non constant B: /[MatMul_713]
Ignore MatMul due to non constant B: /[MatMul_718]
Ignore MatMul due to non constant B: /[MatMul_807]
Ignore MatMul due to non constant B: /[MatMul_812]
Ignore MatMul due to non constant B: /[MatMul_901]
Ignore MatMul due to non constant B: /[MatMul_906]
Ignore MatMul due to non constant B: /[MatMul_995]
Ignore MatMul due to non consta

In [None]:
type(nlp)

In [None]:
data_df_clean.iloc[3, :2]

41    Copal Electronics Potentiomètre Rotatif 20kΩ 3...
32    Condensateur tantale KEMET CMS 47μF 10V c.c. b...
Name: 3, dtype: object

In [None]:
# Classify the text once and reuse the resulting scores; the previous
# one-liner ran the whole pipeline twice on the same string.
cats = nlp("Condensateur Crouzet, 480 V c.a., pour 825400")._.cats
# Key of the highest-scoring entry.
max(cats, key=cats.get)

'26'

In [None]:
nlp("Condensateur Crouzet, 480 V c.a., pour 825400")._.cats

{'10': 0.00476714603033662,
 '11': 0.009471865317405612,
 '12': 0.02439224916973389,
 '13': 0.019857559123909265,
 '14': 0.004471647388392354,
 '15': 0.025325584801012706,
 '20': 0.006238648388988998,
 '21': 0.0026165190638884565,
 '22': 0.004260088257322503,
 '23': 0.022798049051493227,
 '24': 0.027921228164213063,
 '25': 0.014580753947029663,
 '26': 0.12490890930956529,
 '27': 0.030023244475532337,
 '30': 0.01686975154374761,
 '31': 0.03225414511108506,
 '32': 0.12487818263456757,
 '34': 0.016480075013015256,
 '39': 0.012888013668829086,
 '40': 0.06415657681883101,
 '41': 0.011870912457535457,
 '42': 0.02777276393272875,
 '43': 0.03424097832759138,
 '44': 0.0453848079850196,
 '45': 0.06512114117217559,
 '46': 0.02505363338400749,
 '47': 0.05152498952308339,
 '50': 0.003448615461080589,
 '52': 0.015066773861851844,
 '53': 0.0044097897133425,
 '54': 0.002824302608231271,
 '55': 0.021329691803715505,
 '56': 0.021958640067783988,
 '60': 0.019528080806045428,
 '72': 0.025961421049836814,
