In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
from transformers import pipeline
import torch
import torch.nn.functional as F

In [6]:
# Build a sentiment-analysis pipeline without naming a model, so the library
# picks its default model for this task.
classifier = pipeline(task="sentiment-analysis")

In [7]:
# Classify one sentence by passing a single string to the pipeline.
result = classifier(
    "I am happy while write this notebook"
)
print(result)

In [8]:
# Classify several sentences at once by passing a list of strings.
result = classifier(
    [
        "I am happy while write this notebook",
        "However I was sad yesterday",
    ]
)
print(result)

In [10]:
# That covers the default pipeline with its default model.
# Next: name a concrete model explicitly.
#
# DistilBERT is a smaller, faster version of BERT that was pretrained on the
# same corpus. This checkpoint was additionally fine-tuned; "sst-2-english" is
# the dataset name (the English Stanford Sentiment Treebank v2).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
classifier = pipeline(task="sentiment-analysis", model=model_name)
result = classifier(
    [
        "I am happy while write this notebook",
        "However I was sad yesterday",
    ]
)
print(result)

In [12]:
# AutoTokenizer -> Generic ; AutoModelForSequenceClassification -> Specific
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [13]:
# Load the model and its tokenizer explicitly from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hand both objects to the pipeline instead of letting it resolve them by name.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)
result = classifier(
    [
        "I am happy while write this notebook",
        "However I was sad yesterday",
    ]
)
print(result)

In [14]:
# Look at what the tokenizer does, in two ways.

# Way 1: split the text into tokens, then map each token to its numeric id.
tokens = tokenizer.tokenize("I am happy while write this notebook")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Tokens: {tokens}")
# Each id is the numeric representation of one token.
print(f"Token Ids: {token_ids}")

# Way 2: call the tokenizer directly on the text.
input_ids = tokenizer("I am happy while write this notebook")
# Same ids as above, but with the beginning-of-string and end-of-string
# markers added.
print(f"Input Ids: {input_ids}")

In [19]:
# Do by hand what the pipeline does: tokenize, run the model, decode the labels.
Xtrain = [
    "I am happy while write this notebook",
    "However I was sad yesterday",
]

# return_tensors="pt" asks the tokenizer for PyTorch tensors.
batch = tokenizer(
    Xtrain,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # ** unpacks the tokenizer output into keyword arguments for the model.
    outputs = model(**batch)
    print(outputs)

    # Softmax over the class dimension.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the highest-scoring class for each sentence (1 or 0).
    labels = predictions.argmax(dim=1)
    print(labels)

    # id2label from the model config maps each class id to its label name.
    labels = [model.config.id2label[class_id] for class_id in labels.tolist()]
    print(labels)  # 'POSITIVE' or 'NEGATIVE'

In [22]:
# Same manual inference as before, but this time a labels tensor is passed too,
# for the case where we want a loss to inspect.
Xtrain = [
    "I am happy while write this notebook",
    "However I was sad yesterday",
]

# return_tensors="pt" asks the tokenizer for PyTorch tensors.
batch = tokenizer(
    Xtrain,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

with torch.no_grad():
    # ** unpacks the tokenizer output into keyword arguments for the model.
    # The labels argument for the loss is probably only available on
    # SequenceClassification models.
    outputs = model(**batch, labels=torch.tensor([1, 0]))
    print(outputs)

    # Softmax over the class dimension.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the highest-scoring class for each sentence (1 or 0).
    labels = predictions.argmax(dim=1)
    print(labels)

    # id2label from the model config maps each class id to its label name.
    labels = [model.config.id2label[class_id] for class_id in labels.tolist()]
    print(labels)  # 'POSITIVE' or 'NEGATIVE'

# So using the pipeline or doing the steps ourselves gives the same output.
# Working with the model and tokenizer manually matters when we want to
# fine-tune the model.

In [32]:
# Now to save our 'finetuned' model and tokenizer

#save_directory = "saved"
#tokenizer.save_pretrained(save_directory)
#model.save_pretrained(save_directory)


In [33]:
# Now to load our 'finetuned' model and tokenizer

#save_directory = "saved"
#tokenizer = AutoTokenizer.from_pretrained(save_directory)
#model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [34]:
# Now lets use a different model
# We can load a model from a local directory if available 
# Or go to https://huggingface.co/models
# Here is the model hub, where we can search for different models
# We can filter through Tasks, Libraries, Datasets, Languages, Licenses

In [38]:
# German sentiment analysis with a different checkpoint.
model_name = "oliverguhr/german-sentiment-bert"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

texts = [
    "Mit keinem guten Ergebnis",
    "Das war gut!",
    "Sie fahrt ein grunes Auto.",
]

# Tokenize the sentences together and get PyTorch tensors back.
batch = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)

    # Highest-scoring class id per sentence, then its label name from the config.
    label_ids = outputs.logits.argmax(dim=1)
    labels = [model.config.id2label[class_id] for class_id in label_ids.tolist()]
    print(labels)

In [39]:
# Without return_tensors="pt" the tokenizer returns plain Python lists, which we
# convert to tensors ourselves with torch.tensor(...).
# The tensors are then passed to the model as explicit keyword arguments, so we
# do not have to unpack the tokenizer output with **.

# German sentiment analysis
model_name = "oliverguhr/german-sentiment-bert"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

texts = ["Mit keinem guten Ergebnis","Das war gut!","Sie fahrt ein grunes Auto."]

# padding=True pads the shorter sentences so all id lists have the same length,
# which is what lets torch.tensor(...) build a rectangular tensor from them.
encoding = tokenizer(texts, padding=True, truncation=True, max_length=512)
batch = torch.tensor(encoding["input_ids"])
# The attention mask marks which positions are real tokens and which are padding.
# It must be passed along with the ids; otherwise the model also attends to the
# padding positions and the logits of the padded sentences can change.
attention_mask = torch.tensor(encoding["attention_mask"])
print(batch)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(input_ids=batch, attention_mask=attention_mask)
    print(outputs)
    # Highest-scoring class id per sentence.
    label_ids = torch.argmax(outputs.logits, dim=1)

    # id2label from the model config maps each class id to its label name.
    labels = [model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)