In [157]:
# importing the dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers import DistilBertTokenizer

In [158]:
# importing the dataset
df = pd.read_csv('/content/Twitter_Data.csv')

In [159]:
df = df[:5000]

In [160]:
df.shape

(5000, 2)

In [161]:
df['category'].value_counts()

 1.0    2119
 0.0    1794
-1.0    1087
Name: category, dtype: int64

In [162]:
# checking the null values
df.isnull().sum()

clean_text    1
category      0
dtype: int64

In [163]:
# dropping the null values
df.dropna(inplace = True)

In [164]:
df.shape

(4999, 2)

In [165]:
df.reset_index(inplace = True)

In [166]:
# encoding the values of  output column

df['encoded_output'] = df['category'].astype('category').cat.codes

In [167]:
df.head()

Unnamed: 0,index,clean_text,category,encoded_output
0,0,when modi promised “minimum government maximum...,-1.0,0
1,1,talk all the nonsense and continue all the dra...,0.0,1
2,2,what did just say vote for modi welcome bjp t...,1.0,2
3,3,asking his supporters prefix chowkidar their n...,1.0,2
4,4,answer who among these the most powerful world...,1.0,2


In [168]:
x = list(df['clean_text'])
y = list(df['encoded_output'])

In [169]:
# spliting into train and test data

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 4)

In [170]:
# creating the tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [171]:
# tokenizing all the input(train and validation) data

train_encoding = tokenizer(x_train,truncation = True,padding = True)
test_encoding = tokenizer(x_test,truncation = True,padding = True)

In [172]:
# converting the tokenized input data into tensor objects

train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encoding),y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encoding),y_test))

In [173]:
training_args = TFTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_steps = 500
)

In [174]:
# training the model

with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 3)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [175]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-2.7037625,  4.8039227, -2.2044668],
       [-2.971625 , -2.1161304,  4.8654585],
       [-2.4937053, -2.5231757,  4.8928986],
       ...,
       [-2.2591221,  4.4722524, -2.1967542],
       [-2.7583435, -2.3707404,  4.878491 ],
       [-2.6027758, -2.3211107,  4.7529283]], dtype=float32), label_ids=array([1, 2, 2, 1, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 0, 0,
       0, 0, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2,
       0, 2, 1, 1, 1, 2, 2, 0, 1, 2, 2, 0, 2, 1, 1, 1, 0, 2, 1, 0, 2, 2,
       2, 1, 0, 0, 1, 0, 2, 2, 1, 2, 1, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 0,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 0, 2, 1, 0, 2, 2, 1, 1, 0, 2, 2, 2, 2,
       1, 0, 2, 1, 2, 0, 1, 1, 2, 2, 1, 1, 1, 2, 2, 0, 1, 1, 0, 1, 1, 1,
       2, 0, 2, 2, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       1, 2, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 2, 2,
     

In [176]:
trainer.evaluate()

{'eval_loss': 0.04485230218796503}

In [177]:
# save the model
save = "trained_model"
model.save_pretrained(save)
tokenizer.save_pretrained(save)

('trained_model/tokenizer_config.json',
 'trained_model/special_tokens_map.json',
 'trained_model/vocab.txt',
 'trained_model/added_tokens.json')

In [178]:
# load the model

trained_tokenizer = DistilBertTokenizer.from_pretrained(save)
trained_model = TFDistilBertForSequenceClassification.from_pretrained(save)

Some layers from the model checkpoint at trained_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_259']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at trained_model and are newly initialized: ['dropout_279']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [179]:
# Prediction System

def sentiment_Analyzer(text):

  text.lower()

  prediction = ['Negetive','Neutral','Positive']

  predict_output = trained_tokenizer.encode(text,truncation = True,padding = True,return_tensors = 'tf')

  output = trained_model.predict(predict_output)[0]
  return prediction[np.argmax(output)]

In [180]:
sentiment_Analyzer("peteralponse his campaign against modi thru media debates created rahul wave tnto garner minority support meets the president archbishops council madurai with feature stalian and secular candidates")



'Neutral'