In [2]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [3]:
df = pd.read_csv("Twitter_Data.csv")

In [4]:
df.shape

(162980, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [6]:
df.sample(10)

Unnamed: 0,clean_text,category
33282,looks the government not bothered take any act...,1.0
58400,madam does modi work defence force guess but y...,1.0
40357,modi wants come beed have any problem,1.0
99105,modi sidelined not only sahtru others also sma...,1.0
111527,can arrogant with photographers and security p...,0.0
143733,what modi chcha did,0.0
69112,nation with 300 million without toilets good g...,1.0
45373,much interested modi hmm,1.0
138004,what does modi mean congress mukta bharat the ...,-1.0
65580,for now know the diff between mms and modi see...,0.0


In [7]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [8]:
df['clean_text']=df['clean_text'].astype(str)

In [9]:
sentences = df.clean_text.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = (["[CLS] " + sentence + " [SEP]" for sentence in sentences])

categories= df.category.values

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
print ("Tokenize the first sentence:")
print (tokens[0])

Tokenize the first sentence:
['[CLS]', 'when', 'm', '##od', '##i', 'promised', '“', 'minimum', 'government', 'maximum', 'governance', '”', 'expected', 'him', 'begin', 'the', 'difficult', 'job', 'reform', '##ing', 'the', 'state', 'why', 'does', 'take', 'years', 'get', 'justice', 'state', 'should', 'and', 'not', 'business', 'and', 'should', 'exit', 'ps', '##us', 'and', 'temples', '[SEP]']


In [11]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokens]

In [12]:
max_len=250
input_ids=np.zeros((len(df),max_len))
attention_mask=np.zeros((len(df),max_len))
input_ids.shape,attention_mask.shape

((162980, 250), (162980, 250))

In [14]:
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 130

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN , truncating="post", padding="post")

In [15]:
attention_masks = []

for sent in input_ids:
  att_mask = [int(token_id > 0) for token_id in sent]
    
  attention_masks.append(att_mask)

In [23]:
input_ids = input_ids.astype(float)

In [24]:
from sklearn.model_selection import train_test_split
train_inputs, val_inputs, train_cate, val_cate = train_test_split(input_ids, categories, random_state=2018,test_size=0.1)


In [25]:
train_mask, val_mask,_,_ = train_test_split(attention_mask, input_ids, random_state=2018, test_size=0.1)

In [30]:
val_inputs.dtype

torch.float64

In [26]:
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_cate = torch.tensor(train_cate)
val_labels = torch.tensor(val_cate)
train_mask = torch.tensor(train_mask)
val_mask = torch.tensor(val_mask)

In [35]:
batch_size = 32
#model
train_data = TensorDataset(train_inputs, train_mask, train_cate)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(val_inputs, val_mask, val_cate)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [32]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

100%|██████████| 407873900/407873900 [00:09<00:00, 45135999.36B/s]


In [None]:
#model.predict(sample_dataset)