# **Data Pipeline**

In [44]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import regex as re
import tensorflow as tf
import tensorflow.keras
import transformers
import tweepy
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from tqdm.notebook import tqdm
#import keras


# Load the MBTI dataset from CSV; later cells read its `posts` and `type` columns.
data=pd.read_csv('../input/mbti-type/mbti_1.csv')

In [45]:
def clean_text(data):
    """Lower-case every post and blank out URLs and non-alphanumeric characters.

    Parameters
    ----------
    data : object with a ``posts`` attribute
        ``data.posts`` is iterated and every item must be a ``str``
        (for example a DataFrame with a ``posts`` column).

    Returns
    -------
    list of str
        One cleaned string per post, in input order. Each URL and each
        character outside ``[0-9a-z]`` is replaced by a single space, so
        consecutive spaces are not collapsed.
    """
    # Raw strings: `\s` and `\.` are regex escapes, not Python string escapes.
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    symbol_pattern = r'[^0-9a-z]'

    cleaned_text = []
    for sentence in tqdm(data.posts):
        sentence = sentence.lower()
        # Links are removed first, while their punctuation is still intact.
        sentence = re.sub(url_pattern, ' ', sentence)
        sentence = re.sub(symbol_pattern, ' ', sentence)
        cleaned_text.append(sentence)
    return cleaned_text

In [46]:
# Hold out 20% of the rows as a test split (fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split

posts, labels = data['posts'].values, data['type'].values
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

# Row counts of both splits; the tuple is the cell's displayed output
train_size, test_size = len(train_data), len(test_data)
train_size, test_size

In [47]:
# BERT tokenizer setup and the fixed sequence length used by the model input and padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer

MAX_LEN = 512
bert_model_name = 'bert-large-uncased'

tokenizer = BertTokenizer.from_pretrained(
    bert_model_name,
    do_lower_case=True,
)

def tokenize_sentences(sentences, tokenizer, max_seq_len = 1800):
    """Encode each sentence into a list of token ids.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode, one entry per example.
    tokenizer : object
        Must expose ``encode(text, add_special_tokens=..., max_length=...,
        truncation=...)`` as Hugging Face tokenizers do.
    max_seq_len : int, optional
        Maximum number of ids kept per sentence, special tokens included.
        NOTE(review): the default (1800) is larger than ``MAX_LEN`` (512),
        which is what every call in this notebook passes; it is kept only so
        existing callers are unaffected.

    Returns
    -------
    list of list of int
        One (unpadded) id sequence per input sentence, in input order.
    """
    return [
        tokenizer.encode(
            sentence,                  # Sentence to encode.
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
            max_length=max_seq_len,    # Upper bound on the encoded length.
            truncation=True,           # Truncate explicitly instead of relying on
                                       # the deprecated implicit fallback.
        )
        for sentence in tqdm(sentences)
    ]


In [48]:
def create_model(model_name='bert-large-uncased', max_len=None, num_classes=16,
                 learning_rate=0.00002):
    """Build and compile the BERT-based type classifier.

    Parameters
    ----------
    model_name : str, optional
        Checkpoint name passed to ``TFBertModel.from_pretrained``.
    max_len : int or None, optional
        Length of the token-id input. ``None`` (default) uses the notebook's
        ``MAX_LEN`` constant, looked up when the function is called.
    num_classes : int, optional
        Number of units in the softmax output layer.
    learning_rate : float, optional
        Learning rate of the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Compiled model mapping ``input_word_ids`` to class probabilities.
        Calling ``create_model()`` with no arguments builds the same
        architecture as before, so previously saved weights still load.
    """
    if max_len is None:
        max_len = MAX_LEN

    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained(model_name)
    # Element 0 of the BERT output is indexed as [:, 0, :] below, i.e. the
    # vector at the first token position (the '[CLS]' token added by the tokenizer).
    # NOTE(review): no attention mask is passed, so padding ids are attended to;
    # left unchanged to stay consistent with how the saved weights were trained.
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(num_classes, activation='softmax')(bert_outputs[:, 0, :])

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    # The head already applies softmax and the notebook encodes labels with
    # to_categorical (one-hot), so categorical cross-entropy on probabilities
    # is the matching loss. The former unused sparse/from_logits loss object
    # contradicted both and has been removed.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    )
    return model

In [49]:
# Build the compiled classifier and print its layer-by-layer summary.
model = create_model()    
model.summary()

In [50]:
# Load previously saved weights from the attached input directory into `model`.
model.load_weights('../input/social-media-ipd/best_model.h5')

In [51]:
# Sorted array of the distinct values in the `type` column
types = np.unique(data.type.values)


def get_type_index(string):
    """Return the position of `string` within `types` (ValueError if absent)."""
    return types.tolist().index(string)

In [52]:
# Add an integer class id per row: the position of the row's type within `types`.
data['type_index'] = data['type'].apply(get_type_index)

In [53]:
# One-hot encode the integer class ids of every row of `data` (16 classes).
one_hot_labels = tf.keras.utils.to_categorical(data.type_index.values, num_classes=16)

In [55]:
# One-hot labels (16 classes) built from `data.type_index`.
# NOTE(review): despite the name, this encodes every row of `data`, not the
# `test_data` split, so it is identical to `one_hot_labels` — confirm intent.
test_labels = tf.keras.utils.to_categorical(data.type_index.values, num_classes=16)

In [56]:
# Class names in the same order as the model's output units.
# Class ids are positions in the *sorted* unique type list (np.unique), whereas
# Series.unique() returns types in order of first appearance. Predictions are
# written to these columns positionally, so the order here must be the sorted
# one or every probability ends up under the wrong type name.
cols = np.unique(data['type'].values).tolist()

# Column layout of the prediction table: the input text, then one column per class.
colnames = ['sentence'] + cols


In [57]:
def _require_env(name):
    """Return environment variable `name`, or fail with an actionable message."""
    try:
        return os.environ[name]
    except KeyError:
        raise RuntimeError(
            f"Set the {name} environment variable before running this cell"
        ) from None


# Twitter API credentials are read from the environment so they never live in
# the notebook. NOTE: the keys that were previously hard-coded in this cell are
# exposed wherever the notebook was shared and should be revoked/regenerated.
consumer_key = _require_env("TWITTER_CONSUMER_KEY")
consumer_secret = _require_env("TWITTER_CONSUMER_SECRET")  # same as api secret
access_key = _require_env("TWITTER_ACCESS_TOKEN")
access_secret = _require_env("TWITTER_ACCESS_TOKEN_SECRET")
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
#api = tweepy.API(auth,wait_on_rate_limit=True)
bearer_token = _require_env("TWITTER_BEARER_TOKEN")

In [58]:
# Fetch the account's recent tweets and join them into one "|||"-separated
# string, the same separator the sample text in this notebook uses between posts.
TWITTER_USERNAME = 'Cristiano'

client = tweepy.Client(bearer_token=bearer_token)

user = client.get_user(username=TWITTER_USERNAME)
if user.data is None:
    # The lookup returned no user object (unknown or unavailable account).
    raise ValueError(f"Twitter user {TWITTER_USERNAME!r} was not found")
print(user.data.id)

tweets = client.get_users_tweets(user.data.id)
# `tweets.data` is None (not an empty list) when no tweets are returned.
data1 = tweets.data or []
for word in data1:
    print(word.text)
posts = "|||".join(word.text for word in data1)

In [59]:
# Classify the fetched tweets and tabulate the per-class probabilities.
df_predict = pd.DataFrame(columns = colnames)
sentence=[posts]
# Encode to token ids, then right-pad with 0 up to the model's input length.
sentence_inputs = tokenize_sentences(sentence, tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

prediction = model.predict(np.array(sentence_inputs))
# Index the single example explicitly: store the text itself (not a one-element
# list) and the 1-D probability vector (not the (1, n_classes) batch array),
# instead of relying on pandas to unwrap them during `.loc` assignment.
df_predict.loc[0, 'sentence'] = sentence[0]
df_predict.loc[0, cols] = prediction[0]

In [60]:
# Display the prediction table built in the previous cell.
df_predict

In [61]:
# Classify a hard-coded sample of "|||"-separated posts and tabulate the per-class probabilities.
df_predict = pd.DataFrame(columns = colnames)
sentence = ["'One time my parents were fighting over my dad's affair and my dad pushed my mom. The fall broke her finger.  She's pointed a gun at him and made him get on his knees and beg for his life. She's...|||I'm gonna talk about what a piece of shit my dad is now.  He's an alcoholic and he has some kind of serious mental problem when it comes to complying with the IRS. (In his words, Laws don't apply...|||OMG...at the women's center I lived at, run by a Catholic charity, the fat bully program manager took it upon herself to change policy so that tenants were FORCED to attend the Christmas party. If...|||I don't work, but I have a calling I am 100% committed to 24/7 with no vacation or off days EVER. I'm a Kundalini mystic.  Oh, I don't get paid, either!  It's one of those destined things...|||My art teacher in high school had a stack of art school catalogs. When I saw the one for the school I ended up going to, I immediately knew that was the one. Without any research. It was like when...|||INFJ  Communication Design  Nope, too much execution, I'm a mystic now|||I got a degree from one of the best schools in the world for that field, but actually having a career in it was unsustainable because it engaged my inferior function directly.  So in my case, even...|||Absolutely|||And I don't know who TF you think in this day and age isn't familiar with employment. ??????? What planet are you living on?! And how did you ever pass kindergarten? Somebody actually employs'"]
# Encode to token ids, then right-pad with 0 up to the model's input length.
sentence_inputs = tokenize_sentences(sentence, tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

prediction = model.predict(np.array(sentence_inputs))
# Index the single example explicitly: store the text itself (not a one-element
# list) and the 1-D probability vector (not the (1, n_classes) batch array),
# instead of relying on pandas to unwrap them during `.loc` assignment.
df_predict.loc[0, 'sentence'] = sentence[0]
df_predict.loc[0, cols] = prediction[0]

In [62]:
# Display the prediction table built in the previous cell.
df_predict