<a href="https://colab.research.google.com/github/awesome1021/awesome1021.github.io/blob/master/Sentiment_Analysis_Text_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
from sklearn.model_selection import train_test_split 
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive 
import random 
import numpy as np
try:
  %tensorflow_version 2.x
except: Exception
pass
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import pandas as pd

# Reading data
# The sentiment CSV is read from Google Drive, so the drive has to be mounted
# first; without this the read below fails on a fresh runtime.
drive.mount('/content/drive')

# Fixed seed for the train/test splits so reported metrics are reproducible.
SPLIT_SEED = 42

sentiment_data_path = '/content/drive/My Drive/Colab Notebooks/sentiment_text_data.csv'
threat_data_url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
dfS = pd.read_csv(sentiment_data_path) # Sentiment classification
dfT = pd.read_csv(threat_data_url) # Offensive statement classification
# Rename the columns (this requires the file to have exactly two columns) and
# keep a local copy with the new header.
# NOTE(review): read_csv treats the first row as a header; if the raw file has
# no header row, that first sample is dropped here - confirm.
dfS.columns = ['label', 'text']
dfS.to_csv('sentiment.csv', index = False)
dfS = pd.read_csv('sentiment.csv')

# Cleaning up sentiment database
# Remap label 4 to 1 (presumably so labels are 0/1 for the binary classifier
# trained later - verify against the raw data).
dfS.loc[dfS.label == 4, 'label'] = 1


def _strip_mentions(text):
  """Return `text` with every whitespace-separated token starting with '@' removed."""
  return " ".join(word for word in text.split() if not word.startswith('@'))


# One vectorised pass instead of a per-row iterrows()/.at loop.
dfS['text'] = dfS['text'].map(_strip_mentions)

# Splitting the databases into 75% for training data and 25% for test data
XS = dfS['text']
YS = dfS['label']
XS_train, XS_test, YS_train, YS_test = train_test_split(
    XS, YS, test_size = 0.25, random_state = SPLIT_SEED)
XT = dfT['tweet']
YT = dfT['class']
XT_train, XT_test, YT_train, YT_test = train_test_split(
    XT, YT, test_size = 0.25, random_state = SPLIT_SEED)

# Converting into TensorFlow datasets of (text, label) pairs
sentiment_train_data = tf.data.Dataset.from_tensor_slices((XS_train.values, YS_train.values))
sentiment_test_data = tf.data.Dataset.from_tensor_slices((XS_test.values, YS_test.values))
threat_train_data = tf.data.Dataset.from_tensor_slices((XT_train.values, YT_train.values))
threat_test_data = tf.data.Dataset.from_tensor_slices((XT_test.values, YT_test.values))

In [0]:
# Training the data for sentiment
# Text embedding loaded from TF Hub; it takes raw strings (dtype tf.string)
# and is fine-tuned together with the classifier (trainable = True).
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype = tf.string, trainable = True)

# Binary classifier: embedding -> two small ReLU layers -> one sigmoid unit.
modelS = tf.keras.Sequential([
  hub_layer,
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])
modelS.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
modelS.summary()

train_test_size = len(XS_train)
batch_size = 10
# Shuffle buffer covers the whole training set, i.e. a full shuffle per epoch.
train_tfds = sentiment_train_data.shuffle(train_test_size).batch(batch_size)
# Evaluation metrics do not depend on sample order, so the test set is only
# batched; shuffling it just filled a large buffer for no benefit.
test_tfds = sentiment_test_data.batch(batch_size)
history = modelS.fit(train_tfds, epochs=2, validation_data=test_tfds, verbose=1)

# Report the final loss/accuracy on the held-out split.
results = modelS.evaluate(test_tfds, verbose=2)
for name, value in zip(modelS.metrics_names, results):
  print("%s: %.3f" % (name, value))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 124,645,041
Trainable params: 124,645,041
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
200/200 - 1s - loss: 0.6167 - accuracy: 0.7150
loss: 0.617
accuracy: 0.715


In [0]:
# Training the data for threats
# Text embedding loaded from TF Hub; it takes raw strings (dtype tf.string)
# and is fine-tuned together with the classifier (trainable = True).
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype = tf.string, trainable = True)

# Three-class classifier: embedding -> two small ReLU layers -> dropout ->
# softmax over 3 classes. Labels are integer class ids, hence the sparse loss.
modelT = tf.keras.Sequential([
  hub_layer,
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(3, activation=tf.nn.softmax),
])
modelT.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
modelT.summary()

train_test_size = len(XT_train)
batch_size = 10
# Shuffle buffer covers the whole training set, i.e. a full shuffle per epoch.
train_tfds = threat_train_data.shuffle(train_test_size).batch(batch_size)
# Evaluation metrics do not depend on sample order, so the test set is only
# batched; shuffling it just filled a large buffer for no benefit.
test_tfds = threat_test_data.batch(batch_size)
history = modelT.fit(train_tfds, epochs=3, validation_data=test_tfds, verbose=1)

# Report the final loss/accuracy on the held-out split.
results = modelT.evaluate(test_tfds, verbose=2)
for name, value in zip(modelT.metrics_names, results):
  print("%s: %.3f" % (name, value))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 128)               124642688 
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 51        
Total params: 124,645,075
Trainable params: 124,645,075
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
620/620 - 3s - loss: 0.5357 - accuracy: 0.8565
loss: 0.536
accuracy: 0.857


In [0]:
# Check for the effectiveness of sentiment classification
def check_sentiment(value):
  """Print a description for each predicted sentiment label.

  Args:
    value: A single predicted label, or an array-like of labels of any shape
      (e.g. the array returned by a model's class prediction). Each element
      is handled in flattened order: 0 prints the negative message, 1 prints
      the positive message, and any other label prints nothing.
  """
  # np.ravel turns scalars and arrays of any shape into a flat sequence, so a
  # batch of predictions no longer trips the ambiguous-truth-value error.
  for label in np.ravel(value):
    if label == 0:
      print("Likely negative sentiment (anger, sadness, etc.)")
    elif label == 1:
      print("Likely positive sentiment (happiness, excitement, etc.)")

# Pick one random row of the sentiment data and show the model's verdict.
# NOTE(review): the row is drawn from the full dataframe, so it may be one the
# model was trained on; sample from XS_test for an unbiased spot check.
index = random.randint(0, len(dfS) - 1)
string = dfS['text'].values[index]
print(string)
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the probability at 0.5.
# The input is passed as a string tensor rather than a bare Python list.
probability = modelS.predict(tf.constant([string]), batch_size=10)
prediction = (probability > 0.5).astype("int32")
check_sentiment(prediction)

still procrastinating... i hate organizing my clothes there's just so much....
Likely negative sentiment (anger, sadness, etc.)


In [0]:
# Checking the effectiveness of threat classification
def check_threat(value):
  """Print a description for each predicted threat class.

  Args:
    value: A single predicted class id, or an array-like of class ids of any
      shape (e.g. the array returned by a model's class prediction). Each
      element is handled in flattened order: 0, 1 and 2 print their
      respective messages, and any other value prints nothing.
  """
  # np.ravel turns scalars and arrays of any shape into a flat sequence, so a
  # batch of predictions no longer trips the ambiguous-truth-value error.
  for label in np.ravel(value):
    if label == 0:
      print("Likely hate speech or threatening language")
    elif label == 1:
      print("Potentially threatening language")
    elif label == 2:
      print("Likely non-threatening language")

# Pick one random row of the threat data and show the model's verdict.
# NOTE(review): the row is drawn from the full dataframe, so it may be one the
# model was trained on; sample from XT_test for an unbiased spot check.
index = random.randint(0, len(dfT) - 1)
string = dfT['tweet'].values[index]
print(string)
# Sequential.predict_classes was removed in TensorFlow 2.6. For a softmax
# output the equivalent is the index of the most probable class.
# The input is passed as a string tensor rather than a bare Python list.
probabilities = modelT.predict(tf.constant([string]), batch_size = 5)
prediction = np.argmax(probabilities, axis=-1)
check_threat(prediction)

&#8220;@NiggaImTatted: Cowboys win games during the regular season but when they get to the playoffs that trash af lol&#8221; hating ass nigga
Likely hate speech or threatening language
