In [None]:
# This is just a cell with all the other cells combined so they don't have to be run individually
# Was made using Google Colab

# Because this was written for Google Colab, it takes only
# one .csv file per run, so this code has to be re-run every time
# you train with a different .csv file.
# The file-upload code is also specific to Google Colab.
# So train separately using Amazon.csv, Tweets.csv, and YouTube.csv,
# re-running this code cell each time.

# To install required libraries, uncomment the line below:
# !pip install tensorflow

import pandas as pd
import numpy as np
import re
import string
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from google.colab import files


def preprocess_text(text):
    """Normalize one raw text sample before tokenization.

    Processing order: remove URLs, remove @mentions, strip ASCII punctuation
    (this also drops the '#' of hashtags while keeping the tag word),
    collapse runs of whitespace, trim, and lower-case.

    Args:
        text: The raw text. ``None`` is treated as an empty string and any
            other non-string value (e.g. a number read from a CSV cell) is
            converted with ``str()`` instead of raising inside ``re.sub``.

    Returns:
        The cleaned, lower-cased string (may be empty).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    # URLs must start at a word boundary and look like a URL ("http(s)://" or
    # "www."). An unanchored "www\S+" also matches inside elongated words such
    # as "awwww" and would silently delete most of the word.
    text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"@\w+", "", text)
    # '#' is part of string.punctuation, so hashtags lose their marker here.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text)

    return text.strip().lower()


def preprocess_data():
  """Upload a CSV via ``google.colab.files`` and return cleaned texts and labels.

  The CSV must contain a 'text' column and a 'sentiment' column. Sentiment
  values are matched case-insensitively (after trimming whitespace) against
  "negative", "neutral" and "positive". Rows with missing text or with a
  missing/unrecognized sentiment are dropped, because an unmapped label
  becomes NaN and cannot be cast to int.

  Returns:
    A tuple ``(texts, labels, LABEL_MAP)`` where ``texts`` is a list of
    strings cleaned with ``preprocess_text``, ``labels`` is a list of ints,
    and ``LABEL_MAP`` is the label-name -> integer mapping that was applied.

  Raises:
    ValueError: If no file was uploaded, or no usable rows remain.
  """
  uploaded = files.upload()
  if not uploaded:
    raise ValueError("No file was uploaded; please select a .csv file.")
  # Only the first uploaded file is used.
  filename = next(iter(uploaded))
  df = pd.read_csv(filename)

  LABEL_MAP = {"negative": 0, "neutral": 1, "positive": 2}

  df = df.dropna(subset=['text'])

  # Normalize label spelling so "Positive" or " positive " still map.
  normalized = df['sentiment'].astype(str).str.strip().str.lower()
  df = df.assign(sentiment=normalized.map(LABEL_MAP))

  unknown = df['sentiment'].isna()
  if unknown.any():
    print(f"Dropped {int(unknown.sum())} row(s) with missing or unrecognized sentiment labels.")
    df = df[~unknown]

  if df.empty:
    raise ValueError(
        "No usable rows: 'sentiment' must contain negative/neutral/positive labels."
    )

  texts = df['text'].apply(preprocess_text).tolist()
  labels = df['sentiment'].astype(int).tolist()

  return texts, labels, LABEL_MAP


def tokenize_text(texts, max_features, max_length):
  """Convert texts into a padded matrix of integer token ids.

  A fresh ``Tokenizer`` (created with ``num_words=max_features``) is fitted on
  ``texts``; the same texts are then converted to id sequences and passed
  through ``pad_sequences`` with ``maxlen=max_length``.

  Args:
    texts: Iterable of strings to fit on and encode.
    max_features: Value forwarded as the tokenizer's ``num_words``.
    max_length: Value forwarded as ``maxlen`` to ``pad_sequences``.

  Returns:
    The array produced by ``pad_sequences``.
  """
  vocab = Tokenizer(num_words=max_features)
  vocab.fit_on_texts(texts)
  encoded = vocab.texts_to_sequences(texts)
  return pad_sequences(encoded, maxlen=max_length)


def CNN_model(max_features, embedding_dim, max_length, num_filters, kernel_size, hidden_dim, num_classes=3):
  """Build and compile a 1-D convolutional text classifier.

  Architecture: Input -> Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D
  -> Dense (ReLU) -> Dense (softmax).

  Args:
    max_features: Embedding input dimension (vocabulary size).
    embedding_dim: Size of each embedding vector.
    max_length: Length of each padded input sequence.
    num_filters: Number of Conv1D filters.
    kernel_size: Conv1D window size.
    hidden_dim: Units in the hidden Dense layer.
    num_classes: Units in the softmax output layer. Defaults to 3, the
      previously hard-coded value.

  Returns:
    A compiled ``Sequential`` model using the Adam optimizer,
    sparse categorical cross-entropy loss and the accuracy metric.
  """
  # Use a distinct local name; the original reused the function's own name.
  model = models.Sequential()

  # Declare the input shape with an explicit Input layer rather than the
  # Embedding ``input_length`` argument, which newer Keras releases deprecate.
  model.add(layers.Input(shape=(max_length,), dtype="int32"))
  model.add(layers.Embedding(max_features, embedding_dim))
  model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(hidden_dim, activation='relu'))
  model.add(layers.Dense(num_classes, activation='softmax'))

  # Sparse loss: labels are integer class ids, not one-hot vectors.
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  return model


# Hyperparameters
max_features = 1000   # tokenizer num_words and embedding input dimension
max_length = 200      # padded sequence length
embedding_dim = 100
num_filters = 128
kernel_size = 3
hidden_dim = 128
epochs = 10
batch_size = 16

# Preprocess data
texts, labels, LABEL_MAP = preprocess_data()

# NOTE(review): the tokenizer is fitted on all texts before the split below,
# so the vocabulary is built from validation/test text as well.
X = tokenize_text(texts, max_features, max_length)
y = np.array(labels)

# 70% train, then the remaining 30% is halved into 15% validation / 15% test.
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train CNN model
model = CNN_model(max_features, embedding_dim, max_length, num_filters, kernel_size, hidden_dim)
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pass the label ids explicitly: without `labels`, classification_report
# infers the classes from y_test/y_pred and raises when a class is absent
# (e.g. a dataset with no "neutral" rows) because three names were supplied.
print(classification_report(
    y_test,
    y_pred,
    labels=list(LABEL_MAP.values()),
    target_names=list(LABEL_MAP.keys()),
    zero_division=0,
))

In [None]:
# To install required libraries, uncomment the line below:
# !pip install tensorflow

import pandas as pd
import numpy as np
import re
import string
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from google.colab import files

In [None]:
def preprocess_text(text):
    """Normalize one raw text sample before tokenization.

    Processing order: remove URLs, remove @mentions, strip ASCII punctuation
    (this also drops the '#' of hashtags while keeping the tag word),
    collapse runs of whitespace, trim, and lower-case.

    Args:
        text: The raw text. ``None`` is treated as an empty string and any
            other non-string value (e.g. a number read from a CSV cell) is
            converted with ``str()`` instead of raising inside ``re.sub``.

    Returns:
        The cleaned, lower-cased string (may be empty).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    # URLs must start at a word boundary and look like a URL ("http(s)://" or
    # "www."). An unanchored "www\S+" also matches inside elongated words such
    # as "awwww" and would silently delete most of the word.
    text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"@\w+", "", text)
    # '#' is part of string.punctuation, so hashtags lose their marker here.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text)

    return text.strip().lower()

In [None]:
def preprocess_data():
  """Upload a CSV via ``google.colab.files`` and return cleaned texts and labels.

  The CSV must contain a 'text' column and a 'sentiment' column. Sentiment
  values are matched case-insensitively (after trimming whitespace) against
  "negative", "neutral" and "positive". Rows with missing text or with a
  missing/unrecognized sentiment are dropped, because an unmapped label
  becomes NaN and cannot be cast to int.

  Returns:
    A tuple ``(texts, labels, LABEL_MAP)`` where ``texts`` is a list of
    strings cleaned with ``preprocess_text``, ``labels`` is a list of ints,
    and ``LABEL_MAP`` is the label-name -> integer mapping that was applied.

  Raises:
    ValueError: If no file was uploaded, or no usable rows remain.
  """
  uploaded = files.upload()
  if not uploaded:
    raise ValueError("No file was uploaded; please select a .csv file.")
  # Only the first uploaded file is used.
  filename = next(iter(uploaded))
  df = pd.read_csv(filename)

  LABEL_MAP = {"negative": 0, "neutral": 1, "positive": 2}

  df = df.dropna(subset=['text'])

  # Normalize label spelling so "Positive" or " positive " still map.
  normalized = df['sentiment'].astype(str).str.strip().str.lower()
  df = df.assign(sentiment=normalized.map(LABEL_MAP))

  unknown = df['sentiment'].isna()
  if unknown.any():
    print(f"Dropped {int(unknown.sum())} row(s) with missing or unrecognized sentiment labels.")
    df = df[~unknown]

  if df.empty:
    raise ValueError(
        "No usable rows: 'sentiment' must contain negative/neutral/positive labels."
    )

  texts = df['text'].apply(preprocess_text).tolist()
  labels = df['sentiment'].astype(int).tolist()

  return texts, labels, LABEL_MAP

In [None]:
def tokenize_text(texts, max_features, max_length):
  """Convert texts into a padded matrix of integer token ids.

  A fresh ``Tokenizer`` (created with ``num_words=max_features``) is fitted on
  ``texts``; the same texts are then converted to id sequences and passed
  through ``pad_sequences`` with ``maxlen=max_length``.

  Args:
    texts: Iterable of strings to fit on and encode.
    max_features: Value forwarded as the tokenizer's ``num_words``.
    max_length: Value forwarded as ``maxlen`` to ``pad_sequences``.

  Returns:
    The array produced by ``pad_sequences``.
  """
  vocab = Tokenizer(num_words=max_features)
  vocab.fit_on_texts(texts)
  encoded = vocab.texts_to_sequences(texts)
  return pad_sequences(encoded, maxlen=max_length)

In [None]:
def CNN_model(max_features, embedding_dim, max_length, num_filters, kernel_size, hidden_dim, num_classes=3):
  """Build and compile a 1-D convolutional text classifier.

  Architecture: Input -> Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D
  -> Dense (ReLU) -> Dense (softmax).

  Args:
    max_features: Embedding input dimension (vocabulary size).
    embedding_dim: Size of each embedding vector.
    max_length: Length of each padded input sequence.
    num_filters: Number of Conv1D filters.
    kernel_size: Conv1D window size.
    hidden_dim: Units in the hidden Dense layer.
    num_classes: Units in the softmax output layer. Defaults to 3, the
      previously hard-coded value.

  Returns:
    A compiled ``Sequential`` model using the Adam optimizer,
    sparse categorical cross-entropy loss and the accuracy metric.
  """
  # Use a distinct local name; the original reused the function's own name.
  model = models.Sequential()

  # Declare the input shape with an explicit Input layer rather than the
  # Embedding ``input_length`` argument, which newer Keras releases deprecate.
  model.add(layers.Input(shape=(max_length,), dtype="int32"))
  model.add(layers.Embedding(max_features, embedding_dim))
  model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(hidden_dim, activation='relu'))
  model.add(layers.Dense(num_classes, activation='softmax'))

  # Sparse loss: labels are integer class ids, not one-hot vectors.
  model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  return model

In [None]:
# Because this was written for Google Colab, it takes only
# one .csv file per run, so this code has to be re-run every time
# you train with a different .csv file.
# So train separately using Amazon.csv, Tweets.csv, and YouTube.csv,
# re-running this code cell each time.


# Hyperparameters
max_features = 1000   # tokenizer num_words and embedding input dimension
max_length = 200      # padded sequence length
embedding_dim = 100
num_filters = 128
kernel_size = 3
hidden_dim = 128
epochs = 10
batch_size = 16

# Preprocess data
texts, labels, LABEL_MAP = preprocess_data()

# NOTE(review): the tokenizer is fitted on all texts before the split below,
# so the vocabulary is built from validation/test text as well.
X = tokenize_text(texts, max_features, max_length)
y = np.array(labels)

# 70% train, then the remaining 30% is halved into 15% validation / 15% test.
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train CNN model
model = CNN_model(max_features, embedding_dim, max_length, num_filters, kernel_size, hidden_dim)
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pass the label ids explicitly: without `labels`, classification_report
# infers the classes from y_test/y_pred and raises when a class is absent
# (e.g. a dataset with no "neutral" rows) because three names were supplied.
print(classification_report(
    y_test,
    y_pred,
    labels=list(LABEL_MAP.values()),
    target_names=list(LABEL_MAP.keys()),
    zero_division=0,
))