In [41]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# Ensure you have the necessary NLTK data
import nltk
nltk.download('stopwords')

def preprocess_tweets(file_path):
    # Load dataset with proper delimiter and error handling
    try:
        df = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip', quoting=3)
    except pd.errors.ParserError as e:
        print(f"Error reading the CSV file: {e}")
        return None

    # Initialize TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

    # Define stop words
    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        # Lowercase the text
        text = text.lower()

        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()

        # Remove URLs and links
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove usernames
        text = re.sub(r'@\w+', '', text)

        # Remove punctuation and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize the text
        tokens = tokenizer.tokenize(text)

        # Remove stop words
        tokens = [word for word in tokens if word not in stop_words]

        # Join tokens back into a string
        cleaned_text = ' '.join(tokens)

        return cleaned_text

    # Check if the necessary column exists
    if 'tweet_text' not in df.columns:
        print("The dataset does not contain a 'tweet_text' column.")
        return None

    # Apply the cleaning function to the tweet_text column
    df['cleaned_text'] = df['tweet_text'].apply(clean_text)

    return df

# Example usage
file_path = '/content/balanced_twitter_sample.csv'
cleaned_df = preprocess_tweets(file_path)

if cleaned_df is not None:
    cleaned_df.to_csv('cleaned_twitter_sample.csv', index=False)
    print("Cleaned data has been saved to 'cleaned_twitter_sample.csv'")
else:
    print("Data cleaning failed.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


Cleaned data has been saved to 'cleaned_twitter_sample.csv'


In [43]:
data = pd.read_csv("cleaned_twitter_sample.csv")
data.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used,cleaned_text
0,1049139104452157442,"@laricacosta É o jeito, tudo menos PT :(",Mon Oct 08 03:27:09 +0000 2018,0,:(,jeito tudo menos pt
1,1045508269136125953,@chilena_03 @oiejuao eu xei :),Fri Sep 28 02:59:30 +0000 2018,1,:),eu xei
2,1050743753621860354,Bom dia :) https://t.co/sz3jfwRBA8 https://t.c...,Fri Oct 12 13:43:27 +0000 2018,1,:),bom dia
3,1046766046592606208,Toma uma vaquinha jogando bola p começar essa ...,Mon Oct 01 14:17:28 +0000 2018,1,:),toma uma vaquinha jogando bola p comear essa s...
4,1049169281886429184,"@RexhaBrasil Boa tentativa, meu voto ainda é 1...",Mon Oct 08 05:27:04 +0000 2018,1,:),boa tentativa meu voto ainda


In [44]:
import pandas as pd

def select_columns(file_path):
    # Read the dataset
    df = pd.read_csv(file_path)

    # Check if the necessary columns exist
    if 'cleaned_text' not in df.columns:
        print("The dataset does not contain a 'cleaned_text' column.")
        return None
    if 'sentiment' not in df.columns:
        print("The dataset does not contain a 'sentiment' column.")
        return None

    # Select only the 'cleaned_text' and 'sentiment' columns
    selected_df = df[['cleaned_text', 'sentiment']]

    return selected_df

# Example usage
file_path = 'cleaned_twitter_sample.csv'
selected_df = select_columns(file_path)

if selected_df is not None:
    selected_df.to_csv('selected_twitter_data.csv', index=False)
    print("Selected columns have been saved to 'selected_twitter_data.csv'")
else:
    print("Column selection failed.")


Selected columns have been saved to 'selected_twitter_data.csv'


In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the selected dataframe
selected_df = pd.read_csv('selected_twitter_data.csv')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(selected_df['cleaned_text'], selected_df['sentiment'], test_size=0.2, random_state=42)

# Replace missing values with empty strings
X_train.fillna('', inplace=True)
X_test.fillna('', inplace=True)

# Convert text data to numerical representations using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
rf_predictions = rf_classifier.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Train Recurrent Neural Network model
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max(len(sequence) for sequence in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

embedding_dim = 100
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))
rnn_model.add(LSTM(units=128))
rnn_model.add(Dense(units=1, activation='sigmoid'))
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn_model.fit(X_train_padded, y_train, epochs=5, batch_size=64, verbose=1)
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_padded, y_test)

# Compare results
print("Random Forest Model Accuracy:", rf_accuracy)
print("Recurrent Neural Network Model Accuracy:", rnn_accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Random Forest Model Accuracy: 0.6936386768447838
Recurrent Neural Network Model Accuracy: 0.5246819257736206


In [49]:
import joblib

# Assuming rf_classifier is already trained and available

# Save the trained Random Forest classifier to a file
joblib.dump(rf_classifier, 'rf_classifier.joblib')
print("Random Forest classifier saved to file 'rf_classifier.joblib'.")

# Load the Random Forest classifier from disk
loaded_rf_classifier = joblib.load('rf_classifier.joblib')

# Example new text data
new_text = ["I love this movie!", "This food is terrible."]

# Transform the new text data using the TfidfVectorizer (tfidf_vectorizer is assumed to be available)
new_text_tfidf = tfidf_vectorizer.transform(new_text)

# Make predictions using the loaded Random Forest classifier
predictions = loaded_rf_classifier.predict(new_text_tfidf)

# Print the sentiment predictions
for i, text in enumerate(new_text):
    sentiment = "positive" if predictions[i] == 1 else "negative"
    print(f"Text: {text} - Predicted Sentiment: {sentiment}")


Random Forest classifier saved to file 'rf_classifier.joblib'.
Text: I love this movie! - Predicted Sentiment: positive
Text: This food is terrible. - Predicted Sentiment: positive
