<a href="https://colab.research.google.com/github/Zeidh-Hassim/predictionPretrolConsumption/blob/main/Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1 – Load the Dataset

In [None]:
# Import necessary libraries
import pandas as pd

# Read the airline tweets from 'Tweets.csv' (resolved relative to the current
# working directory) and keep just the two columns this notebook uses:
# the sentiment label and the raw tweet text.
df = pd.read_csv('Tweets.csv')[["airline_sentiment", "text"]]


In [None]:
# Preview the first five rows of the loaded DataFrame.
# NOTE(review): the saved output of this cell already shows a `text_cleaned`
# column, which is only created in Step 2 - the cell appears to have been run
# out of order; re-run the notebook from a fresh kernel to refresh it.
df.head(5)

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


In [None]:
# (rows, columns) of the DataFrame.
# NOTE(review): only two columns are selected in the load cell, so a saved
# output reporting three columns comes from a later kernel state.
df.shape

(14640, 3)

# Step 2 – Preprocess Text

In [None]:
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the preprocessing below:
#   - 'stopwords' : English stop-word list used for filtering tokens
#   - 'punkt'     : tokenizer models loaded by word_tokenize on older NLTK
#   - 'punkt_tab' : tokenizer tables loaded by word_tokenize on recent NLTK
#                   releases; without it word_tokenize raises LookupError
# Requesting both tokenizer packages keeps the cell runnable from a fresh
# kernel regardless of the installed NLTK version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Porter stemmer instance shared by every clean_text call
ps = PorterStemmer()

# Matches http:// and https:// links, plus at most one trailing whitespace
# character so that removing a link does not leave a double space behind.
_URL_PATTERN = re.compile(r'https?://\S+\s?')

# Filled on first use so the stop-word list is loaded once instead of once
# per tweet; a frozenset also makes each membership test O(1).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Define function to clean and preprocess text
def clean_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps: lowercase, strip http/https URLs, tokenize with ``word_tokenize``,
    drop English stop words, Porter-stem the remaining tokens.

    Punctuation tokens (for example ``@`` and ``.``) are not removed; they are
    passed through the stemmer like any other token.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        is treated as empty.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        # Missing values read by pandas arrive as float NaN, which has no .lower()
        return ""

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    tokens = word_tokenize(text)
    stop_words = _english_stopwords()
    return " ".join(ps.stem(word) for word in tokens if word not in stop_words)

# Run every raw tweet through clean_text and store the result next to the
# original text in a new `text_cleaned` column.
cleaned_tweets = df['text'].apply(clean_text)
df['text_cleaned'] = cleaned_tweets


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# (rows, columns) after adding the `text_cleaned` column in the previous cell.
df.shape

(14640, 3)

In [None]:
# Compare the raw `text` with its `text_cleaned` counterpart on the first five rows.
df.head(5)

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...
3,negative,@VirginAmerica it's really aggressive to blast...,@ virginamerica 's realli aggress blast obnoxi...
4,negative,@VirginAmerica and it's a really big bad thing...,@ virginamerica 's realli big bad thing


In [None]:
# Per-sentiment summary of the text columns (count, unique, top, freq);
# the counts show how many tweets fall into each sentiment class.
df.groupby('airline_sentiment').describe()

Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,9178,9087,@AmericanAir that's 16+ extra hours of travel ...,2,9178,9083,@ americanair fyi ... call still get drop . ho...,2
neutral,3099,3067,@SouthwestAir sent,5,3099,3025,@ jetblu 's ceo battl appeas passeng wall stre...,8
positive,2363,2298,@JetBlue thanks!,5,2363,2262,@ jetblu thank !,12


# Step 3 – Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder

# Build TF-IDF features from the cleaned tweets, capped at 3000 vocabulary
# terms, and materialise the sparse result as a dense array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in Step 4, so the held-out tweets contribute to the vocabulary and
# IDF weights - consider fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_cleaned'])
X = tfidf_matrix.toarray()

# Map the sentiment strings to integer class ids
label_encoder = LabelEncoder().fit(df['airline_sentiment'])
Y = label_encoder.transform(df['airline_sentiment'])


In [None]:
# The feature-extraction cell builds X and Y as separate objects and does not
# assign to df, so this displays the DataFrame's own (rows, columns).
df.shape

(14640, 3)

# Step 4 – Train Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)


def _fit_and_score(classifier, label, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training data and report its test accuracy.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``
    label : str
        Human-readable model name inserted into the printed message.
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the held-out set.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy of {label} Classifier: {accuracy:.2f}")
    return predictions, accuracy


# Train and evaluate a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
y_pred_nb, accuracy_nb = _fit_and_score(
    nb_classifier, "Multinomial Naive Bayes", X_train, y_train, X_test, y_test
)

# Train and evaluate a Random Forest classifier.
# random_state pins the forest's bootstrap/feature sampling so the reported
# accuracy is reproducible on Restart & Run All (it was unseeded before).
rf_classifier = RandomForestClassifier(random_state=2)
y_pred_rf, accuracy_rf = _fit_and_score(
    rf_classifier, "Random Forest", X_train, y_train, X_test, y_test
)


Accuracy of Multinomial Naive Bayes Classifier: 0.72
Accuracy of Random Forest Classifier: 0.75
