<a href="https://colab.research.google.com/github/aksweb/twitter-sentiment-analyser/blob/main/sentiment_analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [48]:
import pandas as pd

# Load the dataset into a Pandas DataFrame
data = pd.read_csv('/content/drive/MyDrive/colab/twitter_training.csv',header=None)
v_data = pd.read_csv('/content/drive/MyDrive/colab/twitter_validation.csv',header=None)
v_data.head()


Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


# Data preprocessing:

In [49]:
## Removing any null values and converting any numerical data into string

dataTxt =data[~data[3].isna()]
dataTxt[3] = dataTxt[3].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTxt[3] = dataTxt[3].astype(str)


In [50]:
textArray=dataTxt[3].values
dataTxt.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [51]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

# Convert the texts into numerical features using a bag-of-words representation
vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(dataTxt[3].values)
X = vectorizer.fit_transform(textArray)

# # Split the data into training and test sets
y = dataTxt[2].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Model Selection

In [52]:
from sklearn.naive_bayes import MultinomialNB

# Choose a machine learning algorithm (in this case, Naive Bayes)
model = MultinomialNB()


# Model Training

In [53]:
# Train the model on the training data
model.fit(X_train, y_train)


MultinomialNB()

# Model evaluation:

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the model on the test data
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.7506081081081081
Precision: 0.7738065821567656
Recall: 0.7341879142891801
F1-score: 0.7436151274612517


# Exporting the model:

In [55]:
import joblib

# Save the model to disk
filename = 'twitter_sentiment_analysis_model.joblib'
joblib.dump(model, filename)

# Load the model from disk
loaded_model = joblib.load(filename)


In [None]:
# Preprocess the test data
v_data_vectorized = vectorizer.transform(v_data[3].values)

# Make predictions on the test data
v_data_predictions = loaded_model.predict(v_data_vectorized)
v_data_predictions


# Testing on validation data:

In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(v_data[2].values, v_data_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(v_data[2].values, v_data_predictions, average='weighted')
print("Precision:", precision)

# Calculate recall
recall = recall_score(v_data[2].values, v_data_predictions, average='weighted')
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(v_data[2].values, v_data_predictions, average='weighted')
print("F1 Score:", f1)


Accuracy: 0.822
Precision: 0.8325439927755617
Recall: 0.822
F1 Score: 0.8212447500879179
