In [None]:
#  Aisha Nawaz 

# Sentiment Analysis on Social Media Comments
# Client Name: SocialBuzz Inc. [Fictional]
# Company Name: TextInsight AI [Fictional]

# Description: SocialBuzz Inc. is a social media marketing agency looking to gauge public sentiment 
# towards their clients' brands. They have approached TextInsight AI to develop a sentiment analysis 
# model capable of analyzing social media comments and identifying whether the sentiment towards a brand
# or product is positive, negative, or neutral.

# Dataset: Sentiment Analysis for Social Media Comments (Online Dataset)
# For this project, you can use publicly available datasets from platforms like Kaggle, Twitter API,
# or other social media platforms that provide labeled data for sentiment analysis.

# Steps:

In [1]:
# 1)  Data Collection: Acquire a dataset containing social media comments along with their sentiment
# labels (positive, negative, or neutral).

# I OBTAINED DATASET FROM :https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset
import pandas as pd
dataset=pd.read_csv('datasetw5d4.csv')
dataset.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [2]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [3]:
# 2) Data Cleaning: Preprocess the text data by removing special characters, stop words, and performing tokenization.
dataset.isnull().sum()        #Finding count of null values
dataset.dropna(inplace=True) #Dropping null values found above

In [5]:
#I want to keep the cleaned data seperate so I am copying into new dataset for further steps
datanew=dataset.copy() 

In [7]:
# Removing stop words , Special characters & Performing tokenization using nltk built-in library

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def cleaner(sentences):
    sentences=str(sentences) #Converting input to string type
    
    # Removing stop words first:
    stopWords = set(stopwords.words('english'))  #Obtaining all stop words in english language
    inputWordTokens = word_tokenize(sentences)   #Tokenzing the input sentence passed
    
    # Converting words to lowercase to check if it is present in list of stopwords
    CleanedSentenceV1 = [word.lower() for word in inputWordTokens if word.lower() not in stopWords]
      
    # Removing special characters
    translationTable = str.maketrans('', '', string.punctuation) #obtaining translation table to help remove special chars
    
    results=[]
    for word in CleanedSentenceV1:
        translated=word.translate(translationTable) ##Applying the translation table to each word in input sentence 
        if(translated):
            results.append(translated) #Appending it to result list (if there is anything left)
            
            
    results = ' '.join(results) #Converting the result list into string form to return
    return results                  
       
#Creating a new column  'tokenised' to save the newly cleaned text versions
datanew['Tokenised']=datanew['clean_text'].apply(cleaner)
datanew.tail() #To visualize results

Unnamed: 0,clean_text,category,Tokenised
162975,why these 456 crores paid neerav modi not reco...,-1.0,456 crores paid neerav modi recovered congress...
162976,dear rss terrorist payal gawar what about modi...,-1.0,dear rss terrorist payal gawar modi killing 10...
162977,did you cover her interaction forum where she ...,0.0,cover interaction forum left
162978,there big project came into india modi dream p...,0.0,big project came india modi dream project happ...
162979,have you ever listen about like gurukul where ...,1.0,ever listen like gurukul discipline maintained...


In [8]:
# 3) Feature Extraction: Use techniques like TF-IDF or word embeddings to convert text data into numerical representations.

#NOTE: I AM USING THE TECHNIQUE "Frequency-Inverse Document Frequency":

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib 

def verctorizeIT(data): 
    vectorizer = TfidfVectorizer() #intializing it
    
    #Training 
    vectorizer.fit(data)   
    
    # Saving trained vectorizer for later use
    joblib.dump(vectorizer, "vectorizer.pkl")
    
    #Converting text into numerical representations & returning it
    return vectorizer.transform(data)
   
vectors_X=verctorizeIT(datanew['Tokenised'])           #This is x-variable
variable_Y=datanew['category']                         #This is y-variable

print("X: ",vectors_X[0]," Y: ",variable_Y[0]) #Printing first values to see variables

X:    (0, 105462)	0.13205919442781192
  (0, 94728)	0.2597606140610556
  (0, 93782)	0.1471128844177335
  (0, 91057)	0.34089487583722683
  (0, 80394)	0.3455799718157834
  (0, 77499)	0.29032480843432995
  (0, 76893)	0.18242154961866305
  (0, 62445)	0.0366348236984051
  (0, 61601)	0.2073981288388287
  (0, 60281)	0.23809359712369188
  (0, 51950)	0.22293922558863355
  (0, 51322)	0.17011977351496282
  (0, 40506)	0.13783853251609857
  (0, 40478)	0.2109793533817779
  (0, 39378)	0.13830622626613456
  (0, 34686)	0.22172681719264056
  (0, 34621)	0.27640487683701465
  (0, 29330)	0.22374422023461862
  (0, 17899)	0.19868875839963557
  (0, 13679)	0.25062492796726993  Y:  -1.0


In [9]:
# 4)  Model Selection: Choose a machine learning or deep learning model (e.g., Naive Bayes, LSTM) for sentiment classification.
# Tools: Python, Jupyter Notebook, Pandas, Matplotlib, NLTK, Scikit-learn, TensorFlow/Keras

#NOTE: I AM USING Naive Bayes FOR THIS PART.

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import joblib

#Splitting the datset into train & test: 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(vectors_X, variable_Y, test_size=0.2, random_state=42) 

#Initializing NB classifier
classifier = MultinomialNB()
#Training it
classifier.fit(X_train, y_train)

#Saving classifier for later use
joblib.dump(classifier, "SentimentAnalysisModel_w5d4.pkl")

['SentimentAnalysisModel_w5d4.pkl']

In [11]:
# Predicting on test data now:
y_pred = classifier.predict(X_test)
print(X_test[0:5],"Prediction: ",y_pred[0:5]) #Seeing first 5 predictions raw results

  (0, 97980)	0.4271501673145705
  (0, 67001)	0.24272965826364015
  (0, 66011)	0.20672158148764047
  (0, 62445)	0.0570055059880635
  (0, 36819)	0.4703088724272155
  (0, 18452)	0.5203898831287828
  (0, 10813)	0.3948873289821749
  (0, 6044)	0.2543806971045976
  (1, 92837)	0.3554917045584327
  (1, 72642)	0.10348589925964967
  (1, 65836)	0.10648779544621347
  (1, 64979)	0.2415662206115273
  (1, 62445)	0.03387738327757818
  (1, 59465)	0.22893386820471173
  (1, 51589)	0.20599612032913947
  (1, 45087)	0.20559296045786132
  (1, 35152)	0.6714901320521551
  (1, 32161)	0.22893386820471173
  (1, 30257)	0.23176081898361262
  (1, 23533)	0.10646183577298962
  (1, 15657)	0.10275090020279957
  (1, 6280)	0.19438417349412632
  (1, 5465)	0.19560495773740746
  (2, 105761)	0.18084217033037123
  (2, 105462)	0.12191404072905129
  :	:
  (3, 9910)	0.19602719481078243
  (3, 5080)	0.20394456326594848
  (3, 4810)	0.17955605692283771
  (4, 105462)	0.1121937365216682
  (4, 98681)	0.1521595712901906
  (4, 97258)	0.155

In [13]:
# 5) Model Evaluation: Evaluate the model's performance using metrics such as accuracy, precision, recall, and F1-score.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > EVALUATING MODEL\'S PERFORMANCE < ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print("Accuracy--->>", round(accuracy_score(y_test, y_pred),3),"%")
print("Precision--->>", round(precision_score(y_test, y_pred,average='weighted'),3),"%")
print("Recall--->>", round(recall_score(y_test, y_pred,average='weighted'),3),"%")
print("F1-score--->>", round(f1_score(y_test, y_pred,average='weighted'),3),"%")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > EVALUATING MODEL'S PERFORMANCE < ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Accuracy--->> 0.576 %
Precision--->> 0.733 %
Recall--->> 0.576 %
F1-score--->> 0.514 %


In [21]:
# 6) Deployment: Deploy the sentiment analysis model as an API that SocialBuzz Inc.
# can use to analyze social media comments and track brand sentiment.

from flask import Flask, request, jsonify, render_template
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


def getResults(predictions): #Basically further clarifies the model's predictions
    #Prediction of 0 Indicates  Neutral , 1 Indicates Postive,-1 Indicates Negative 
    clearResults=[]
    
    for pred in predictions:
        if (pred==0):
            clearResults.append('Neutral')
        elif (pred==1):
            clearResults.append('Positive')
        elif (pred==-1):
            clearResults.append('Negative')
            
    return clearResults
            
    
# Initializing Flask app
app = Flask(__name__)

# Loading the model & vectorizer saved above
model = joblib.load("SentimentAnalysisModel_w5d4.pkl")
vectorizer = joblib.load("vectorizer.pkl")

@app.route('/', methods=['POST', 'GET'])
def sentimentAnalysis():
    if request.method == 'POST':
        try:
            #Getting comments from html & splitting them 
            rawComments=request.form['comments'].split(",") 
            
            comments = [comment.strip() for comment in rawComments]  #Removing spaces from each comment
            
            #Calling the cleaner function i made above to preprocess each comment in the list
            preprocessedComments = [cleaner(comment) for comment in comments]
            
            #Using the trained vectorizer I loaded above to vectorize each preprocessed comment in the list
            vectors=vectorizer.transform(preprocessedComments)

            #Making predictions
            predictions = model.predict(vectors)
           
            sentimentLabels = getResults(predictions) #Clarifying results 
            
            return render_template('model.html', sentiments=sentimentLabels) #Sending response back to web/html page

        except Exception as e:
            return jsonify({'error': str(e)}), 400

    else:
        return render_template('model.html')

if __name__ == '__main__':
    app.run(debug=True, port=8000,use_reloader=False) 

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:8000
Press CTRL+C to quit
127.0.0.1 - - [28/Jul/2023 19:43:15] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Jul/2023 19:43:55] "POST / HTTP/1.1" 200 -
