# Import all the required libraries

In [168]:
import pandas as pd
import nltk
import spacy
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import GridSearchCV
import spacy
from afinn import Afinn
import pickle
seed=0

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahub\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text exploration and preprocess

In [145]:
emails= pd.read_csv(r"email_class.csv",index_col=False)

In [75]:
emails.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [48]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [49]:
# duplicated values
emails.duplicated().sum()

415

In [146]:
# drop duplicates
emails=emails.drop_duplicates()

In [51]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5157 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5157 non-null   object
 1   Message   5157 non-null   object
dtypes: object(2)
memory usage: 120.9+ KB


In [147]:
# define a function to preprocess the text
def preprocess_text(row):
    # remove white space
    row=row.strip()
    # Lowercase
    row=row.lower()
    #remove punctuation
    row= ''.join([char for char in row if char not in string.punctuation])
    #tokeization
    tokens=word_tokenize(row)
    # create an instance of english stopwords
    stopwordss=stopwords.words('english')
    #remove stopwords
    tokens=[x for x in tokens if not x in stopwordss]
    # create an instance of stem
    ps=PorterStemmer()
    # stem the tokens
    stems=[ps.stem(x) for x in tokens]
    return " ".join(stems)

# apply the function to all the rows
emails['preprocessed_Message']=emails['Message'].apply(preprocess_text)

In [78]:
# preview of data
emails.head(3)

Unnamed: 0,Category,Message,preprocessed_Message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...


In [177]:
# crate the tfidf vectorizer
tfidf=TfidfVectorizer(max_features=5000)
# fit the data and store it into X
x=tfidf.fit_transform(emails['preprocessed_Message'])
# output 
y=emails['Category']
x.shape

(5157, 5000)

In [178]:
with open("tfidf.pkl",'wb') as file:
    pickle.dump(tfidf,file)

# Naive Bayes- Text classification model Building:

In [149]:
# split the data
train_x,test_x, train_y, test_y= train_test_split(x,y,test_size=0.3, random_state=seed)

In [150]:
#Hyperparameter-tuning
param_grid={'alpha':[0.1, 0.5, 1.0, 2.0, 5.0]}

# Gridsearch
nb=MultinomialNB()
grid_cv=GridSearchCV(nb,param_grid,cv=5)
grid_cv.fit(train_x,train_y)
print("best params;",grid_cv.best_params_)

# final model
nb_model=grid_cv.best_estimator_

best params; {'alpha': 0.1}


# Evaluation

In [151]:
# predictions for train and test data
train_pred=nb_model.predict(train_x)
test_pred=nb_model.predict(test_x)

# function for evaluation
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred,average='weighted')
    recall = recall_score(y_true, y_pred,average='weighted')
    f1 = f1_score(y_true, y_pred,average='weighted')
    return accuracy, precision, recall, f1

train_metrics = evaluate_model(train_y, train_pred)
test_metrics = evaluate_model(test_y, test_pred)

metric=['Accuracy','Precision','Recall','F1_score']

# Compile results into a DataFrame
results = pd.DataFrame({'Metrics':['Accuracy','Precision','Recall','F1_score'],
    'Train_data': [train_metrics[0],train_metrics[1],train_metrics[2],train_metrics[3]],
    'Test_data': [test_metrics[0],test_metrics[1],test_metrics[2],test_metrics[3]]})

print(results)

     Metrics  Train_data  Test_data
0   Accuracy    0.995012   0.980620
1  Precision    0.995005   0.980370
2     Recall    0.995012   0.980620
3   F1_score    0.994983   0.980404


In [152]:
results_df = pd.DataFrame({
    'True_Label': train_y,  # The true labels from the training set
    'Predicted_Label': train_pred  # The predicted labels from the model
})
results_df.tail()

Unnamed: 0,True_Label,Predicted_Label
5320,ham,ham
3462,ham,ham
1711,ham,ham
2735,ham,ham
2877,ham,ham


In [170]:
# save the model
with open('NBModel.pkl','wb') as file:
    pickle.dump(nb_model,file)

# Sentiment Analysis:
- Evaluate the sentiment of the text using four different methods: AFINN, VADER, NRC (National Research Council Emotion Lexicon), and TextBlob.

## Afinn
- Lexicon-based: AFINN uses a pre-built list of words where each word has been assigned a sentiment score between -5 (most negative) and +5 (most positive).
- It is straightforward: The sentiment score for a sentence is the sum of the individual word scores. The overall sentiment is determined by summing the values of these words.
- No handling of negation or context: AFINN does not take into account negations (e.g., “not good”) or more complex syntactic elements like context or sentence structure. So, "not bad" and "bad" would both get a similar negative score.

In [153]:
import spacy
from afinn import Afinn

nlp= spacy.load('en_core_web_sm')
afinn=Afinn()
def afinnAnalyzer(text):
    sentence=nlp(text)
    sent_score_pos = 0
    sent_score_neg = 0
    sent_score_neutral = 0
    for token in sentence:
        word = token.text
        sent_score = afinn.score(word)
            
        if sent_score > 0:
            sent_score_pos += sent_score
        elif sent_score < 0:
            sent_score_neg += sent_score
        else:
            sent_score_neutral += sent_score
    sentiment= sent_score_pos+sent_score_neg+sent_score_neutral
    if sentiment>0:
        return 'positive'
    elif sentiment<0:
        return 'Negative'
    else:
        return 'Neutral'
    return sentiment

In [159]:
sentiments=pd.DataFrame()
sentiments['Message']=emails['Message']
sentiments['Afinn']=emails['Message'].apply(afinnAnalyzer)

## vader Analysis

- Lexicon-based but designed to handle social media text: VADER uses a lexicon where words are scored between -4 and +4
- Negation: “not good” would be recognized and adjusted to reflect a more neutral sentiment, unlike AFINN.
- Punctuation and intensity: VADER increases the sentiment weight for words in all caps (“AWESOME”) and words followed by exclamation marks (“great!!!”).
#### VADER provides four types of scores
- 'neg': The negative sentiment score (ranges between 0 and 1).
- 'neu': The neutral sentiment score.
- 'pos': The positive sentiment score.
- 'compound': The overall sentiment score, which is a normalized value between -1 (most negative) and +1 (most positive).

In [100]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sahub\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [160]:
def vaderAnalyzer(text):
    vader_analyzer=SentimentIntensityAnalyzer()
    sentencescore=vader_analyzer.polarity_scores(text)
    if sentencescore['compound']>0:
        return "Positive"
    elif sentencescore['compound']<0:
        return "Negative"
    else:
        return "Neutral"

sentiments['Vader']=emails['Message'].apply(vaderAnalyzer)

## NRC (National Research Council) 

- The NRC (National Research Council) Emotion Lexicon is a lexicon-based tool that assigns emotions and sentiments to words.
- Emotions are Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust and sentiment (positive or negative)
- Unlike VADER, which directly calculates sentiment scores for entire sentences, NRC focuses on tagging individual words with emotions. 

In [120]:
!pip install nrclex
from nrclex import NRCLex

Defaulting to user installation because normal site-packages is not writeable


In [161]:
def nrc_analyzer(text):
    emotion = NRCLex(text)
    return emotion.top_emotions[0][0]

sentiments['nrc_emotions']=emails['Message'].apply(nrc_analyzer)

In [162]:
sentiments

Unnamed: 0,Message,Afinn,Vader,nrc_emotions
0,"Go until jurong point, crazy.. Available only ...",positive,Positive,anger
1,Ok lar... Joking wif u oni...,Neutral,Positive,fear
2,Free entry in 2 a wkly comp to win FA Cup fina...,positive,Positive,positive
3,U dun say so early hor... U c already then say...,Neutral,Neutral,negative
4,"Nah I don't think he goes to usf, he lives aro...",Neutral,Negative,fear
...,...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,positive,Positive,positive
5568,Will ü b going to esplanade fr home?,Neutral,Neutral,fear
5569,"Pity, * was in mood for that. So...any other s...",Negative,Negative,fear
5570,The guy did some bitching but I acted like i'd...,positive,Positive,positive


## TextBlob

- TextBlob is a simple tool for basic sentiment analysis. It works well for detecting general sentiment and subjectivity in text but doesn't handle complex emotions or slang like VADER or NRC
- Polarity ranges from -1 (very negative) to +1 (very positive). It Shows whether the message is positive or negative
- Subjectivity ranges from 0 (very objective) to 1 (very subjective). It Shows how subjective the message is, whether the text is subjective or objective)

In [125]:
!pip install textblob
from textblob import TextBlob

Defaulting to user installation because normal site-packages is not writeable


In [163]:
def blob_analyzer(text):
    blob= TextBlob(text)
    polarity=blob.sentiment.polarity
    if polarity>0:
        text='Positive'
        return pd.Series([text,blob.sentiment.subjectivity])
    elif polarity<0:
        return pd.Series(["Negative",blob.sentiment.subjectivity])
    else:
        return pd.Series(["Neutral",blob.sentiment.subjectivity])

In [164]:
sentiments[['blob_sentiment', 'blob_Subjectivity']]=emails['Message'].apply(blob_analyzer)

In [166]:
sentiments.head(10)

Unnamed: 0,Message,Afinn,Vader,nrc_emotions,blob_sentiment,blob_Subjectivity
0,"Go until jurong point, crazy.. Available only ...",positive,Positive,anger,Positive,0.7625
1,Ok lar... Joking wif u oni...,Neutral,Positive,fear,Positive,0.5
2,Free entry in 2 a wkly comp to win FA Cup fina...,positive,Positive,positive,Positive,0.55
3,U dun say so early hor... U c already then say...,Neutral,Neutral,negative,Positive,0.3
4,"Nah I don't think he goes to usf, he lives aro...",Neutral,Negative,fear,Neutral,0.0
5,FreeMsg Hey there darling it's been 3 week's n...,positive,Positive,positive,Positive,0.233333
6,Even my brother is not like to speak with me. ...,positive,Negative,positive,Neutral,0.0
7,As per your request 'Melle Melle (Oru Minnamin...,Neutral,Positive,negative,Neutral,0.0
8,WINNER!! As a valued network customer you have...,positive,Positive,positive,Neutral,1.0
9,Had your mobile 11 months or more? U R entitle...,positive,Positive,anticipation,Positive,0.75


- For Social Media or Informal Text: VADER is often the best choice because it handles slang, punctuation, and emojis.- 
For Analyzing Emotions Beyond Sentiment: NRC is useful if you’re interested in emotional categories like joy, fear, anger, etc
-  
For Speed and Simplicity: AFINN is the fastest and simplest, making it good for basic tasks where precision isn’t crucia
-  .
For General Sentiment and Subjectivity: TextBlob is a well-rounded choice, especially if you also need to perform other NLP tasks, like part-of-speech tagging or noun phrase extraction.

### The best method for sentiment analysis in this context is VADER, as it effectively handles informal language, which is common in these emails.