In [None]:
import os

import pandas as pd

# Location of the tweets CSV. The default is the machine-specific absolute
# path this notebook was written with; set the TWEETS_CSV environment variable
# to point at your own copy instead of editing the notebook.
TWEETS_CSV = os.environ.get("TWEETS_CSV", "C:/Users/z1xtr/Downloads/Naive_bayes/Naive_bayes/tweets.csv")
df = pd.read_csv(TWEETS_CSV)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Column holding the text that will be cleaned and vectorised.
tweet = df["tweet"]

In [None]:
# Column holding the target used to train and score the classifiers.
label = df["label"]

In [None]:
# Preview the first five tweets.
tweet.head(n=5)

In [None]:
# Preview the first five labels.
label.head(n=5)

In [None]:
import string
# Show the standard library's ASCII punctuation characters; the same string is
# used further down to build the punctuation-stripping regex.
string.punctuation

In [None]:
import re
# Escape the punctuation characters so none of them is read as regex syntax,
# then wrap them in a character class matching any single one of them.
escaped_punct = re.escape(string.punctuation)
pattern = '[{}]'.format(escaped_punct)
pattern

In [None]:
import nltk
# Load NLTK's English stop-word list and print it for inspection.
# NOTE(review): presumably needs the 'stopwords' corpus to be downloaded
# beforehand (e.g. via nltk.download) — confirm on a fresh environment.
stw = nltk.corpus.stopwords.words('english')
print(stw)

# Defining a function for text pre-processing

In [None]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# converting the text into all lower cases

def text_lower(text):
    """Lower-case every string in ``text``.

    Parameters
    ----------
    text : iterable of str
        Raw texts, one string per item.

    Returns
    -------
    pd.Series
        The lower-cased strings, in input order.
    """
    lowered = []
    for sents in text:
        lowered.append(sents.lower())
    return pd.Series(lowered)

# word tokenization

def word_tkns(text):
    """Split every string in ``text`` into word tokens with NLTK.

    Parameters
    ----------
    text : iterable of str
        Texts to tokenise, one string per item.

    Returns
    -------
    pd.Series
        One list of tokens per input string, in input order.
    """
    tokenize = nltk.tokenize.word_tokenize
    return pd.Series(list(map(tokenize, text)))

# removing punctuation

def punct_removal(text):
    """Strip punctuation characters from every token and drop emptied tokens.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.

    Returns
    -------
    pd.Series
        One list of cleaned tokens per sentence, in input order.
    """
    # Bound ``sub`` of a character class covering every string.punctuation char.
    strip_punct = re.compile('[{}]'.format(re.escape(string.punctuation))).sub

    cleaned = []
    for sents in text:
        stripped = (strip_punct('', tok) for tok in sents)
        # Tokens made up solely of punctuation become '' and are discarded.
        cleaned.append([tok for tok in stripped if tok])
    return pd.Series(cleaned)

# stopword removal
# remember not to remove negations (e.g. "not", "no") when they matter for the task, e.g. sentiment analysis

def remove_stopwords(text, keep=()):
    """Drop English stop words from every tokenised sentence.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.
    keep : iterable of str, optional
        Stop words that should NOT be removed, e.g. negations such as
        ``("not", "no")``. Pass a collection of words, not a bare string.
        Defaults to an empty tuple, i.e. every word in NLTK's list is removed.

    Returns
    -------
    pd.Series
        One list of surviving tokens per sentence, in input order.
    """
    # A set gives constant-time membership tests; the plain NLTK list would be
    # scanned from the start for every single token.
    stopwords = set(nltk.corpus.stopwords.words('english')).difference(keep)
    clean_sents2 = pd.Series([[x for x in sents if x not in stopwords] for sents in text])
    return clean_sents2


## correcting words like spelling mistakes, repeated characters, etc. 

from nltk.corpus import wordnet as wn
def remove_repeated_character(text):
    """Shorten tokens containing doubled characters until WordNet knows them.

    A token that WordNet already recognises is left untouched. Otherwise a
    doubled character is reduced to a single one and the check is repeated,
    stopping as soon as the token is recognised or no doubled character is left.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.

    Returns
    -------
    pd.Series
        One list of corrected tokens per sentence, in input order.
    """
    # A word character immediately followed by itself, plus its surroundings.
    doubled = re.compile(r'(\w*)(\w)\2(\w*)')
    keep_one = r'\1\2\3'

    def replace(tkn):
        # Keep shortening while WordNet has no entry for the current form.
        while not wn.synsets(tkn):
            squeezed = doubled.sub(keep_one, tkn)
            if squeezed == tkn:
                # Nothing left to collapse; give up and keep the token as is.
                break
            tkn = squeezed
        return tkn

    corrected = []
    for sents in text:
        corrected.append([replace(tn) for tn in sents])
    return pd.Series(corrected)

## PoS tagging

def pos_tag_text(text):
    """Tag every token with the WordNet part of speech the lemmatizer expects.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.

    Returns
    -------
    pd.Series
        One list of ``(token, wordnet_pos)`` pairs per sentence, where
        ``wordnet_pos`` is ``wn.NOUN``, ``wn.VERB``, ``wn.ADJ``, ``wn.ADV`` or
        ``None`` for any other tag.
    """
    # The tagger is asked for the *universal* tagset, whose tags are whole words
    # ('NOUN', 'VERB', 'ADJ', 'ADV', 'NUM', ...). The previous Penn-Treebank
    # prefix test ('J' for adjectives, 'R' for adverbs) never matched 'ADJ' or
    # 'ADV', so those tokens were never given a WordNet tag; it also sent 'NUM'
    # to noun because of the shared 'N' prefix. Map the universal tags directly.
    universal_to_wn = {
        'NOUN': wn.NOUN,
        'VERB': wn.VERB,
        'ADJ': wn.ADJ,
        'ADV': wn.ADV,
    }

    tkn_tagged = [nltk.pos_tag(sents, tagset='universal') for sents in text]

    # dict.get returns None for every tag that has no WordNet counterpart.
    tkn_tagged_wn = pd.Series([[(token, universal_to_wn.get(tag)) for token, tag in sents] for sents in tkn_tagged])
    return tkn_tagged_wn
                             
## Lemmatization using WordNetLemmatizer

def wnl(text):
    """Lemmatize POS-tagged tokens with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    text : iterable of list of (str, str or None)
        Sentences given as ``(token, wordnet_pos)`` pairs.

    Returns
    -------
    pd.Series
        One list of lemmas per sentence, in input order.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    def lemma_of(word, tag):
        # Without a tag, rely on the lemmatizer's default part of speech.
        if tag:
            return lemmatize(word, tag)
        return lemmatize(word)

    return pd.Series([[lemma_of(word, tag) for word, tag in sents] for sents in text])

# Converting each element in the series from a list of tokens to a single string

def list_to_string(text):
    """Join every token list into one space-separated string.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.

    Returns
    -------
    pd.Series
        One string per sentence, in input order.
    """
    joined = list(map(' '.join, text))
    return pd.Series(joined)
    
# Defining a function for text pre-processing

def preprocessed_text(text):
    """Run the complete cleaning pipeline over a collection of raw texts.

    The stages run in this order: lower-casing, word tokenisation, punctuation
    removal, stop-word removal, repeated-character correction, POS tagging,
    lemmatisation, and re-joining the tokens into one string per text.

    Parameters
    ----------
    text : iterable of str
        Raw texts, one string per item.

    Returns
    -------
    pd.Series
        One cleaned string per input text.
    """
    stages = (
        text_lower,
        word_tkns,
        punct_removal,
        remove_stopwords,
        remove_repeated_character,
        pos_tag_text,
        wnl,
        list_to_string,
    )

    # Each stage consumes the output of the one before it.
    result = text
    for stage in stages:
        result = stage(result)
    return result



## Testing the function preprocessed_text(text) on tweets

In [None]:
# Run the full cleaning pipeline over the raw tweets and show the first five results.
tweet_c = preprocessed_text(tweet)
print(tweet_c[:5])

# FEATURE EXTRACTION USING TF-IDF MODEL

### Creating train and test partition

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned tweets for testing; the fixed seed keeps the
# shuffled split reproducible between runs.
tweet_train, tweet_test, label_train, label_test = train_test_split(
    tweet_c,
    label,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Preview the first five training tweets.
tweet_train.head(n=5)

In [None]:
# Preview the first five training labels.
label_train.head(n=5)

# Extracting features using TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with smoothed inverse document frequencies and
# L2-normalised output rows
tfidfvectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, norm='l2')

# learn the vocabulary and IDF weights from the training tweets only, then
# reuse them to encode the held-out tweets
features_train_t = tfidfvectorizer.fit_transform(tweet_train)
features_test_t = tfidfvectorizer.transform(tweet_test)

# dense copies of both matrices
features_train_tm = features_train_t.toarray()
features_test_tm = features_test_t.toarray()

# feature names of the fitted vectorizer; preview the first 50
feature_names_t = tfidfvectorizer.get_feature_names_out()
print(feature_names_t[:50])

In [None]:
# wrap the dense TF-IDF matrices in DataFrames whose columns are the feature names
df_train_t = pd.DataFrame(features_train_tm, columns=feature_names_t)
df_test_t = pd.DataFrame(features_test_tm, columns=feature_names_t)

# preview the top-left 5 x 10 corner of each frame, train first
for frame in (df_train_t, df_test_t):
    print(frame.iloc[:5, :10])

## Implement Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# fit a multinomial Naive Bayes model on the TF-IDF training features
# (fit returns the estimator itself, so the fitted model is kept under the same name)
multinomialnb = MultinomialNB().fit(features_train_t, label_train)

# predict the held-out tweets
predictions_t = multinomialnb.predict(features_test_t)

# share of held-out tweets whose predicted label equals the true label
as_t = accuracy_score(label_test, predictions_t)
print('accuracy score:', as_t)

# Confusion matrix

In [None]:
# confusion matrix of the multinomial Naive Bayes predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels)
from sklearn import metrics
cm_multiNB = pd.DataFrame(metrics.confusion_matrix(label_test, predictions_t))
cm_multiNB

In [None]:
# Held-out tweets whose true label is 1 but which were predicted as 0
# (the original run recorded 359 such rows).
df_predictions_check = pd.DataFrame({'tweet' : tweet_test, 'predicted_label' : predictions_t, 'label_t' : label_test})

# Combine both conditions into a single mask. Chaining df[mask_a][mask_b]
# applies a full-length mask to an already filtered frame, which makes pandas
# reindex the mask and emit a UserWarning.
missed_class_1 = (df_predictions_check['label_t'] == 1) & (df_predictions_check['predicted_label'] == 0)
df_predictions_check.loc[missed_class_1]

In [None]:
# Show the original row at position 7539.
# NOTE(review): presumably one of the misclassified tweets listed above — confirm.
df.iloc[7539]

In [None]:
# Show the original row at position 554.
# NOTE(review): presumably one of the misclassified tweets listed above — confirm.
df.iloc[554]

## Implement Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# fit a Bernoulli Naive Bayes model on the TF-IDF training features
# (fit returns the estimator itself), then predict the held-out tweets
bernoullinb2 = BernoulliNB().fit(features_train_t, label_train)
predictions_bnb2 = bernoullinb2.predict(features_test_t)

# share of held-out tweets whose predicted label equals the true label
as_bnb2 = accuracy_score(label_test, predictions_bnb2)
print('accuracy score:', as_bnb2)

## Confusion matrix

In [None]:
# confusion matrix of the Bernoulli Naive Bayes predictions
pd.DataFrame(metrics.confusion_matrix(label_test, predictions_bnb2))

# Support Vector Machine (SVM) classifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Linear classifier trained with stochastic gradient descent. Training is
# randomised, so pin the seed (same value as the train/test split) to get the
# same scores on every run.
svm1 = SGDClassifier(random_state=42)
svm1.fit(features_train_t, label_train)
predictions_svm1 = svm1.predict(features_test_t)

# `metrics` is the sklearn module imported in the earlier confusion-matrix cell.
print('accuracy score:', metrics.accuracy_score(label_test, predictions_svm1), '\nprecision score:', metrics.precision_score(label_test, predictions_svm1))

## Confusion matrix

In [None]:
# confusion matrix of the SGD-trained classifier's predictions
pd.DataFrame(metrics.confusion_matrix(label_test, predictions_svm1))