In [1]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
import numpy as np

In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')

Read the Data

In [3]:
twitter_2013dev_a = pd.read_csv('dataset/twitter-2013dev-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2013test_a = pd.read_csv('dataset/twitter-2013test-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2013train_a = pd.read_csv('dataset/twitter-2013train-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2014sarcasm_a = pd.read_csv('dataset/twitter-2014sarcasm-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2014test_a = pd.read_csv('dataset/twitter-2014test-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2015test_a = pd.read_csv('dataset/twitter-2015test-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2015train_a = pd.read_csv('dataset/twitter-2015train-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2016dev_a = pd.read_csv('dataset/twitter-2016dev-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2016devtest_a = pd.read_csv('dataset/twitter-2016devtest-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])
twitter_2016test_a = pd.read_csv('dataset/twitter-2016test-A.txt', delimiter="\t", names=["id", "sentiment", "sentence", "dates"])
twitter_2016train_a = pd.read_csv('dataset/twitter-2016train-A.txt', delimiter="\t", names=["id", "sentiment", "sentence"])

# data_df = pd.DataFrame()
# data_df = pd.concat([twitter_2013dev_a, twitter_2013test_a, twitter_2013train_a, twitter_2014sarcasm_a, twitter_2014test_a,
#                      twitter_2015test_a, twitter_2015train_a, twitter_2016dev_a, twitter_2016devtest_a,
#                      twitter_2016test_a[["id", "sentiment", "sentence"]], twitter_2016train_a])

data_df = pd.concat([twitter_2013dev_a, twitter_2013test_a, twitter_2013train_a])
data_df.head()

Unnamed: 0,id,sentiment,sentence
0,260097528899452929,neutral,Won the match #getin . Plus\u002c tomorrow is ...
1,263791921753882624,neutral,Some areas of New England could see the first ...
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...
3,264041328420204544,neutral,#Thailand Washington - US President Barack Oba...
4,263816256640126976,neutral,Did y\u2019all hear what Tony Romo dressed up ...


Check for null values on the data

In [4]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14885 entries, 0 to 9683
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         14885 non-null  int64 
 1   sentiment  14885 non-null  object
 2   sentence   14885 non-null  object
dtypes: int64(1), object(2)
memory usage: 465.2+ KB


In [5]:
data_df.isnull().values.any()

False

Drop the ID column and reset the indexes

In [6]:
data_df.drop("id", axis=1, inplace=True)
data_df.reset_index(inplace=True, drop=True)
data_df.head()

Unnamed: 0,sentiment,sentence
0,neutral,Won the match #getin . Plus\u002c tomorrow is ...
1,neutral,Some areas of New England could see the first ...
2,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...
3,neutral,#Thailand Washington - US President Barack Oba...
4,neutral,Did y\u2019all hear what Tony Romo dressed up ...


In [7]:
def remove_irr_char(sentence):
    """
        This function removes irrelevant characters such as punctuations and etc.
        Accepts a string sentence
        Returns a string cleaned sentence
    """
    word_list = []
    tokenized_sent = nltk.word_tokenize(sentence)
    for i in tokenized_sent:
        word_list.append(''.join(j for j in i if i.isalnum()))

    word_list = list(filter(None,  word_list))

    return ' '.join(word_list).lower()

In [8]:
data_df["cleaned_sentence"] = data_df.apply(lambda x: remove_irr_char(x[1]), axis=1)
data_df.head()

Unnamed: 0,sentiment,sentence,cleaned_sentence
0,neutral,Won the match #getin . Plus\u002c tomorrow is ...,won the match getin tomorrow is a very busy wi...
1,neutral,Some areas of New England could see the first ...,some areas of new england could see the first ...
2,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...,2nd worst qb definitely tony romo the man who ...
3,neutral,#Thailand Washington - US President Barack Oba...,thailand washington us president barack obama ...
4,neutral,Did y\u2019all hear what Tony Romo dressed up ...,did hear what tony romo dressed up as for hall...


In [9]:
def remove_stop_words(sentence):
    """
        This function removes stop words from the cleaned sentence
        Accepts a string
        Returns a string without the stop words
    """
    tokenized = nltk.word_tokenize(sentence)
    stop_words = set({"a", "about", "an", "are", "as", "at", "be", "by", "com", "de", "en", "for", "from", "how", "i", "in",
                      "is", "it", "la", "of", "on", "or", "that", "this", "to", "was", "what", "when", "where", "who", "will",
                      "with", "und", "the", "www"})

    filtered_list = [
        word for word in tokenized if word.casefold() not in stop_words
    ]

    return ' '.join(filtered_list)

In [10]:
data_df["stopwords_remove_sentence"] = data_df.apply(lambda x: remove_stop_words(x[2]), axis=1)
data_df.head()

Unnamed: 0,sentiment,sentence,cleaned_sentence,stopwords_remove_sentence
0,neutral,Won the match #getin . Plus\u002c tomorrow is ...,won the match getin tomorrow is a very busy wi...,won match getin tomorrow very busy awareness a...
1,neutral,Some areas of New England could see the first ...,some areas of new england could see the first ...,some areas new england could see first flakes ...
2,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...,2nd worst qb definitely tony romo the man who ...,2nd worst qb definitely tony romo man likes sh...
3,neutral,#Thailand Washington - US President Barack Oba...,thailand washington us president barack obama ...,thailand washington us president barack obama ...
4,neutral,Did y\u2019all hear what Tony Romo dressed up ...,did hear what tony romo dressed up as for hall...,did hear tony romo dressed up halloween giants...


In [13]:
from nltk.stem import WordNetLemmatizer

In [14]:
lemmatizer = WordNetLemmatizer()

def lemmatize(sentence):
    """
        This function lemmatize the tokens in a sentence to group different variant forms of the same word.
        Accepts a string
        Returns a string sentence in their lemmatize form
    """

    word_list = []
    tokenized_sent = nltk.word_tokenize(sentence)
    for i in tokenized_sent:
        token = lemmatizer.lemmatize(i)
        word_list.append(token)

    return ' '.join(word_list)

In [15]:
data_df["lemmatize_sentence"] = data_df.apply(lambda x: lemmatize(x[3]), axis=1)
data_df.head()

Unnamed: 0,sentiment,sentence,cleaned_sentence,stopwords_remove_sentence,lemmatize_sentence
0,neutral,Won the match #getin . Plus\u002c tomorrow is ...,won the match getin tomorrow is a very busy wi...,won match getin tomorrow very busy awareness a...,won match getin tomorrow very busy awareness a...
1,neutral,Some areas of New England could see the first ...,some areas of new england could see the first ...,some areas new england could see first flakes ...,some area new england could see first flake se...
2,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony...,2nd worst qb definitely tony romo the man who ...,2nd worst qb definitely tony romo man likes sh...,2nd worst qb definitely tony romo man like sha...
3,neutral,#Thailand Washington - US President Barack Oba...,thailand washington us president barack obama ...,thailand washington us president barack obama ...,thailand washington u president barack obama v...
4,neutral,Did y\u2019all hear what Tony Romo dressed up ...,did hear what tony romo dressed up as for hall...,did hear tony romo dressed up halloween giants...,did hear tony romo dressed up halloween giant ...


Let us check for the value distribution of the sentiment in our data

In [16]:
data_df["sentiment"].value_counts()

sentiment
neutral     6838
positive    5690
negative    2357
Name: count, dtype: int64

Let us convert the sentiment column to numerical (1 = Positive, 0 = Neutral, -1 = Negative)

In [17]:
data_df["sentiment"] = np.where((data_df["sentiment"] == "positive"), +1, data_df["sentiment"])
data_df["sentiment"] = np.where((data_df["sentiment"] == "negative"), -1, data_df["sentiment"])
data_df["sentiment"] = np.where((data_df["sentiment"] == "neutral"), 0, data_df["sentiment"])

In [18]:
data_df["sentiment"].value_counts()

sentiment
0     6838
1     5690
-1    2357
Name: count, dtype: int64

Final cleaned data

In [19]:
data_df.head()

Unnamed: 0,sentiment,sentence,cleaned_sentence,stopwords_remove_sentence,lemmatize_sentence
0,0,Won the match #getin . Plus\u002c tomorrow is ...,won the match getin tomorrow is a very busy wi...,won match getin tomorrow very busy awareness a...,won match getin tomorrow very busy awareness a...
1,0,Some areas of New England could see the first ...,some areas of new england could see the first ...,some areas new england could see first flakes ...,some area new england could see first flake se...
2,-1,@francesco_con40 2nd worst QB. DEFINITELY Tony...,2nd worst qb definitely tony romo the man who ...,2nd worst qb definitely tony romo man likes sh...,2nd worst qb definitely tony romo man like sha...
3,0,#Thailand Washington - US President Barack Oba...,thailand washington us president barack obama ...,thailand washington us president barack obama ...,thailand washington u president barack obama v...
4,0,Did y\u2019all hear what Tony Romo dressed up ...,did hear what tony romo dressed up as for hall...,did hear tony romo dressed up halloween giants...,did hear tony romo dressed up halloween giant ...


### Logistic Regression Model without feature engineering using  CountVectorizer

In [20]:
data_mod = data_df[["sentiment", "cleaned_sentence", "stopwords_remove_sentence", "lemmatize_sentence"]]
data_mod.head()

Unnamed: 0,sentiment,cleaned_sentence,stopwords_remove_sentence,lemmatize_sentence
0,0,won the match getin tomorrow is a very busy wi...,won match getin tomorrow very busy awareness a...,won match getin tomorrow very busy awareness a...
1,0,some areas of new england could see the first ...,some areas new england could see first flakes ...,some area new england could see first flake se...
2,-1,2nd worst qb definitely tony romo the man who ...,2nd worst qb definitely tony romo man likes sh...,2nd worst qb definitely tony romo man like sha...
3,0,thailand washington us president barack obama ...,thailand washington us president barack obama ...,thailand washington u president barack obama v...
4,0,did hear what tony romo dressed up as for hall...,did hear tony romo dressed up halloween giants...,did hear tony romo dressed up halloween giant ...


In [21]:
from sklearn.feature_extraction.text import CountVectorizer

Create a bag of words using count vectorizer

In [22]:
vectorizer_count = CountVectorizer(
    min_df = 3,
    ngram_range = (1, 1)
)

vectorizer_count2 = CountVectorizer(
    min_df = 3,
    ngram_range = (1, 1)
)

vectorizer_count3 = CountVectorizer(
    min_df = 3,
    ngram_range = (1, 1)
)

In [23]:
features_lemma = vectorizer_count.fit_transform(
    data_mod["lemmatize_sentence"]
)

features_lemma_nd = features_lemma.toarray()

features_stopwords = vectorizer_count2.fit_transform(
    data_mod["stopwords_remove_sentence"]
)

features_stopwords_nd = features_stopwords.toarray()

features_cleaned = vectorizer_count3.fit_transform(
    data_mod["cleaned_sentence"]
)

features_cleaned_nd = features_cleaned.toarray()

In [24]:
data_labels = []
for i in data_mod["sentiment"]:
    data_labels.append(i)

In [25]:
from sklearn.model_selection import train_test_split

X_train_lemma_cv, X_test_lemma_cv, y_train_lemma_cv, y_test_lemma_cv = train_test_split(
    features_lemma_nd,
    data_labels,
    train_size=0.80,
    random_state=10
)

X_train_stopwords_cv, X_test_stopwords_cv, y_train_stopwords_cv, y_test_stopwords_cv = train_test_split(
    features_stopwords_nd,
    data_labels,
    train_size=0.80,
    random_state=10
)

X_train_cleaned_cv, X_test_cleaned_cv, y_train_cleaned_cv, y_test_cleaned_cv = train_test_split(
    features_cleaned_nd,
    data_labels,
    train_size=0.80,
    random_state=10
)

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
log_model_1 = LogisticRegression(max_iter=1000)
log_model_2 = LogisticRegression(max_iter=1000)
log_model_3 = LogisticRegression(max_iter=1000)

In [28]:
log_model_1 = log_model_1.fit(X_train_lemma_cv, y_train_lemma_cv)
log_model_2 = log_model_2.fit(X_train_stopwords_cv, y_train_stopwords_cv)
log_model_3 = log_model_3.fit(X_train_cleaned_cv, y_train_cleaned_cv)

In [29]:
y_pred_cv1 = log_model_1.predict(X_test_lemma_cv)
y_pred_cv2 = log_model_2.predict(X_test_stopwords_cv)
y_pred_cv3 = log_model_3.predict(X_test_cleaned_cv)

In [30]:
from sklearn.metrics import accuracy_score, classification_report

In [31]:
# Model 1
print(accuracy_score(y_test_lemma_cv, y_pred_cv1))
print(classification_report(y_test_lemma_cv, y_pred_cv1))

0.663419549882432
              precision    recall  f1-score   support

          -1       0.57      0.42      0.48       483
           0       0.67      0.74      0.71      1361
           1       0.68      0.67      0.68      1133

    accuracy                           0.66      2977
   macro avg       0.64      0.61      0.62      2977
weighted avg       0.66      0.66      0.66      2977



In [32]:
# Model 2
print(accuracy_score(y_test_stopwords_cv, y_pred_cv2))
print(classification_report(y_test_stopwords_cv, y_pred_cv2))

0.6627477326167283
              precision    recall  f1-score   support

          -1       0.57      0.42      0.49       483
           0       0.67      0.75      0.71      1361
           1       0.69      0.66      0.67      1133

    accuracy                           0.66      2977
   macro avg       0.64      0.61      0.62      2977
weighted avg       0.66      0.66      0.66      2977



In [33]:
# Model 3
print(accuracy_score(y_test_cleaned_cv, y_pred_cv3))
print(classification_report(y_test_cleaned_cv, y_pred_cv3))

0.6677863621095063
              precision    recall  f1-score   support

          -1       0.60      0.43      0.50       483
           0       0.67      0.75      0.71      1361
           1       0.69      0.67      0.68      1133

    accuracy                           0.67      2977
   macro avg       0.65      0.62      0.63      2977
weighted avg       0.66      0.67      0.66      2977



### Logistic Regression Model without feature engineering using TFIDF Vectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
vectorizer_tfidf = TfidfVectorizer(min_df=3)

vectorizer_tfidf2 = TfidfVectorizer(min_df=3)

vectorizer_tfidf3 = TfidfVectorizer(min_df=3)

In [36]:
features_lemma_tfidf = vectorizer_tfidf.fit_transform(data_mod["lemmatize_sentence"])

features_lemma_tfidf_nd = features_lemma_tfidf.toarray()

features_stopwords_tfidf = vectorizer_tfidf2.fit_transform(data_mod["stopwords_remove_sentence"])

features_stopwords_tfidf_nd = features_stopwords_tfidf.toarray()

features_cleaned_tfidf = vectorizer_tfidf3.fit_transform(data_mod["cleaned_sentence"])

features_cleaned_tfidf_nd = features_cleaned_tfidf.toarray()


In [37]:
X_train_lemma_tfidf, X_test_lemma_tfidf, y_train_lemma_tfidf, y_test_lemma_tfidf  = train_test_split(
        features_lemma_tfidf_nd, 
        data_labels,
        train_size=0.80, 
        random_state=10)

X_train_stopwords_tfidf, X_test_stopwords_tfidf, y_train_stopwords_tfidf, y_test_stopwords_tfidf  = train_test_split(
        features_stopwords_tfidf_nd, 
        data_labels,
        train_size=0.80, 
        random_state=10)

X_train_cleaned_tfidf, X_test_cleaned_tfidf, y_train_cleaned_tfidf, y_test_cleaned_tfidf  = train_test_split(
        features_cleaned_tfidf_nd, 
        data_labels,
        train_size=0.80, 
        random_state=10)

In [38]:
log_model_4 =  LogisticRegression(max_iter=1000)
log_model_5 =  LogisticRegression(max_iter=1000)
log_model_6 =  LogisticRegression(max_iter=1000)

In [39]:
log_model_4 = log_model_4.fit(X_train_lemma_tfidf, y_train_lemma_tfidf)
log_model_5 = log_model_5.fit(X_train_stopwords_tfidf, y_train_stopwords_tfidf)
log_model_6 = log_model_6.fit(X_train_cleaned_tfidf, y_train_cleaned_tfidf)

In [40]:
y_pred_stemmed_tfidf = log_model_4.predict(X_test_lemma_tfidf)
print(accuracy_score(y_test_lemma_tfidf, y_pred_stemmed_tfidf))
print(classification_report(y_test_lemma_tfidf, y_pred_stemmed_tfidf))

0.6711454484380248
              precision    recall  f1-score   support

          -1       0.69      0.30      0.42       483
           0       0.65      0.82      0.72      1361
           1       0.71      0.65      0.68      1133

    accuracy                           0.67      2977
   macro avg       0.68      0.59      0.61      2977
weighted avg       0.68      0.67      0.66      2977



In [41]:
y_pred_stopwords_tfidf = log_model_5.predict(X_test_stopwords_tfidf)
print(accuracy_score(y_test_stopwords_tfidf, y_pred_stopwords_tfidf))
print(classification_report(y_test_stopwords_tfidf, y_pred_stopwords_tfidf))

0.6741686261336917
              precision    recall  f1-score   support

          -1       0.70      0.30      0.42       483
           0       0.65      0.83      0.73      1361
           1       0.72      0.65      0.68      1133

    accuracy                           0.67      2977
   macro avg       0.69      0.59      0.61      2977
weighted avg       0.68      0.67      0.66      2977



In [42]:
y_pred_cleaned_tfidf = log_model_6.predict(X_test_cleaned_tfidf)
print(accuracy_score(y_test_cleaned_tfidf, y_pred_cleaned_tfidf))
print(classification_report(y_test_cleaned_tfidf, y_pred_cleaned_tfidf))

0.6741686261336917
              precision    recall  f1-score   support

          -1       0.69      0.31      0.43       483
           0       0.65      0.82      0.73      1361
           1       0.71      0.65      0.68      1133

    accuracy                           0.67      2977
   macro avg       0.68      0.60      0.61      2977
weighted avg       0.68      0.67      0.66      2977



### Naive Bayes Model without Feature Engineering

In [43]:
from sklearn.naive_bayes import MultinomialNB

#### Using CountVectorizer

In [44]:
multinomialNB = MultinomialNB(force_alpha=True)

multinomialNB.fit(X_train_stopwords_cv, y_train_stopwords_cv)

In [88]:
y_pred_nb_cv = multinomialNB.predict(X_test_stopwords_cv)
print(accuracy_score(y_test_stopwords_cv, y_pred_nb_cv))
print(classification_report(y_test_stopwords_cv, y_pred_nb_cv))

0.6341954988243198
              precision    recall  f1-score   support

          -1       0.51      0.49      0.50       483
           0       0.69      0.63      0.66      1361
           1       0.63      0.70      0.66      1133

    accuracy                           0.63      2977
   macro avg       0.61      0.61      0.61      2977
weighted avg       0.64      0.63      0.63      2977



In [46]:
multinomialNB2 = MultinomialNB(force_alpha=True)

multinomialNB2.fit(X_train_lemma_cv, y_train_lemma_cv)

In [89]:
y_pred_nb_cv2 = multinomialNB2.predict(X_test_lemma_cv)
print(accuracy_score(y_test_lemma_cv, y_pred_nb_cv2))
print(classification_report(y_test_lemma_cv, y_pred_nb_cv2))

0.6348673160900236
              precision    recall  f1-score   support

          -1       0.52      0.49      0.51       483
           0       0.68      0.63      0.66      1361
           1       0.63      0.70      0.66      1133

    accuracy                           0.63      2977
   macro avg       0.61      0.61      0.61      2977
weighted avg       0.64      0.63      0.63      2977



Using TF-IDF Vectorizer

In [90]:
multinomialNB3 = MultinomialNB(force_alpha=True)

multinomialNB3.fit(X_train_stopwords_tfidf, y_train_stopwords_tfidf)

y_pred_nb_tfidf = multinomialNB3.predict(X_test_stopwords_tfidf)
print(accuracy_score(y_test_stopwords_tfidf, y_pred_nb_tfidf))
print(classification_report(y_test_stopwords_tfidf, y_pred_nb_tfidf))

0.6167282499160228
              precision    recall  f1-score   support

          -1       0.85      0.06      0.11       483
           0       0.62      0.77      0.69      1361
           1       0.61      0.67      0.64      1133

    accuracy                           0.62      2977
   macro avg       0.69      0.50      0.48      2977
weighted avg       0.65      0.62      0.57      2977



In [91]:
multinomialNB4 = MultinomialNB(force_alpha=True)

multinomialNB4.fit(X_train_lemma_tfidf, y_train_lemma_tfidf)

y_pred_nb_tfidf2 = multinomialNB4.predict(X_test_lemma_tfidf)
print(accuracy_score(y_test_lemma_tfidf, y_pred_nb_tfidf2))
print(classification_report(y_test_lemma_tfidf, y_pred_nb_tfidf2))

0.6167282499160228
              precision    recall  f1-score   support

          -1       0.86      0.06      0.12       483
           0       0.62      0.77      0.69      1361
           1       0.60      0.67      0.64      1133

    accuracy                           0.62      2977
   macro avg       0.69      0.50      0.48      2977
weighted avg       0.65      0.62      0.57      2977



### Logistic Regression Models with Oversampling

In [50]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [51]:
oversample = RandomOverSampler(sampling_strategy='minority')

X_over, y_over = oversample.fit_resample(features_stopwords_nd, data_labels)
X_over, y_over = oversample.fit_resample(X_over, y_over)
print(Counter(y_over))

Counter({0: 6838, -1: 6838, 1: 6838})


In [52]:
X_train_os, X_test_os, y_train_os, y_test_os  = train_test_split(
        X_over, 
        y_over,
        train_size=0.80, 
        random_state=10)

In [53]:
log_model_7 = LogisticRegression(max_iter=1000)
log_model_7 = log_model_7.fit(X_train_os, y_train_os)

In [54]:
y_pred = log_model_7.predict(X_test_os)
print(accuracy_score(y_test_os, y_pred))
print(classification_report(y_test_os, y_pred))

0.7760175481355106
              precision    recall  f1-score   support

          -1       0.81      0.91      0.85      1411
           0       0.72      0.69      0.70      1318
           1       0.80      0.72      0.76      1374

    accuracy                           0.78      4103
   macro avg       0.77      0.77      0.77      4103
weighted avg       0.77      0.78      0.77      4103



In [55]:
oversample2 = RandomOverSampler(sampling_strategy='minority')

X_over2, y_over2 = oversample2.fit_resample(features_lemma_nd, data_labels)
X_over2, y_over2 = oversample2.fit_resample(X_over2, y_over2)
print(Counter(y_over2))

Counter({0: 6838, -1: 6838, 1: 6838})


In [56]:
X_train_os2, X_test_os2, y_train_os2, y_test_os2  = train_test_split(
        X_over2, 
        y_over2,
        train_size=0.80, 
        random_state=10)

In [57]:
log_model_8 = LogisticRegression(max_iter=1000)
log_model_8 = log_model_8.fit(X_train_os2, y_train_os2)

In [58]:
y_pred2 = log_model_8.predict(X_test_os2)
print(accuracy_score(y_test_os2, y_pred2))
print(classification_report(y_test_os2, y_pred2))

0.7679746526931513
              precision    recall  f1-score   support

          -1       0.81      0.89      0.85      1411
           0       0.72      0.69      0.70      1318
           1       0.77      0.72      0.74      1374

    accuracy                           0.77      4103
   macro avg       0.76      0.77      0.76      4103
weighted avg       0.77      0.77      0.77      4103



In [64]:
oversample3 = RandomOverSampler(sampling_strategy='minority')

X_over3, y_over3 = oversample3.fit_resample(features_lemma_tfidf_nd, data_labels)
X_over3, y_over3 = oversample.fit_resample(X_over3, y_over3)
print(Counter(y_over3))

Counter({0: 6838, -1: 6838, 1: 6838})


In [65]:
X_train_os3, X_test_os3, y_train_os3, y_test_os3  = train_test_split(
        X_over3, 
        y_over3,
        train_size=0.80, 
        random_state=10)

In [93]:
log_model_9 = LogisticRegression(max_iter=1000)
log_model_9 = log_model_9.fit(X_train_os3, y_train_os3)

In [94]:
y_pred3 = log_model_9.predict(X_test_os3)
print(accuracy_score(y_test_os3, y_pred3))
print(classification_report(y_test_os3, y_pred3))

0.7433585181574458
              precision    recall  f1-score   support

          -1       0.77      0.85      0.81      1411
           0       0.68      0.71      0.70      1318
           1       0.78      0.67      0.72      1374

    accuracy                           0.74      4103
   macro avg       0.74      0.74      0.74      4103
weighted avg       0.75      0.74      0.74      4103



In [68]:
oversample4 = RandomOverSampler(sampling_strategy='minority')

X_over4, y_over4 = oversample4.fit_resample(features_stopwords_tfidf_nd, data_labels)
X_over4, y_over4 = oversample.fit_resample(X_over4, y_over4)
print(Counter(y_over4))

Counter({0: 6838, -1: 6838, 1: 6838})


In [69]:
X_train_os4, X_test_os4, y_train_os4, y_test_os4  = train_test_split(
        X_over4, 
        y_over4,
        train_size=0.80, 
        random_state=10)

In [95]:
log_model_10 = LogisticRegression(max_iter=1000)
log_model_10 = log_model_10.fit(X_train_os4, y_train_os4)

In [96]:
y_pred4 = log_model_10.predict(X_test_os4)
print(accuracy_score(y_test_os4, y_pred4))
print(classification_report(y_test_os4, y_pred4))

0.7421398976358762
              precision    recall  f1-score   support

          -1       0.79      0.83      0.81      1411
           0       0.67      0.71      0.69      1318
           1       0.76      0.68      0.72      1374

    accuracy                           0.74      4103
   macro avg       0.74      0.74      0.74      4103
weighted avg       0.74      0.74      0.74      4103



### Support Vector Machine

In [59]:
from sklearn import svm

In [60]:
svm_1 = svm.SVC()
svm_1.fit(X_train_os, y_train_os)

In [61]:
y_pred_svm = svm_1.predict(X_test_os)
print(accuracy_score(y_test_os, y_pred_svm))
print(classification_report(y_test_os, y_pred_svm))

0.7942968559590543
              precision    recall  f1-score   support

          -1       0.87      0.90      0.88      1411
           0       0.68      0.81      0.74      1318
           1       0.86      0.67      0.75      1374

    accuracy                           0.79      4103
   macro avg       0.80      0.79      0.79      4103
weighted avg       0.81      0.79      0.79      4103



In [62]:
svm_2 = svm.SVC()
svm_2.fit(X_train_os2, y_train_os2)

In [63]:
y_pred_svm2 = svm_2.predict(X_test_os2)
print(accuracy_score(y_test_os2, y_pred_svm2))
print(classification_report(y_test_os2, y_pred_svm2))

0.8025834755057275
              precision    recall  f1-score   support

          -1       0.88      0.91      0.89      1411
           0       0.70      0.82      0.75      1318
           1       0.84      0.68      0.75      1374

    accuracy                           0.80      4103
   macro avg       0.81      0.80      0.80      4103
weighted avg       0.81      0.80      0.80      4103



In [66]:
svm_3 = svm.SVC()
svm_3.fit(X_train_os3, y_train_os3)

In [67]:
y_pred_svm3 = svm_3.predict(X_test_os3)
print(accuracy_score(y_test_os3, y_pred_svm3))
print(classification_report(y_test_os3, y_pred_svm3))

0.821593955642213
              precision    recall  f1-score   support

          -1       0.89      0.93      0.91      1411
           0       0.73      0.83      0.77      1318
           1       0.85      0.70      0.77      1374

    accuracy                           0.82      4103
   macro avg       0.82      0.82      0.82      4103
weighted avg       0.83      0.82      0.82      4103



In [70]:
svm_4 = svm.SVC()
svm_4.fit(X_train_os4, y_train_os4)

In [71]:
y_pred_svm4 = svm_4.predict(X_test_os4)
print(accuracy_score(y_test_os4, y_pred_svm4))
print(classification_report(y_test_os4, y_pred_svm4))

0.810138922739459
              precision    recall  f1-score   support

          -1       0.90      0.91      0.91      1411
           0       0.70      0.82      0.76      1318
           1       0.84      0.70      0.76      1374

    accuracy                           0.81      4103
   macro avg       0.81      0.81      0.81      4103
weighted avg       0.82      0.81      0.81      4103



### Naive Bayes with Oversampling

In [74]:
multinomialNB_os = MultinomialNB(force_alpha=True)

multinomialNB_os.fit(X_train_os, y_train_os)

In [75]:
y_pred_nb_os = multinomialNB_os.predict(X_test_os)
print(accuracy_score(y_test_os, y_pred_nb_os))
print(classification_report(y_test_os, y_pred_nb_os))

0.7067999025103583
              precision    recall  f1-score   support

          -1       0.74      0.84      0.78      1411
           0       0.68      0.58      0.62      1318
           1       0.70      0.70      0.70      1374

    accuracy                           0.71      4103
   macro avg       0.70      0.70      0.70      4103
weighted avg       0.70      0.71      0.70      4103



In [76]:
multinomialNB_os2 = MultinomialNB(force_alpha=True)

multinomialNB_os2.fit(X_train_os2, y_train_os2)

In [77]:
y_pred_nb_os2 = multinomialNB_os2.predict(X_test_os2)
print(accuracy_score(y_test_os2, y_pred_nb_os2))
print(classification_report(y_test_os2, y_pred_nb_os2))

0.7029003168413356
              precision    recall  f1-score   support

          -1       0.74      0.81      0.77      1411
           0       0.67      0.59      0.63      1318
           1       0.69      0.70      0.70      1374

    accuracy                           0.70      4103
   macro avg       0.70      0.70      0.70      4103
weighted avg       0.70      0.70      0.70      4103



### Saving models to a File Format using JobLib

In [72]:
import joblib

In [73]:
model_filename = "svm_lemma_os_tfidf.joblib"
vectorizer_filename = "lemma_tfidf_vectorizer.joblib"
joblib.dump(svm_3, "nlp_project/models/" + model_filename)
joblib.dump(vectorizer_tfidf, "nlp_project/models/" + vectorizer_filename)

['nlp_project/models/lemma_tfidf_vectorizer.joblib']

In [None]:
model_filename = "svm_os_tfidf.joblib"
joblib.dump(svm_2, "nlp_project/models/" + model_filename)