In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold
from collections import Counter

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ahlaam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
np.random.seed(42)

In [3]:
url_data = "https://github.com/ahlraf/btp-transfer-learning/blob/master/processed-data/suicide_vs_depression.csv?raw=true"
df = pd.read_csv(url_data, header='infer', skip_blank_lines=True, encoding="utf-8")

In [5]:
# preprocessing
df['text'].dropna(inplace=True)

df["text"] = [x.lower() for x in df["text"]]  # lowering
df["text"] = [word_tokenize(x) for x in df["text"]]  # tokenization

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(df['text']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # check for stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word_final)
    # final processed set of words for each iteration stored in 'text_final'
    df.loc[index,'text_final'] = str(final_words)

In [6]:
# 5 fold cross validation
from sklearn.model_selection import KFold, StratifiedKFold

n = 5
skf = StratifiedKFold(n_splits=n)
x = df["text_final"].to_numpy()
y = df["label"].to_numpy()

folds = []

for train_index, test_index in skf.split(x,y):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    folds.append([x_train, x_test, y_train, y_test])

In [13]:
encoder = LabelEncoder()

In [18]:
print("NAIVE BAYES PERFORMANCE:\n")
for fold_i in range(len(folds)):
    print("\n------------------\nFold", fold_i+1, "\n")
    train_x, test_x, train_y, test_y = folds[fold_i][0], folds[fold_i][1], folds[fold_i][2], folds[fold_i][3]
    print(Counter(train_y))
    train_y = encoder.fit_transform(train_y)
    test_y = encoder.fit_transform(test_y)
    tfidf_vect = TfidfVectorizer(max_features=5000)
    tfidf_vect.fit(df['text_final'])
    train_x_tfidf = tfidf_vect.transform(train_x)
    test_x_tfidf = tfidf_vect.transform(test_x)

    # fit the training dataset on the NB classifier
    nb = naive_bayes.MultinomialNB()
    nb.fit(train_x_tfidf,train_y)
    # predict the labels on validation dataset
    predictions_nb = nb.predict(test_x_tfidf)

    print("Classification report:\n", classification_report(predictions_nb, test_y))


NAIVE BAYES PERFORMANCE:


------------------
Fold 1 

Counter({0: 784, 1: 779})
Classification report:
               precision    recall  f1-score   support

           0       0.83      0.73      0.78       222
           1       0.70      0.80      0.75       169

    accuracy                           0.76       391
   macro avg       0.76      0.77      0.76       391
weighted avg       0.77      0.76      0.77       391


------------------
Fold 2 

Counter({0: 784, 1: 779})
Classification report:
               precision    recall  f1-score   support

           0       0.69      0.73      0.71       186
           1       0.74      0.71      0.72       205

    accuracy                           0.72       391
   macro avg       0.72      0.72      0.72       391
weighted avg       0.72      0.72      0.72       391


------------------
Fold 3 

Counter({0: 784, 1: 779})
Classification report:
               precision    recall  f1-score   support

           0       0.83     

In [19]:
print("--------------------------------------")
print("\nSVM PERFORMANCE:\n")

for fold_i in range(len(folds)):
    print("\n------------------\nFold", fold_i+1, "\n")
    train_x, test_x, train_y, test_y = folds[fold_i][0], folds[fold_i][1], folds[fold_i][2], folds[fold_i][3]
    print(Counter(train_y))

    train_y = encoder.fit_transform(train_y)
    test_y = encoder.fit_transform(test_y)
    tfidf_vect = TfidfVectorizer(max_features=5000)
    tfidf_vect.fit(df['text_final'])
    train_x_tfidf = tfidf_vect.transform(train_x)
    test_x_tfidf = tfidf_vect.transform(test_x)

    # fit the training dataset on the classifier
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(train_x_tfidf, train_y)
    # predict the labels on validation dataset
    predictions_SVM = SVM.predict(test_x_tfidf)
    # Use accuracy_score function to get the accuracy
    print("SVM performance:")
    print("\nClassification report:\n", classification_report(predictions_SVM, test_y))

--------------------------------------

SVM PERFORMANCE:


------------------
Fold 1 

Counter({0: 784, 1: 779})
SVM performance:

Classification report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74       195
           1       0.74      0.74      0.74       196

    accuracy                           0.74       391
   macro avg       0.74      0.74      0.74       391
weighted avg       0.74      0.74      0.74       391


------------------
Fold 2 

Counter({0: 784, 1: 779})
SVM performance:

Classification report:
               precision    recall  f1-score   support

           0       0.66      0.73      0.70       177
           1       0.76      0.69      0.72       214

    accuracy                           0.71       391
   macro avg       0.71      0.71      0.71       391
weighted avg       0.72      0.71      0.71       391


------------------
Fold 3 

Counter({0: 784, 1: 779})
SVM performance:

Classification report