Importing the Dependencies Libraries

In [1]:
!pip install nltk




[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef

Printing List of the Stopwords in English

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-Processing

In [4]:
# Load Dataset from CSV
Stres_Data = pd.read_csv('D:\\Semester 4\\Natural Language Processing\\Lab Praktikum\\Research Methodology Project\\Stress.csv', encoding='ISO-8859-1')
Stres_Data.shape

(2838, 7)

In [5]:
Stres_Data.head()

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005


In [6]:
# Check Missing Value
Stres_Data.isnull().sum()

subreddit           0
post_id             0
sentence_range      0
text                0
label               0
confidence          0
social_timestamp    0
dtype: int64

In [7]:
# Checking Distribution of label index [Stres Level Index]
Stres_Data['label'].value_counts()

label
1    1488
0    1350
Name: count, dtype: int64

0 --> Non-Stres Text

1 --> Stres Text

The Data Imbalanced, so we need to do Upsampling or Downsampling

In [8]:
# UpSampling
from sklearn.utils import resample

One_Sample = Stres_Data[Stres_Data['label'] == 1]
Zero_Sample = Stres_Data[Stres_Data['label'] == 0]
print("Before Resample [UpSampling]")
print(One_Sample.shape)
print(Zero_Sample.shape)

Zero_Sample = resample(Zero_Sample, replace=True, n_samples=len(One_Sample), random_state=42)
print("After Resample [UpSampling]")
print(One_Sample.shape)
print(Zero_Sample.shape)

Before Resample [UpSampling]
(1488, 7)
(1350, 7)
After Resample [UpSampling]
(1488, 7)
(1488, 7)


In [9]:
df_1 = pd.DataFrame(One_Sample)
df_2 = pd.DataFrame(Zero_Sample)

Stres_Data = pd.concat([df_1, df_2])
Stres_Data['label'].value_counts()

label
1    1488
0    1488
Name: count, dtype: int64

Stemming --> Process to Reduce a word into its root word

Example like
actor, actress, acting --> act

In [10]:
Porter_Stemmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) # Matching Pattern by removing everything except a-z and A-Z
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [Porter_Stemmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)

    return stemmed_content

In [11]:
Stres_Data['Stemmed_Text'] = Stres_Data['text'].apply(stemming)

In [12]:
Stres_Data.head()

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp,Stemmed_Text
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353,said felt way sugget go rest trigger ahead you...
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605,mom hit newspap shock would know like play hit...
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555,met new boyfriend amaz kind sweet good student...
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005,octob domest violenc awar month domest violenc...
5,relationships,7tx7et,"(30, 35)",I think he doesn't want to put in the effort f...,1,1.0,1517274027,think want put effort relationship work diffic...


In [13]:
# Separating Text and Label
X = Stres_Data['Stemmed_Text'].values
Y = Stres_Data['label'].values

print(X)
print(Y)

['said felt way sugget go rest trigger ahead youi hypocondriac like decid look feel doom hope mayb get suck rabbit hole ludicr conspiraci stupid psychic test new age b someth could even laugh road end read sens doom indic variou health ailment one prone top doom gloom f n worri heart happen physic hour'
 'mom hit newspap shock would know like play hit smack strike hit violenc sort person send vibe ask univers yesterday decid take friend go help anoth friend move new place drive friend move strike shoulder address immedi th time told thing friend drive nearli get collis anoth car think high marijuana friend move backseat like understand tri get attent know thing year old get peopl attent smack guy'
 'met new boyfriend amaz kind sweet good student like thing famili like dont feel passion rush felt ex truth start go boyfriend secretli saw ex time see realli didnt feel noth disgust didnt even want touch feel bad didnt want still kinda realiz felt noth love relat ok hurt knew date boy even 

Splitting the data to training data and test data

In [14]:
X_Train, X_test, Y_Train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

Converting Textual Data into Numerical Data

In [15]:
Vectorizer = TfidfVectorizer()

X_String = X_test
X_Train = Vectorizer.fit_transform(X_Train)
X_test = Vectorizer.transform(X_test)

Training The Machine Learning Model [Logistic Regression, Random Forest, Decision Tree]

In [17]:
model = LogisticRegression(max_iter=1000)
model1 = RandomForestClassifier()
model2 = DecisionTreeClassifier()

In [18]:
model.fit(X_Train, Y_Train)
model1.fit(X_Train, Y_Train)
model2.fit(X_Train, Y_Train)

Model Evaluation

Logistic Regression

In [19]:
X_Train_Prediction = model.predict(X_Train)
Training_Data_Accuracy = accuracy_score(Y_Train, X_Train_Prediction)
print('Accuracy Score on the Training Data: ', Training_Data_Accuracy)

Accuracy Score on the Training Data:  0.9155462184873949


In [None]:
X_Test_Prediction = model.predict(X_test)
Test_Data_Accuracy = accuracy_score(Y_test, X_Test_Prediction)
print('Accuracy Score on the Test Data: ', Test_Data_Accuracy)

Accuracy Score on the Test Data:  0.7818791946308725


In [None]:
print(classification_report(Y_test, X_Test_Prediction))
print(confusion_matrix(Y_test, X_Test_Prediction))
print(matthews_corrcoef(Y_test, X_Test_Prediction))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78       298
           1       0.78      0.79      0.78       298

    accuracy                           0.78       596
   macro avg       0.78      0.78      0.78       596
weighted avg       0.78      0.78      0.78       596

[[231  67]
 [ 63 235]]
0.5638091828819275


Random Forest

In [20]:
X_Train_RFPrediction = model1.predict(X_Train)
Training_Data_RFAccuracy = accuracy_score(Y_Train, X_Train_RFPrediction)
print('Accuracy Score on the Training Data: ', Training_Data_RFAccuracy)

Accuracy Score on the Training Data:  0.9995798319327731


In [21]:
X_Test_RFPrediction = model1.predict(X_test)
Test_Data_RFAccuracy = accuracy_score(Y_test, X_Test_RFPrediction)
print('Accuracy Score on the Test Data: ', Test_Data_RFAccuracy)

Accuracy Score on the Test Data:  0.8120805369127517


In [22]:
print(classification_report(Y_test, X_Test_RFPrediction))
print(confusion_matrix(Y_test, X_Test_RFPrediction))
print(matthews_corrcoef(Y_test, X_Test_RFPrediction))

              precision    recall  f1-score   support

           0       0.87      0.73      0.80       298
           1       0.77      0.89      0.83       298

    accuracy                           0.81       596
   macro avg       0.82      0.81      0.81       596
weighted avg       0.82      0.81      0.81       596

[[219  79]
 [ 33 265]]
0.6317328381028974


Decision Tree Classifier

In [23]:
X_Train_DTPrediction = model2.predict(X_Train)
Training_Data_DTAccuracy = accuracy_score(Y_Train, X_Train_DTPrediction)
print('Accuracy Score on the Training Data: ', Training_Data_DTAccuracy)

Accuracy Score on the Training Data:  0.9995798319327731


In [24]:
X_Test_DTPrediction = model2.predict(X_test)
Test_Data_DTAccuracy = accuracy_score(Y_test, X_Test_DTPrediction)
print('Accuracy Score on the Test Data: ', Test_Data_DTAccuracy)

Accuracy Score on the Test Data:  0.7567114093959731


In [25]:
print(classification_report(Y_test, X_Test_DTPrediction))
print(confusion_matrix(Y_test, X_Test_DTPrediction))
print(matthews_corrcoef(Y_test, X_Test_DTPrediction))

              precision    recall  f1-score   support

           0       0.74      0.80      0.77       298
           1       0.78      0.71      0.75       298

    accuracy                           0.76       596
   macro avg       0.76      0.76      0.76       596
weighted avg       0.76      0.76      0.76       596

[[238  60]
 [ 85 213]]
0.5152391393659149


Saving Trained-Model

In [None]:
# import pickle

In [None]:
# filename = 'Logistic_Regression.sav'
# pickle.dump(model, open(filename, 'wb'))

Using the saved model

In [None]:
# load_model = pickle.load(open('D:\\Semester 4\\Natural Language Processing\\Lab Praktikum\\Research Methodology Project\\Logistic_Regression.sav', 'rb'))

In [None]:
# X_new = X_test[3]
# print(X_String[3])

# prediction = model.predict(X_new)
# print(prediction)

# if(prediction[0] == 0):
#     print("Non-Stress Text")
# else:
#     print("Stress Text")