In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
sns.set_style("white")

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# IF YOU ARE MISSING "WordCloud":
# TRY INSTALLING VIA TERMINAL LIKE THIS: /anaconda3/bin/python -m pip install wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
# IF YOU ARE MISSING "wordninja":
# TRY INSTALLING VIA TERMINAL LIKE THIS: pip install wordninja
import wordninja

In [2]:
# Read the dataset CSV (cp1252-encoded) and preview its last three rows.
model_data = pd.read_csv(
    "../../mentalhealth.csv",
    encoding='cp1252',
)
model_data.tail(3)

Unnamed: 0,Text,MH
318,sad,1
319,sad,1
320,sad,1


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
# Single shared Porter stemmer instance; stemSentence() reuses it for every token.
porter = PorterStemmer()
def stemSentence(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem.

    The text is split with ``word_tokenize`` and each token is stemmed by the
    module-level ``porter`` instance. Every stem is followed by one space, so
    a non-empty result always ends with a trailing space.
    """
    return "".join(porter.stem(token) + " " for token in word_tokenize(sentence))

# Replace Text with its stemmed form and keep only the text and label columns.
model_data = model_data.assign(Text=model_data['Text'].apply(stemSentence))[['Text', 'MH']]

# Evaluating models

In [4]:
# Check the scores of the optimised TF-IDF + Multinomial Naive Bayes model on test data.

# Features are the stemmed text; the target is the MH label.
X = model_data["Text"]
y = model_data['MH']

# Stratified 75/25 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Fit the vectorizer on the training text only, then reuse it to transform the test text.
# The TF-IDF matrices are kept sparse: MultinomialNB accepts sparse input directly,
# whereas `.todense()` returns an `np.matrix`, which recent scikit-learn releases reject.
tvec_optimised = TfidfVectorizer(max_df=0.5, max_features=70, min_df=2, ngram_range=(1, 3), stop_words='english')
X_train_tvec = tvec_optimised.fit_transform(X_train)
X_test_tvec = tvec_optimised.transform(X_test)

# Accuracy score on the test data.
nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)
accuracy = nb.score(X_test_tvec, y_test)

# Area under the ROC curve, scored with the predicted probability in column 1
# of predict_proba (the second entry of nb.classes_).
pred_proba = nb.predict_proba(X_test_tvec)[:, 1]
auc = roc_auc_score(y_test, pred_proba)

print("ACCURACY: {}\nAUC SCORE: {}".format(accuracy, auc))

ACCURACY: 0.8641975308641975
AUC SCORE: 0.892219387755102


# Test weights

In [5]:
# Try the final model on one hand-written sentence.
inputtext="my daughter has depressed and needs counselling"
# Stem once and reuse the result; the training text went through the same stemSentence() step.
inputtext_stemmed = stemSentence(inputtext)
print(inputtext_stemmed)

# Refit the vectorizer and a fresh Naive Bayes model on the full dataset (no held-out split).
# NOTE: this refits `tvec_optimised` in place, replacing whatever it learned from X_train.
# The matrix is kept sparse: MultinomialNB accepts sparse input directly, whereas
# `.todense()` returns an `np.matrix`, which recent scikit-learn releases reject.
X1 = tvec_optimised.fit_transform(X)
nb1 = MultinomialNB()
nb1.fit(X1, y)

# Predicted probability in column 1 of predict_proba for the example sentence.
nb1.predict_proba(tvec_optimised.transform([inputtext_stemmed]))[0][1]

my daughter ha depress and need counsel 


0.858250349554042

# Save weights

In [None]:
import pickle

# Persist the fitted classifier and its vectorizer. The `with` blocks guarantee each
# file is flushed and closed even if pickling raises, instead of leaking the handle.
with open('mentalhealth2', 'wb') as model_file:
    pickle.dump(nb1, model_file)
with open("tvec2", "wb") as vectorizer_file:
    pickle.dump(tvec_optimised, vectorizer_file)