# Predictions
This notebook takes a user's text, preprocesses it, and passes it to a saved model to make a prediction.

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# set configurations
pd.set_option('display.max_columns', 100)
sns.set_style("white")

# model imports
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
import pickle
import joblib

# NLP Imports
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from PIL import Image
import wordninja

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayaanhaque/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayaanhaque/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def processing_text(series_to_process):
    """Clean every text in ``series_to_process`` and return the cleaned strings.

    Each item is lower-cased, split into word tokens (punctuation is dropped
    by the tokenizer pattern), lemmatized with the WordNet lemmatizer,
    filtered against NLTK's English stop-word list, and re-joined with single
    spaces.

    Parameters
    ----------
    series_to_process : iterable of str
        The raw texts, e.g. a ``pandas.Series`` or a list of strings.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. An empty input
        yields an empty list.
    """
    tokenizer = RegexpTokenizer(r'(\w+)')
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once: set membership is O(1) and this avoids
    # calling stopwords.words() again for every single token.
    english_stop_words = set(stopwords.words("english"))

    cleaned_texts = []
    # Iterate over the values rather than positions so that a Series with a
    # non-default index (e.g. after filtering) is handled as well.
    for raw_text in series_to_process:
        # words_only is a list of only the words, no punctuation
        words_only = tokenizer.tokenize(raw_text.lower())
        # lemmatize first, then drop stop words (same order as before)
        lemmas = [lemmatizer.lemmatize(word) for word in words_only]
        kept_words = [word for word in lemmas if word not in english_stop_words]
        cleaned_texts.append(" ".join(kept_words))

    # Return only after all items have been processed; the previous version
    # returned from inside the loop and therefore cleaned just the first item.
    return cleaned_texts

In [41]:
# Sample post used to exercise the preprocessing step.
text = 'Hello. Depression has always been a secondary problem for me, with my main antagonist being severe Harm OCD. But since my relationship ended 8 months ago, I have been stuck in this horrific cycle of absolutely loathing myself, feeling heavy/tired and totally unmotivated to do anything. It is like I am living in a 2 dimensional world. Nothing in life jumps out and catches my attention like it used to. I used to be quite creative but it is just taken a nose dive. Any work I do is utterly awful and I am amazed I am not been kicked off projects (I work freelance). I wake up and I just want to be dead, quite honestly. In fact in the last few weeks I have even found getting out of bed to be a monumental struggle in itself, where I am almost in tears from the weight of everything.'

# Wrap the raw string in a one-element Series and show what is passed to the cleaner.
text_array = pd.Series(text)
print(type(text_array), text_array, sep="\n")

# Clean the text, then wrap the cleaned result in a Series again and show it.
processed_text = processing_text(text_array)
processed_array = pd.Series(processed_text)
print(processed_array)

<class 'pandas.core.series.Series'>
0    Hello. Depression has always been a secondary ...
dtype: object
0    hello depression ha always secondary problem m...
dtype: object


In [46]:
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code - only load model files that come from a trusted source.
model = joblib.load("model2.h5")

# FIXME(review): this vectorizer is fitted on the single input text, so its
# vocabulary and IDF weights come from this text alone rather than from the
# data the model was trained on. The 70 columns produced here are therefore
# not guaranteed to correspond to the model's features. The vectorizer fitted
# at training time should be persisted, loaded here, and used with
# .transform() only.
tvec_optimised = TfidfVectorizer(max_features=70, ngram_range=(1, 3), stop_words='english')

# .toarray() gives a plain ndarray; .todense() gives np.matrix, which recent
# scikit-learn versions no longer accept as estimator input.
processed_text_tvec = tvec_optimised.fit_transform(processed_array).toarray()

# Predict the label for the (single) processed text and show it.
prediction = model.predict(processed_text_tvec)
print(prediction[0])

1


In [14]:
# NOTE(review): model_data is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. The training frame has to be
# loaded in an earlier cell before this one can run.
X = model_data["megatext_clean"]
print(X)
y = model_data['is_suicide']

# train/test split (stratified on the label, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Fit the TF-IDF vocabulary on the training split only, then reuse it unchanged
# for the test split. Note that this rebinds the notebook-level name
# tvec_optimised.
tvec_optimised = TfidfVectorizer(max_df=0.5, max_features=70, min_df=2, ngram_range=(1, 3), stop_words='english')
# .toarray() gives plain ndarrays; .todense() gives np.matrix, which recent
# scikit-learn versions no longer accept as estimator input.
X_train_tvec = tvec_optimised.fit_transform(X_train).toarray()
X_test_tvec = tvec_optimised.transform(X_test).toarray()

['hello depression ha always secondary problem main antagonist severe harm ocd since relationship ended 8 month ago stuck horrific cycle absolutely loathing feeling heavy tired totally unmotivated anything like living 2 dimensional world nothing life jump catch attention like used used quite creative taken nose dive work utterly awful amazed kicked project work freelance wake want dead quite honestly fact last week even found getting bed monumental struggle almost tear weight everything']


NameError: name 'model_data' is not defined