In [3]:
from sklearn.metrics import accuracy_score,precision_score
from sklearn.svm import LinearSVC
import numpy as np
import os
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import CountVectorizer
import re


In [4]:

def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    text= re.sub("<synch>","",text)
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text


In [5]:
X=pd.read_csv('foo.csv')
print (X)

#test train split
msk = np.random.rand(len(X)) < 0.7
print(msk)
train = X[msk]
test = X[~msk]


     speaker                                              value  PHQ8
0        303  okay how 'bout yourself here in california yea...     0
1        304  i'm doing good um from los angeles california ...     0
2        305  i'm doing alright uh originally i'm from calif...     0
3        310  yes it's okay <laughter> fine <laughter> i liv...     0
4        312  yes fine how about you here yes the weather we...     0
..       ...                                                ...   ...
102      485  <synch> yes i'm not bad i'm a little tired but...     0
103      486  <synch> yes i'm feel great i am from saint lou...     0
104      487  <synch> yes i'm fine thank you detroit michiga...     0
105      488  <synch> yes fine oh san fernando valley uh wel...     0
106      491  <synch> yes huh overwhelmed i have a funeral t...     0

[107 rows x 3 columns]
[False  True  True  True  True  True  True  True False  True  True  True
  True False False False  True  True  True  True  True  True  T

In [61]:
train

Unnamed: 0,speaker,value,PHQ8
1,304,i'm doing good um.from los angeles california....,0
2,305,i'm doing alright.uh originally i'm from calif...,0
4,312,yes.fine how about you.here.yes.the weather.we...,0
6,315,alright.yes.okay and you.inglewood california....,0
7,316,yes.i'm fine.new york.uh for business.now um i...,0
...,...,...,...
101,479,<synch>.yes.okay.l_a.uh the smog.traffic ticke...,0
102,485,<synch>.yes.i'm not bad i'm a little tired but...,0
103,486,<synch>.yes.i'm feel great.i am from saint lou...,0
105,488,<synch>.yes.fine.oh san fernando valley.uh wel...,0


In [63]:
print(len(train))
print("---------------------------------------------")
print(len(test))

67
---------------------------------------------
40


# Support Vector Machine

In [65]:
#Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=clean_text)

training_features = vectorizer.fit_transform(train["value"])    
test_features = vectorizer.transform(test["value"])

# Training
model = LinearSVC()
model.fit(training_features, train["PHQ8"])
y_pred = model.predict(test_features)


In [78]:
training_features

<67x30554 sparse matrix of type '<class 'numpy.float64'>'
	with 54542 stored elements in Compressed Sparse Row format>

In [66]:
acc = accuracy_score(test["PHQ8"], y_pred)
print(acc)
print('Precision score: ', precision_score(test['PHQ8'], y_pred))

0.75
Precision score:  0.75


In [67]:
print(y_pred)

[0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(lowercase=True,stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))
training_features = vectorizer.fit_transform(train["value"])    
test_features = vectorizer.transform(test["value"])

# Training
model = LinearSVC()
model.fit(training_features, train["PHQ8"])
y_pred = model.predict(test_features)

In [69]:
acc = accuracy_score(test["PHQ8"], y_pred)
print(acc)

0.7


# Naive Bayessian

In [70]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

mnb = MultinomialNB()
mnb.fit(training_features, train["PHQ8"]) 
  
# making predictions on the testing set 
y_pred = mnb.predict(test_features)  

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Accuracy score: ', accuracy_score(test["PHQ8"], y_pred))
print('Precision score: ', precision_score(test['PHQ8'], y_pred))


Accuracy score:  0.7
Precision score:  0.0


  'precision', 'predicted', average, warn_for)


In [72]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [73]:
print(test['PHQ8'])

0      0
3      0
5      0
9      0
13     0
16     0
17     0
18     0
22     1
23     1
28     1
33     1
40     1
42     0
44     0
45     0
46     0
47     0
48     1
50     0
53     1
59     0
61     0
63     1
64     0
65     1
66     1
67     0
71     0
73     0
76     0
77     1
81     0
83     0
85     0
88     0
89     0
92     1
94     0
104    0
Name: PHQ8, dtype: int64


# Random Forest

In [74]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(training_features, train["PHQ8"])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [75]:
y_pred = clf.predict(test_features)

In [76]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [77]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Accuracy score: ', accuracy_score(test["PHQ8"], y_pred))
print('Precision score: ', precision_score(test['PHQ8'], y_pred))

Accuracy score:  0.725
Precision score:  1.0


# LSTM

In [88]:
!pip install tensorflow==2.2.0

^C


In [89]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000

# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(train['value'], maxlen=max_review_length)
X_test = sequence.pad_sequences(test['value'], maxlen=max_review_length)
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model


ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [None]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))