In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

#open file
train = pd.read_csv(r'train.csv')

#Look at first 5 lines of data:
#train.head()

#Merge x text and y text together so I can put them in a matrix
train['descriptions'] = train['description_x'].str.cat(train['description_y'],sep=" ")

#making vectors
count_vectorizer = CountVectorizer(ngram_range = (1, 1)) 
#(1, 1 means unigrams; 2, 2 for bigrams, 3, 3 for trigrams)
#Note: if you change the above, restart the kernel before running again or it won't come out right

tokenlinelist = train['descriptions'][0:1713] #split data -- ratio 80:20
X_train = count_vectorizer.fit_transform(tokenlinelist) #bag of words
y_train = train['same_security'][0:1713]

#split test set
tokenlinelist_test = train['descriptions'][1714:2142]
X_test = count_vectorizer.transform(tokenlinelist_test)
y_test = train['same_security'][1714:2142]

#Logistic Regresion:
#train my LR
clf = LogisticRegression().fit(X_train, y_train)
#Make LR predictions
LRpreds = clf.predict(X_test)
#print(LRpreds)

#KNN classifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train, y_train)

knc = neigh.predict(X_test)
#knc

#Gaussian Naive Bayes Classifier
gnb = GaussianNB()
gnbpreds = gnb.fit(X_train.toarray(), y_train).predict(X_test.toarray())
#gnbpreds

#SVM Classifier:
svm = svm.SVC()
svm.fit(X_train, y_train)
svmpreds = svm.predict(X_test)

#LinearSVC
lsvc = make_pipeline(StandardScaler(with_mean = False), LinearSVC(random_state=0, tol=1e-5)) 
#had to add with_mean = False because I'm working with sparse matrix
lsvc.fit(X_train, y_train)

lsvcpreds = lsvc.predict(X_test)

#Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train.toarray(), y_train) #had to add .toarray() because sparse matrix

ldapreds = lda.predict(X_test)

#Multi-Layer Perceptron 
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=333)
mlp.fit(X_train, y_train)

mlppreds = mlp.predict(X_test)

#Wrangling predictions into one dataframe
#Contains test set true classes and my various predicted classes
predsdf = pd.DataFrame(LRpreds)
truedf = pd.DataFrame(y_test)
truedf = truedf.reset_index()
knndf = pd.DataFrame(knc)
gnbdf = pd.DataFrame(gnbpreds)
lsvcdf = pd.DataFrame(lsvcpreds)
predsdf['true'] = truedf['same_security']
predsdf = predsdf.rename(columns={0: 'LRpreds'})
knndf = knndf.rename(columns={0:'KNNpreds'})
svmdf = pd.DataFrame(svmpreds)
ldadf = pd.DataFrame(ldapreds)
mlpdf = pd.DataFrame(mlppreds)
predsdf['LRcorrect'] = predsdf['LRpreds'] == predsdf['true']
predsdf['KNNpreds'] = knndf
predsdf['KNNcorrect'] = predsdf['KNNpreds'] == predsdf['true']
predsdf['GNBpreds'] = gnbdf
predsdf['GNBcorrect'] = predsdf['GNBpreds'] == predsdf['true']
predsdf['SVMpreds'] = svmdf
predsdf['SVMcorrect'] = predsdf['SVMpreds'] == predsdf['true']
predsdf['LinearSVCpreds'] = lsvcdf
predsdf['LinearSVCcorrect'] = predsdf['LinearSVCpreds'] == predsdf['true']
predsdf['LDApreds'] = ldadf
predsdf['LDAcorrect'] = predsdf['LDApreds'] == predsdf['true']
predsdf['MLPpreds'] = mlpdf
predsdf['MLPcorrect'] = predsdf['MLPpreds'] == predsdf['true']
#predsdf.head()

#combining all prediction measures together:
#If more than half the measures say true, then make a true prediction; same applies to false
combodf = predsdf[['KNNpreds', 'GNBpreds', 'SVMpreds', 'LinearSVCpreds', 'LDApreds', 'MLPpreds']]
combosumpreds = combodf.sum(axis=1)
sumdf = pd.DataFrame(combosumpreds)
combodf['sum'] = sumdf
combodf['sumpreds'] = np.where(combodf['sum'] >= 3, True, False)
#combodf
df7 = combodf.join(predsdf['true'])
df7['sumpredscorrect'] = df7['sumpreds'] == df7['true']
#df7
#print(df7['sumpredscorrect'])


print('Results for Bag of Words:')
print("Logistic Regression Percent Correct: ", predsdf.LRcorrect.sum()/len(predsdf)*100)
print("K Nearest Neighbor Percent Correct: ", predsdf.KNNcorrect.sum()/len(predsdf)*100)
print("Gaussian Naive Bayes Percent Correct: ", predsdf.GNBcorrect.sum()/len(predsdf)*100)
print("Support Vector Machine Percent Correct: ", predsdf.SVMcorrect.sum()/len(predsdf)*100)
print("Linear SVC Percent Correct: ", predsdf.LinearSVCcorrect.sum()/len(predsdf)*100)
print("Linear Discriminant Analysis Percent Correct: ", predsdf.LDAcorrect.sum()/len(predsdf)*100)
print("Multi-Layer Perceptron Percent Correct: ", predsdf.MLPcorrect.sum()/len(predsdf)*100)
print("\n")
print("Accuracy of all of the above combined: ", df7.sumpredscorrect.sum()/len(df7)*100)



Results for Bag of Words:
Logistic Regression Percent Correct:  91.82242990654206
K Nearest Neighbor Percent Correct:  91.58878504672897
Gaussian Naive Bayes Percent Correct:  77.33644859813083
Support Vector Machine Percent Correct:  93.69158878504673
Linear SVC Percent Correct:  83.8785046728972
Linear Discriminant Analysis Percent Correct:  90.42056074766354
Multi-Layer Perceptron Percent Correct:  92.5233644859813


Accuracy of all of the above combined:  92.99065420560748


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combodf['sum'] = sumdf
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combodf['sumpreds'] = np.where(combodf['sum'] >= 3, True, False)


In [1]:
#TF-IDF: unigram, bigram, and trigram
#Same code as above with some tf-idf stuff in it
#Again, may want to restart kernel before running

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

#open file
train = pd.read_csv(r'train.csv')

#Look at first 5 lines of data:
#train.head()

#Merge x text and y text together so I can put them in a matrix
train['descriptions'] = train['description_x'].str.cat(train['description_y'],sep=" ")

#making vectors
count_vectorizer = CountVectorizer(ngram_range = (1, 1))
#above line: 1, 1 = unigram, 2, 2 = bigram, 3, 3, = trigram
#Note: restart kernel and run again when changing
tfidf_transformer = TfidfTransformer()

tokenlinelist = train['descriptions'][0:1713] #split data -- ratio 80:20
count_matrix = count_vectorizer.fit_transform(tokenlinelist) #bag of words
X_train = tfidf_transformer.fit_transform(count_matrix) #TF-IDF
y_train = train['same_security'][0:1713]

#split test set
tokenlinelist_test = train['descriptions'][1714:2142]
count_matrix_test = count_vectorizer.transform(tokenlinelist_test)
X_test = tfidf_transformer.fit_transform(count_matrix_test)
y_test = train['same_security'][1714:2142]

#Logistic Regresion:
#train my LR
clf = LogisticRegression().fit(X_train, y_train)
#Make LR predictions
LRpreds = clf.predict(X_test)
#print(LRpreds)

#KNN classifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train, y_train)

knc = neigh.predict(X_test)
#knc

#Gaussian Naive Bayes Classifier
gnb = GaussianNB()
gnbpreds = gnb.fit(X_train.toarray(), y_train).predict(X_test.toarray())
#gnbpreds

#SVM Classifier:
svm = svm.SVC()
svm.fit(X_train, y_train)
svmpreds = svm.predict(X_test)

#LinearSVC
lsvc = make_pipeline(StandardScaler(with_mean = False), LinearSVC(random_state=0, tol=1e-5)) 
#had to add with_mean = False because I'm working with sparse matrix
lsvc.fit(X_train, y_train)

lsvcpreds = lsvc.predict(X_test)

#Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train.toarray(), y_train) #had to add .toarray() because sparse matrix

ldapreds = lda.predict(X_test)

#Multi-Layer Perceptron 
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=333)
mlp.fit(X_train, y_train)

mlppreds = mlp.predict(X_test)

#Wrangling predictions into one dataframe
#Contains test set true classes and my various predicted classes
predsdf = pd.DataFrame(LRpreds)
truedf = pd.DataFrame(y_test)
truedf = truedf.reset_index()
knndf = pd.DataFrame(knc)
gnbdf = pd.DataFrame(gnbpreds)
lsvcdf = pd.DataFrame(lsvcpreds)
predsdf['true'] = truedf['same_security']
predsdf = predsdf.rename(columns={0: 'LRpreds'})
knndf = knndf.rename(columns={0:'KNNpreds'})
svmdf = pd.DataFrame(svmpreds)
ldadf = pd.DataFrame(ldapreds)
mlpdf = pd.DataFrame(mlppreds)
predsdf['LRcorrect'] = predsdf['LRpreds'] == predsdf['true']
predsdf['KNNpreds'] = knndf
predsdf['KNNcorrect'] = predsdf['KNNpreds'] == predsdf['true']
predsdf['GNBpreds'] = gnbdf
predsdf['GNBcorrect'] = predsdf['GNBpreds'] == predsdf['true']
predsdf['SVMpreds'] = svmdf
predsdf['SVMcorrect'] = predsdf['SVMpreds'] == predsdf['true']
predsdf['LinearSVCpreds'] = lsvcdf
predsdf['LinearSVCcorrect'] = predsdf['LinearSVCpreds'] == predsdf['true']
predsdf['LDApreds'] = ldadf
predsdf['LDAcorrect'] = predsdf['LDApreds'] == predsdf['true']
predsdf['MLPpreds'] = mlpdf
predsdf['MLPcorrect'] = predsdf['MLPpreds'] == predsdf['true']
#predsdf.head()

#combining all prediction measures together:
#If more than half the measures say true, then make a true prediction; same applies to false
combodf = predsdf[['KNNpreds', 'GNBpreds', 'SVMpreds', 'LinearSVCpreds', 'LDApreds', 'MLPpreds']]
combosumpreds = combodf.sum(axis=1)
sumdf = pd.DataFrame(combosumpreds)
combodf['sum'] = sumdf
combodf['sumpreds'] = np.where(combodf['sum'] >= 3, True, False)
#combodf
df7 = combodf.join(predsdf['true'])
df7['sumpredscorrect'] = df7['sumpreds'] == df7['true']
#df7
#print(df7['sumpredscorrect'])


print('Results for TF-IDF:')
print("Logistic Regression Percent Correct: ", predsdf.LRcorrect.sum()/len(predsdf)*100)
print("K Nearest Neighbor Percent Correct: ", predsdf.KNNcorrect.sum()/len(predsdf)*100)
print("Gaussian Naive Bayes Percent Correct: ", predsdf.GNBcorrect.sum()/len(predsdf)*100)
print("Support Vector Machine Percent Correct: ", predsdf.SVMcorrect.sum()/len(predsdf)*100)
print("Linear SVC Percent Correct: ", predsdf.LinearSVCcorrect.sum()/len(predsdf)*100)
print("Linear Discriminant Analysis Percent Correct: ", predsdf.LDAcorrect.sum()/len(predsdf)*100)
print("Multi-Layer Perceptron Percent Correct: ", predsdf.MLPcorrect.sum()/len(predsdf)*100)
print("\n")
print("Accuracy of all of the above combined: ", df7.sumpredscorrect.sum()/len(df7)*100)



Results for TF-IDF:
Logistic Regression Percent Correct:  91.1214953271028
K Nearest Neighbor Percent Correct:  90.18691588785047
Gaussian Naive Bayes Percent Correct:  77.57009345794393
Support Vector Machine Percent Correct:  92.99065420560748
Linear SVC Percent Correct:  82.94392523364486
Linear Discriminant Analysis Percent Correct:  90.88785046728972
Multi-Layer Perceptron Percent Correct:  92.05607476635514


Accuracy of all of the above combined:  92.99065420560748


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combodf['sum'] = sumdf
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combodf['sumpreds'] = np.where(combodf['sum'] >= 3, True, False)


In [3]:
#Tensor Flow RNN
#https://github.com/MGCodesandStats/tensorflow-nlp/blob/master/spam%20detection%20tensorflow%20v1.ipynb
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

dataset = pd.read_csv(r'train.csv')

dataset['descriptions'] = dataset['description_x'].str.cat(dataset['description_y'],sep=" ")

sentences = dataset['descriptions'].tolist()
labels = dataset['same_security'].tolist() # Separate out the sentences and labels into training and validation sets
training_sentences = sentences[0:1285]
testing_sentences = sentences[1286:1713]
training_labels = labels[0:1285]
testing_labels = labels[1286:1713] # Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

#tweaked these several times; these settings kind of got the best accuracy
vocab_size = 5000 
embedding_dim = 512
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

#tokenizing and making vectors

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)

padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,  truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, 
                               padding=padding_type, truncating=trunc_type)

#making the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

#10 and 30 epochs did worse
num_epochs = 20
history=model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

# Create the sequences
pred_sentences = sentences[1714:2142]


# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(pred_sentences)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)           

#make prediction
pred_classes = model.predict(fakes_padded)

# The closer the class is to 1, the more likely that the message is spam
tfrnndf = pd.DataFrame(pred_classes)
tfrnndf['TFRNNpreds'] = np.where(tfrnndf[0] >= .9, True, False)
tfrnndf['TFRNNcorrect'] = tfrnndf['TFRNNpreds'] == predsdf['true'] #FIX so it works independently
print("Tensorflow RNN Percent Correct: ", tfrnndf.TFRNNcorrect.sum()/len(tfrnndf)*100)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 512)          2560000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 51200)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 307206    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 2,867,213
Trainable params: 2,867,213
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tensorflow 