In [2]:
#import necessary libraries 

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [3]:
#import our data 

email_data = pd.read_csv('spam_data/email-spam/email.csv', encoding='latin1')
sms_data = pd.read_csv('spam_data/sms-spam/spam.csv', encoding='latin1') 
combo_data = pd.read_csv('spam_data/hybrid-spam/rcs_sms_spam.csv', encoding='latin1') 


email_df = pd.DataFrame(email_data)
sms_df = pd.DataFrame(sms_data)
combo_df = pd.DataFrame(combo_data)



In [8]:
#now we will need to clean our datasets if needed and observe the first few rows
sms_data.head(3)

#it appears 3 unnamed columns appear after v2 in this data set so we will remove them first 
sms_df = sms_df.loc[:, :'v2'] 
#fix xombo bugs
combo_df.columns = combo_df.columns.str.strip()
combo_df = combo_df.rename(columns={'ï»¿Message':'Message'})
valid_labels = ['ham', 'spam']
combo_df = combo_df[combo_df['Category'].isin(valid_labels)]
email_df = email_df[email_df['Category'].isin(valid_labels)]


#double check it worked YAY
sms_df.head(3)



Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [6]:
#email data looks good already 
email_data.head(3)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [27]:
#now we can do our random forest classifier for sms 

from scipy.sparse import hstack, csr_matrix, vstack #MOVE THIS INTO TOP BOX WHEN JUPYTER STOPS BUGGING OUT

#first create the training and test sets 
sms_labels = sms_df['v1']
sms_data = sms_df['v2'] 
combo_labels = combo_df['Category'] 
combo_data = combo_df.loc[:, ['Message', 'MessageType', 'Verified']]

training_sms_data, testing_sms_data, training_sms_labels, testing_sms_labels = train_test_split(sms_data, sms_labels, test_size=0.3)
training_combo_data, testing_combo_data, training_combo_labels, testing_combo_labels = train_test_split(combo_data, combo_labels, test_size=0.3)

#since the random forset needs numbers and not words we vectorize the text into numbers 
vectorizer = TfidfVectorizer(stop_words='english')
training_sms_data_vect = vectorizer.fit_transform(training_sms_data)
testing_sms_data_vect = vectorizer.transform(testing_sms_data)

#for the combo we only need to vecrorize the message as other values are numeric yay
vectorizer_combo = TfidfVectorizer(stop_words='english')
combo_msg_train_vect = vectorizer_combo.fit_transform(training_combo_data['Message'])
combo_msg_test_vect = vectorizer_combo.transform(testing_combo_data['Message'])

combo_messages_for_sms = vectorizer.transform(combo_data['Message']) 

combo_other_train = training_combo_data[['MessageType', 'Verified']].values
combo_other_test = testing_combo_data[['MessageType', 'Verified']].values
#print("len of combo_other_train", len(combo_other_train), "len of combo_mssage_train", len(combo_msg_train_vect))
train_combo = hstack([combo_msg_train_vect, csr_matrix(combo_other_train)], format='csr')
test_combo = hstack([combo_msg_test_vect, csr_matrix(combo_other_test)], format='csr')


all_combo = vstack([train_combo, test_combo], format='csr')
all_combo_labels = np.concatenate(
    [training_combo_labels, testing_combo_labels]
)

#trrain the model
sms_forest = RandomForestClassifier() 
sms_forest.fit(training_sms_data_vect, training_sms_labels) 

combo_forest = RandomForestClassifier() 
combo_forest.fit(train_combo, training_combo_labels) 



0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
#now we can see what our results look like 
sms_predict = sms_forest.predict(testing_sms_data_vect) 

print("Predicting Info SMS Random Forest Detection :")
print("Accuracy: ", accuracy_score(testing_sms_labels,sms_predict) )
print("Precision: ", precision_score(testing_sms_labels, sms_predict, pos_label='spam'))
print("Recall: ", recall_score(testing_sms_labels,sms_predict,pos_label='spam' ) )
print("F1: ", f1_score(testing_sms_labels,sms_predict,pos_label='spam' ) )

Predicting Info SMS Random Forest Detection :
Accuracy:  0.9742822966507177
Precision:  0.9875
Recall:  0.7939698492462312
F1:  0.8802228412256268


In [28]:
#now we can see what our results look like 
#these results are pretty terrible but it is most likely from the size of the dataset. 
print("Predicting Info COMBO Random Forest Detection using CUSTOM DATA :")
print("Accuracy: ", accuracy_score(testing_combo_labels,combo_predict) )
print("Precision: ", precision_score(testing_combo_labels, combo_predict, pos_label='spam'))
print("Recall: ", recall_score(testing_combo_labels,combo_predict,pos_label='spam' ) )
print("F1: ", f1_score(testing_combo_labels,combo_predict,pos_label='spam' ) )

sms_predict2 = sms_forest.predict(combo_messages_for_sms) 

print("Predicting Info SMS Random Forest Detection USING CUSTOM DATA  :")
print("Accuracy: ", accuracy_score(all_combo_labels,sms_predict2) )
print("Precision: ", precision_score(all_combo_labels, sms_predict2, pos_label='spam'))
print("Recall: ", recall_score(all_combo_labels,sms_predict2,pos_label='spam' ) )
print("F1: ", f1_score(all_combo_labels,sms_predict2,pos_label='spam' ) )



Predicting Info COMBO Random Forest Detection using CUSTOM DATA :
Accuracy:  0.6666666666666666
Precision:  0.0
Recall:  0.0
F1:  0.0
Predicting Info SMS Random Forest Detection USING CUSTOM DATA  :
Accuracy:  0.631578947368421
Precision:  0.3333333333333333
Recall:  0.16666666666666666
F1:  0.2222222222222222


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
# repeat the training process for email dataset 


email_labels = email_df['Category'] 
email_data = email_df['Message'] 

training_email_data, testing_email_data, training_email_labels, testing_email_labels = train_test_split(email_data, email_labels, test_size=0.3)

vectorizer = TfidfVectorizer(stop_words='english')
training_email_data_vect = vectorizer.fit_transform(training_email_data)
testing_email_data_vect = vectorizer.transform(testing_email_data)

email_forest = RandomForestClassifier() 
email_forest.fit(training_email_data_vect, training_email_labels) 


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
#now we can see what our results look like 
email_predict = email_forest.predict(testing_email_data_vect) 

print("Predicting Info Email Random Forest Detection :")
print("Accuracy: ", accuracy_score(testing_email_labels,email_predict) )
print("Precision: ", precision_score(testing_email_labels, email_predict, pos_label='spam'))
print("Recall: ", recall_score(testing_email_labels,email_predict,pos_label='spam' ) )
print("F1: ", f1_score(testing_email_labels,email_predict,pos_label='spam' ) )

Predicting Info Email Random Forest Detection :
Accuracy:  0.97188995215311
Precision:  1.0
Recall:  0.783410138248848
F1:  0.8785529715762274


In [47]:

print("\nCombo Random Forest Regression + Logic with Custom Data ")
hits = misses = 0 
for index,row in combo_df.iterrows(): 
    if row['Category'] == 'spam': 
        if predict_spam(row) == 'spam': 
            hits+=1 
            continue 
        else: 
            misses += 1 
            continue 
    if row['Category'] == 'ham': 
        if predict_spam(row) == 'ham':
            hits += 1 
            continue 
        else: 
            misses+=1
            continuea

print("Accuracy: ", hits / (hits + misses))

print("\nSMS Random Forest with Custom Data ")

hits = misses = 0 
for index,row in combo_df.iterrows(): 
    message_vect = vectorizer.transform([row['Message']])
    prediction = sms_forest.predict(message_vect) 

    if prediction == 'spam':
        if row['Category'] == 'spam':
            hits+=1 
        else:
            misses +=1 
    else:
        if row['Category'] == 'ham':
            hits +=1 
        else:
            misses += 1

print("Accuracy: ", hits / (hits + misses))

    


Combo Random Forest Regression + Logic with Custom Data 
verified ham
verified ham
verified ham
verified ham
verified ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML spam
USING ML ham
USING ML ham
USING ML ham
USING ML ham
USING ML ham
Accuracy:  0.7368421052631579

SMS Random Forest with Custom Data 
Accuracy:  0.631578947368421


In [43]:
#Now we will implement our hybrid model that uses rcs and sms as paramers in addition to the message 
def predict_spam(data):
   # print("Message: ", data['Message'])
    if data['MessageType'] == 1: #RCS 
        if data['Verified'] ==1: #verified by Google 
            print("verified ham")
            return 'ham' # not spam 
             #SMS or unverified RCS 
    msg_vect = vectorizer.transform([data['Message']])#sti.l need to convert message
    #print("USING ML", sms_forest.predict(msg_vect)[0])
    return sms_forest.predict(msg_vect)[0]

