In [40]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0


In [68]:
#import necessary libraries 

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
from imblearn.over_sampling import RandomOverSampler


In [69]:
# Load the three raw datasets.
# NOTE(review): every file is decoded as latin1. The 'ï»¿' prefix that appears on the first
# combo column looks like a UTF-8 byte-order mark read as latin1 - confirm the real encoding.

email_data = pd.read_csv('spam_data/email-spam/email.csv', encoding='latin1')
sms_data = pd.read_csv('spam_data/sms-spam/spam.csv', encoding='latin1')
combo_data = pd.read_csv('spam_data/hybrid-spam/rcs_sms_spam.csv', encoding='latin1')

# Keep the full frames under the *_df names.
email_df = pd.DataFrame(email_data)
sms_df = pd.DataFrame(sms_data)
combo_df = pd.DataFrame(combo_data)

# Show the raw column names of the combo data (they need cleaning before use).
print(combo_df.columns)


Index(['ï»¿Message ', 'Verified', 'Category ', 'MessageType'], dtype='object')


In [70]:
# To get the data ready for regression the labels must be numeric:
# for every dataset 'ham' -> 0 and 'spam' -> 1.

# Clean the combo column names: strip surrounding whitespace, then drop the 'ï»¿' residue
# that prefixes the first column name.
combo_df.columns = combo_df.columns.str.strip()
combo_df = combo_df.rename(columns={'ï»¿Message':'Message'})

# Keep only rows carrying a recognised label so the mapping below cannot produce NaN.
# The SMS frame is filtered the same way as the other two for consistency.
valid_labels = ['ham', 'spam']
combo_df = combo_df[combo_df['Category'].isin(valid_labels)]
email_df = email_df[email_df['Category'].isin(valid_labels)]
sms_df = sms_df[sms_df['v1'].isin(valid_labels)]

# Split every frame into its label column and its feature column(s).
sms_labels = sms_df['v1']
sms_data = sms_df['v2']

email_labels = email_df['Category']
email_data = email_df['Message']

combo_labels = combo_df['Category']
combo_data = combo_df[['Message', 'Verified', 'MessageType']]

# Make the conversion to numeric labels.
label_to_int = {'ham': 0, 'spam': 1}
sms_labels = sms_labels.map(label_to_int)
email_labels = email_labels.map(label_to_int)
combo_labels = combo_labels.map(label_to_int)

combo_labels.head(3)



0    0
1    0
2    0
Name: Category, dtype: int64

In [2]:
# Now that our data is cleaned up we can create the train/test splits (70% train, 30% test).
# A fixed random_state makes the splits, and every metric computed from them, repeatable across runs.
training_sms_data, testing_sms_data, training_sms_labels, testing_sms_labels = train_test_split(
    sms_data, sms_labels, test_size=0.3, random_state=42)
training_email_data, testing_email_data, training_email_labels, testing_email_labels = train_test_split(
    email_data, email_labels, test_size=0.3, random_state=42)
training_combo_data, testing_combo_data, training_combo_labels, testing_combo_labels = train_test_split(
    combo_data, combo_labels, test_size=0.3, random_state=42)


# Only the message text is vectorised; for the combo data the metadata columns are appended afterwards.
training_combo_message = training_combo_data['Message']
testing_combo_message = testing_combo_data['Message']

# Each dataset gets its own TF-IDF vectorizer, fitted on the training split only and then
# applied unchanged to the matching test split.
vectorizer1 = TfidfVectorizer(stop_words='english')
training_sms_data_vect = vectorizer1.fit_transform(training_sms_data)
testing_sms_data_vect = vectorizer1.transform(testing_sms_data)

vectorizer2 = TfidfVectorizer(stop_words='english')
training_email_data_vect = vectorizer2.fit_transform(training_email_data)
testing_email_data_vect = vectorizer2.transform(testing_email_data)

vectorizer3 = TfidfVectorizer(stop_words='english')
training_combo_message_vect = vectorizer3.fit_transform(training_combo_message)
testing_combo_message_vect = vectorizer3.transform(testing_combo_message)

# Metadata features for the combo data. Asking for a float array up front keeps the matrix
# numeric even if the two columns have different dtypes (a mixed-dtype frame can otherwise
# come back as an object array, which csr_matrix does not accept).
train_combo_meta = training_combo_data[['Verified', 'MessageType']].to_numpy(dtype=float)
test_combo_meta = testing_combo_data[['Verified', 'MessageType']].to_numpy(dtype=float)

# Final combo feature matrix: the TF-IDF columns followed by the two metadata columns.
training_combo_data_vect = hstack([training_combo_message_vect, csr_matrix(train_combo_meta)], format='csr')
testing_combo_data_vect = hstack([testing_combo_message_vect, csr_matrix(test_combo_meta)], format='csr')

# Just to experiment: rebalance the combo training set with random oversampling.
# The sampler is seeded so the resampled set is the same on every run.
ros = RandomOverSampler(random_state=42)
x_resampled, y_resampled = ros.fit_resample(training_combo_data_vect, training_combo_labels)

# Now we can train the models: one logistic regression per dataset.

lr_sms = LogisticRegression(max_iter=1000)
lr_sms.fit(training_sms_data_vect, training_sms_labels)

lr_email = LogisticRegression(max_iter=1000)
lr_email.fit(training_email_data_vect, training_email_labels)

# The combo model is fitted on the oversampled data, not on the raw training split.
lr_combo = LogisticRegression(max_iter=1000)
lr_combo.fit(x_resampled, y_resampled)


NameError: name 'train_test_split' is not defined

In [1]:
# Predict on each held-out test set and print the results.

pred_sms = lr_sms.predict(testing_sms_data_vect)
pred_email = lr_email.predict(testing_email_data_vect)
pred_combo = lr_combo.predict(testing_combo_data_vect)

# NOTE(review): a missing test label is counted as ham (0) from here on - confirm this is intended.
testing_sms_labels = testing_sms_labels.fillna(0)
testing_email_labels = testing_email_labels.fillna(0)
testing_combo_labels = testing_combo_labels.fillna(0)


# One report per model: heading, then precision, recall, F1 and accuracy on that model's test set.
for report_title, true_labels, predicted_labels in (
    ("SMS Logistic Regression", testing_sms_labels, pred_sms),
    ("\nEmail Logistic Regression", testing_email_labels, pred_email),
    ("\nCombo Logistic Regression", testing_combo_labels, pred_combo),
):
    print(report_title)
    print("Precision:", precision_score(true_labels, predicted_labels))
    print("Recall:", recall_score(true_labels, predicted_labels))
    print("F1:", f1_score(true_labels, predicted_labels))
    print("Accuracy:", accuracy_score(true_labels, predicted_labels))

print("\nCombo Logistic Regression + Logic with Custom Data ")
# NOTE: predict_spam is defined in a later cell of this notebook; that cell has to be run before this one.
# Expected predict_spam result for each category; rows with any other category are not counted.
expected_by_category = {'spam': 1, 'ham': 0}
hits = misses = 0
for row_label, row in combo_df.iterrows():
    expected = expected_by_category.get(row['Category'])
    if expected is None:
        continue
    if predict_spam(row) == expected:
        hits += 1
    else:
        misses += 1

print("Accuracy: ", hits / (hits + misses))

print("\nSMS Logistic Regression with Custom Data ")
# Score every combo message with the SMS-only model in one batch: a single transform and a
# single predict call for the whole frame instead of one of each per row.
is_spam = (combo_df['Category'] == 'spam').to_numpy()
is_ham = (combo_df['Category'] == 'ham').to_numpy()
flagged_spam = lr_sms.predict(vectorizer1.transform(combo_df['Message'])) == 1

# A row is a hit when a prediction of 1 meets 'spam' or any other prediction meets 'ham';
# every remaining row is a miss.
hits = int(((flagged_spam & is_spam) | (~flagged_spam & is_ham)).sum())
misses = len(combo_df) - hits

print("Accuracy: ", hits / (hits + misses))
    
    

NameError: name 'lr_sms' is not defined

In [76]:
# Hybrid model: uses the RCS/SMS metadata as parameters in addition to the message text.
def predict_spam(data):
    """Classify a single message record as spam (1) or not spam (0).

    A record whose 'MessageType' is 1 (RCS) and whose 'Verified' is 1 (verified by
    Google) is returned as 0 without looking at the text. Any other record (SMS or
    unverified RCS) has its 'Message' vectorised with `vectorizer1` and scored by
    `lr_sms`; the first element of that prediction is returned.

    `data` must support lookup by the keys 'MessageType', 'Verified' and 'Message'.
    """
    is_verified_rcs = data['MessageType'] == 1 and data['Verified'] == 1
    if is_verified_rcs:
        return 0  # not spam

    # SMS or unverified RCS: the message still needs to be converted before it can be classified.
    message_features = vectorizer1.transform([data['Message']])
    return lr_sms.predict(message_features)[0]


            
