In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

# Import data 

In [None]:
# Load the raw SMS spam CSV; it is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
data = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv',encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

# **Data Cleaning**

In [None]:
# Keep only the two columns used downstream (v1 and v2), as an independent copy.
data = data.loc[:, ['v1', 'v2']].copy()

In [None]:
# Give the two columns descriptive names: v1 -> class, v2 -> text.
data = data.rename(columns={'v1': 'class', 'v2': 'text'})

In [None]:
# Encode the string label as an integer target: ham -> 0, spam -> 1.
data = data.assign(target=data['class'].map({'ham': 0, 'spam': 1}))

In [None]:
# Working frame with just the numeric target and the message text.
# .copy() makes it independent of `data`, so the later de-duplication and
# column assignment on new_data do not operate on a slice of another frame
# (which triggers pandas' SettingWithCopyWarning).
new_data = data[['target', 'text']].copy()

In [None]:
# Count rows that are exact duplicates (same target and text) of an earlier row.
new_data.duplicated().sum()

In [None]:
# Drop exact duplicate rows, keeping the first occurrence.
# Rebinding the name instead of using inplace=True avoids mutating a frame
# that was derived from `data` (a source of SettingWithCopyWarning) and keeps
# the cell safe to re-run.
new_data = new_data.drop_duplicates()

In [None]:
# Re-count duplicates after de-duplication (expected to be 0).
new_data.duplicated().sum()

In [None]:
# Display the de-duplicated frame.
new_data

# Visualization 

In [None]:
# Bar chart of how many messages fall into each target class.
ax = new_data['target'].value_counts().plot.bar()
ax.set_xlabel('Target')
ax.set_title('spam:1 and ham:0')
plt.show()

* 1 is spam
* 0 is ham
* The classes are imbalanced

# **Text Data Preprocessing**
* Lower-casing
* Tokenization
* Removing special characters
* Removing stop words and punctuation
* Stemming

In [None]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
import string

In [None]:
# Shared Porter stemmer instance; transform_data calls ps.stem on each kept token.
ps=PorterStemmer()

#### Function for transforming text

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words("english"))


def transform_data(data):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens, drop English stop words and punctuation, then
    Porter-stem what remains.

    Parameters
    ----------
    data : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no token
        survives the filters).
    """
    # The stop-word list is loop-invariant: fetch it once (cached across calls)
    # instead of re-reading it from the corpus for every token.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(data.lower())
    kept = [
        tok for tok in tokens
        if tok.isalnum() and tok not in stop_words and tok not in string.punctuation
    ]
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
# Quick check of the transformer on a sample sentence.
transform_data("hello guys I am ziad")

In [None]:
# Run every message through transform_data and store the result in a new column.
new_data['transformed_txt'] = new_data['text'].apply(transform_data)

In [None]:
# Flatten the tokens of every spam message (target == 1) into a single list.
spam_messages = new_data.loc[new_data['target'] == 1, 'transformed_txt']
spam_corpus = [word for msg in spam_messages for word in msg.split()]

In [None]:
# Total number of tokens collected from spam messages.
len(spam_corpus)

In [None]:
from collections import Counter
# The 15 most frequent tokens across all spam messages, as (token, count) pairs.
spam_word_counts = Counter(spam_corpus)
spam_word_counts.most_common(15)

In [None]:
# Flatten the tokens of every ham message (target == 0) into a single list.
ham_messages = new_data.loc[new_data['target'] == 0, 'transformed_txt']
ham_corpus = [word for msg in ham_messages for word in msg.split()]

In [None]:
# Total number of tokens collected from ham messages.
len(ham_corpus)

# **Model**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with scikit-learn's default settings.
cv= CountVectorizer()

In [None]:
# Renumber rows 0..n-1 after the duplicate rows were removed.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column, and rebinding (rather than inplace=True) makes the cell
# idempotent: the in-place form adds another column on each re-run and
# eventually raises when the column name already exists.
new_data = new_data.reset_index(drop=True)

In [None]:
# Dense document-term count matrix and the matching label vector.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split that follows, so words seen only in test messages still shape the
# feature space.
X = cv.fit_transform(new_data.transformed_txt).toarray()
y = new_data.target.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing with a fixed seed.
# stratify=y keeps the spam/ham ratio the same in both splits; with
# imbalanced classes an unstratified split can leave the test set with a
# noticeably different share of spam, which skews precision and recall.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix ,precision_score , recall_score
# Search over regularisation strength C (1e-3 .. 1e3) and the penalty type.
params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
# liblinear supports both the l1 and l2 penalties. The lbfgs solver only
# supports l2 (or no penalty), so with lbfgs every l1 candidate in the grid
# failed to fit and was scored as NaN.
log_clf = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
# 3-fold cross-validation over the grid; verbose=3 logs each fit.
gs=GridSearchCV(log_clf,params,cv=3,verbose=3)
gs.fit(X_train,y_train)

In [None]:
# Mean cross-validated score of the best parameter combination.
gs.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Predict labels for the held-out test rows with the fitted grid search.
y_pred=gs.predict(X_test)


## Evaluation

In [None]:

# Precision and recall on the held-out test set, reported as percentages.
precision_pct = 100 * precision_score(y_test, y_pred)
recall_pct = 100 * recall_score(y_test, y_pred)
print("Precision: {:.2f}%".format(precision_pct))
print("Recall: {:.2f}%".format(recall_pct))

In [None]:
# Confusion matrix of test predictions: rows are true labels, columns are predicted labels.
cm=confusion_matrix(y_test,y_pred)
cm

## Confusion matrix 

In [None]:

# Annotated heatmap of the confusion matrix.
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# abbreviates larger counts to scientific notation (e.g. 8.9e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()