In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.feature_extraction.text import TfidfVectorizer
from stemming.porter2 import stem
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
# Read the SMS dataset from JSON, put the rows in index order,
# then preview the first few records.
df = (
    pd.read_json('../Datasets/Sms_spam_1.json')
    .sort_index()
)
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(5574, 2)

In [4]:
# Distinct values that occur in the label column.
df['label'].unique()

array([0, 1])

In [5]:
# Share of rows per label; normalize=True reports fractions instead of counts.
# NOTE(review): label 1 is presumably "spam" -- confirm against the dataset description.
df['label'].value_counts(normalize=True)

0    0.865985
1    0.134015
Name: label, dtype: float64

In [6]:
def tokenizer_porter(doc):
  """Split `doc` on whitespace and return the list of stemmed pieces.

  Every whitespace-separated piece is handed to `stem` (imported from
  `stemming.porter2`) as-is; no punctuation stripping happens before
  stemming.
  """
  return list(map(stem, doc.split()))

In [7]:
# TF-IDF vectorizer that tokenizes with tokenizer_porter and has the
# built-in English stop-word list enabled.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    stop_words='english',
)

In [8]:
# Learn the vocabulary and IDF weights from every message and build the
# TF-IDF document-term matrix.
#
# The result is kept as the sparse matrix that fit_transform returns.
# Calling .toarray() would allocate a dense (documents x vocabulary) array
# that is almost entirely zeros; train_test_split and LogisticRegression
# both accept sparse input directly.
#
# NOTE(review): the vectorizer is fitted on all messages before the
# train/test split, so the IDF statistics include the test rows.
x = vectorizer.fit_transform(df['message'])

In [9]:
# Target vector: the label column of the same frame the features were built from.
y = df['label']

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# L1-penalised logistic regression. The solver is set explicitly because
# scikit-learn's default solver (lbfgs since version 0.22) rejects
# penalty='l1'; liblinear supports it.
estimator = LogisticRegression(penalty='l1', solver='liblinear')

# Candidate values of C, the inverse regularisation strength
# (larger C means a weaker penalty).
parameters = {'C': (10**2, 10**3, 10**4, 10**5, 10**6)}

# 5-fold cross-validated search over C, ranked by F1 score.
grid_search = GridSearchCV(estimator, parameters, cv=5, scoring='f1')
grid_search.fit(x_train, y_train)

In [None]:
# Mean cross-validated score (F1, per the scoring argument) of the best candidate.
grid_search.best_score_

In [None]:
# Parameter setting (here, the value of C) with the best cross-validated score.
grid_search.best_params_

In [None]:
# Estimator chosen by the search; with GridSearchCV's default refit=True it
# has been refitted on the full training split using the best parameters.
lr = grid_search.best_estimator_

In [None]:
# Predicted labels for the held-out test rows.
y_pred = lr.predict(x_test)

In [None]:
# Confusion matrix: actual labels on the rows, predicted labels on the columns.
table = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predictions'],
)

# Render the counts as an annotated, square-celled heatmap.
sbn.heatmap(
    table,
    annot=True,
    fmt='d',
    annot_kws={'size': 20},
    cmap='Blues',
    cbar=True,
    square=True,
)
plt.show()

In [None]:
# Mean accuracy of the selected model on the held-out test rows.
lr.score(x_test,y_test) # accuracy

In [None]:
# F1 score of the test-set predictions against the true labels.
f1_score(y_test,y_pred)