# 1. Text Classification


In [1]:
# import libraries
import pandas as pd

In [2]:
# Load the spam dataset, discard the three unnamed extra columns and give the
# two remaining columns descriptive names.
data = pd.read_csv(
    "metin_siniflandirma_spam_veri_seti.csv",
    encoding="latin-1",
).drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
data.columns = ["label", "text"]

In [3]:
# EDA: print the number of missing (NaN) values in each column
print(data.isna().sum())

label    0
text     0
dtype: int64


# 2. Text Cleaning and Preprocessing


In [4]:
# import libraries and download necessary datasets
import nltk

nltk.download("stopwords") # very common words that carry no meaning; they will be removed from the text
nltk.download("wordnet") # dataset required to find lemmas
nltk.download("omw-1.4") # dataset with word meanings in different languages for WordNet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# import regular expressions and lemmatizer
import re 
from nltk.corpus import stopwords  # used to remove stop words
from nltk.stem import WordNetLemmatizer # lemmatization

In [6]:
# Lemmatizer instance reused for every word during cleaning
lemmatizer = WordNetLemmatizer()
# Raw message texts as a plain Python list
text = data["text"].tolist()

In [7]:
# Clean every message: keep letters only, lower-case, drop English stop words,
# lemmatize the remaining words and join them back into a single string.

# Build the stop-word lookup once instead of calling stopwords.words("english")
# for every token; a set also gives constant-time membership tests.
english_stopwords = set(stopwords.words("english"))
# Matches every character that is not an ASCII letter.
non_letters = re.compile("[^a-zA-Z]")

corpus = []

for message in text:
    # replace non-letter characters with spaces, then convert to lower case
    cleaned = non_letters.sub(" ", message).lower()

    # split into words, skip stop words, lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in english_stopwords
    ]

    corpus.append(" ".join(tokens))

# store the cleaned text next to the original text
data["text2"] = corpus

In [8]:
# show the DataFrame, which now also holds the cleaned "text2" column
data

Unnamed: 0,label,text,text2
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


# 3. Model training and evaluation

In [9]:
from sklearn.model_selection import train_test_split

# features: cleaned message text; target: ham/spam label
X, y = data["text2"], data["label"]

# hold out 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [10]:
# feature extraction: learn the vocabulary on the training split only
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)

# classifier training
from sklearn.tree import DecisionTreeClassifier
# a fixed random_state keeps the fitted tree, and therefore the reported
# metrics, reproducible from run to run
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_cv, y_train)

# encode the test split with the vocabulary learned from the training data
X_test_cv = cv.transform(X_test)

In [11]:
# Predict labels for the vectorized test messages and compare them with the
# true labels in a confusion matrix.
from sklearn.metrics import confusion_matrix

prediction = dt.predict(X_test_cv)
c_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)

In [12]:
# show the confusion matrix of the test predictions
# NOTE(review): scikit-learn's convention is rows = true labels, columns = predicted labels — confirm label order
c_matrix

array([[958,   7],
       [ 22, 128]], dtype=int64)

In [13]:
# Accuracy in percent: correctly classified samples (the diagonal of the
# confusion matrix) divided by all samples. Using the trace keeps the formula
# correct for any number of classes, not only for a 2x2 matrix.
accuracy = 100 * c_matrix.trace() / c_matrix.sum()

In [16]:
# report the accuracy (in percent) computed from the confusion matrix
print(accuracy)

97.39910313901345
