In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('./data_list/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)

In [4]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### extraction!(reg) => not null, word

In [5]:
df = df[df['v2'].notnull()]
df['v2'] = df['v2'].apply(lambda x: re.sub(r'[^[a-zA-Z]*$', '', x))
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#### 중복값이 있다면 제거

In [6]:
df.drop_duplicates(subset=['v2'], inplace=True)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### classification => train/test data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
            df['v2'],
            df['v1'],
            test_size=0.2,
            stratify=df['v1'],
            random_state=42
)

### stopwords & vectorize => choice(Counter, Tf-idf) : fit(train) —> transform(train,test)

In [8]:
tfidf = TfidfVectorizer(min_df=5, lowercase = True, stop_words='english')

In [9]:
tfidf.fit(X_train)

TfidfVectorizer(min_df=5, stop_words='english')

In [10]:
X_train =  tfidf.transform(X_train)
X_test= tfidf.transform(X_test)

In [11]:
print(X_train.shape, X_test.shape)

(4101, 1211) (1026, 1211)


In [12]:
tfidf.inverse_transform(X_train)

[array(['yogasana', 'yoga', 'power', 'hey'], dtype='<U15'),
 array(['luck', 'car', 'ask'], dtype='<U15'),
 array(['mr', 'ill', 'hell', 'forgot', 'believe'], dtype='<U15'),
 array(['time', 'spent', 'sounds', 'nice', 'lot', 'long', 'like', 'grins',
        'good', 'going', 'dog', 'boy', 'bath'], dtype='<U15'),
 array(['love', 'life', 'isn', 'feeling', 'decide'], dtype='<U15'),
 array(['won', 'urgent', 'reach', 'prize', 'mobile', 'caller', 'bonus',
        'attempt', 'asap', '2nd', '150ppm', '06', '03', '02', '000'],
       dtype='<U15'),
 array(['support'], dtype='<U15'),
 array(['true'], dtype='<U15'),
 array(['going', 'dat', 'aft'], dtype='<U15'),
 array(['wish', 'feel'], dtype='<U15'),
 array(['open', 'door'], dtype='<U15'),
 array(['told', 'thing', 'problem', 'know', 'just', 'dont', 'brother',
        'ask'], dtype='<U15'),
 array(['know', 'guys', 'don'], dtype='<U15'),
 array(['voice', 'speak', 'problem', 'mobile', 'listen', 'later', 'calls'],
       dtype='<U15'),
 array(['thing', 

In [13]:
tfidf.inverse_transform(X_test)

[array(['ì_', 'tmr', 'rite', 'project', 'll', 'da'], dtype='<U15'),
 array(['today'], dtype='<U15'),
 array(['test', 'tc', 'heart'], dtype='<U15'),
 array(['story', 'got', 'did'], dtype='<U15'),
 array(['wonder', 'text', 'll'], dtype='<U15'),
 array(['yup', 'school', 'said', 'reach', 'lunch', 'lor', 'home', 'dunno',
        'dad', 'coming', 'bring'], dtype='<U15'),
 array(['huh', 'facebook', 'didn'], dtype='<U15'),
 array(['winner', 'valued', 'valid', 'selected', 'prize', 'network',
        'hours', 'customer', 'code', 'claim', '12'], dtype='<U15'),
 array(['number', 'minute', 'line', 'fixed', 'direct', 'access'],
       dtype='<U15'),
 array(['treat', 'today', 'sorry', 'number', 'miss', 'lt', 'loving', 'log',
        'late', 'information', 'gt', 'friends', 'face', 'dont', 'details',
        'dear', 'book', 'birthday', 'ar'], dtype='<U15'),
 array(['going'], dtype='<U15'),
 array(['wont'], dtype='<U15'),
 array(['www', 'uk', 'stop', 'services', 'reply', 'messages', 'discount',
        

### ml model fit(train) => predict(test) => accuracy check

In [14]:
SA_lr = LogisticRegression(random_state = 0)

In [15]:
SA_lr.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [16]:
test_predict = SA_lr.predict(X_test)

In [17]:
print('스팸 메일 분석 정확도 : ', round(accuracy_score(y_test, test_predict), 5))

스팸 메일 분석 정확도 :  0.96881


### one mail => predict!!

In [18]:
sp = df['v2'][15]

In [19]:
sp2 = re.sub(r'[^[a-zA-Z]*$', "", sp)
sp2

'XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL'

In [20]:
sp_tfidf = tfidf.transform([sp2])
sp_tfidf

<1x1211 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [21]:
sp2_predict = SA_lr.predict(sp_tfidf)
sp2_predict, sp2_predict[0]

(array(['spam'], dtype=object), 'spam')

In [22]:
if(sp2_predict[0] == 'spam'):
    print(sp2, '==> spam mail')
else:
    print(sp2, '==> ham mail')

XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL ==> spam mail


### 모델 저장

In [23]:
import pickle
import joblib

In [24]:
tfidf_fit = tfidf.fit(df['v2'])
tfidf_fit

TfidfVectorizer(min_df=5, stop_words='english')

In [25]:
saved_model = pickle.dumps(SA_lr)

In [26]:
lr_from_pickle = pickle.loads(saved_model)
lr_from_pickle.predict(sp_tfidf[0])

array(['spam'], dtype=object)

In [27]:
joblib.dump(SA_lr, 'spam_SA_lr.pkl')

['spam_SA_lr.pkl']

In [28]:
lr_from_joblib = joblib.load('spam_SA_lr.pkl')
lr_from_joblib

LogisticRegression(random_state=0)

In [29]:
lr_from_joblib.predict(sp_tfidf[0])

array(['spam'], dtype=object)

### 모듈화

In [30]:
def model_vectorizing(vectorizer, model, X, y,):
    
    # vectorizer = tfidVectorizer or CountVectorizer
    # If you want to add another model, recommend importing it.
    # model = SA_lr(= LogisticRegression(random_state))
    
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression
    
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)
    train_tfidf = vectorizer.fit_transform(X_train)
    test_tfidf = vectorizer.transform(X_test)
    
    SA_lr.fit(train_tfidf, y_train)
    print('스팸처리 정확도 : ', round(accuracy_score(y_test, test_predict), 2))

def one_mail_predict_df(x):
    import re
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    
    content = re.sub(r'[^[a-zA-Z]*$', "", x)
    content_tfidf = tfidf.transform([content])
    # SA_lr = LogisticRegression(random_state)
    content_predict = SA_lr.predict(content_tfidf)
    if(content_predict[0] == 'spam'):
        print(content, '==> spam mail')
    else:
        print(content, '==> ham mail')
    

def one_mail_predict_input():
    import re
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    
    content_raw = input('분류하려는 메일의 내용을 입력하세요 >> ')
    content = re.sub(r'[^[a-zA-Z]*$', "", content_raw)
    content_tfidf = tfidf.transform([content])
    # SA_lr = LogisticRegression(random_state)
    content_predict = SA_lr.predict(content_tfidf)
    if(content_predict[0] == 'spam'):
        print(content, '==> spam mail')
    else:
        print(content, '==> ham mail')
    
def save_dump(model):
    name = input("파일명>>")
    import pickle 
    import joblib
    saved_model = pickle.dumps(model)
    lr_from_pickle = pickle.loads(saved_model)
    joblib.dump(SA_lr, name+'.pkl')

In [31]:
model_vectorizing(tfidf, SA_lr, df['v2'], df['v1'])

스팸처리 정확도 :  0.97


In [33]:
lr_from_joblib = joblib.load('spam_module.pkl')
lr_from_joblib

LogisticRegression(random_state=0)

In [34]:
lr_from_joblib.predict(sp_tfidf[0])

array(['spam'], dtype=object)

In [35]:
one_mail_predict_df(df['v2'][15])

XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL ==> spam mail


In [36]:
one_mail_predict_input()

분류하려는 메일의 내용을 입력하세요 >> XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL
XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL ==> spam mail


In [37]:
save_dump(SA_lr)

파일명>>spam_SA_lr
