In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from __future__ import division

import base64
import csv
import gzip
import zlib
import os

import sklearn
import sklearn.preprocessing
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import numpy as np
from tqdm import tqdm

from collections import namedtuple

# My files
from decorators import *
from htmlparser import HtmlInfo, html2info_parser, html2info_bs_visible
from featureextractor import Features, calc_features, easy_tokenizer, pymorphy_tokenizer
from tokens import read_words, write_words, get_most_frequent_words, get_word2idx

%matplotlib inline
import matplotlib.pyplot as plt

# Загрузка данных

In [7]:
# One parsed dataset row: integer id, boolean spam label, page URL and the
# base64-decoded page (stored in ``features``).
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'features'])


def load_csv(input_file_name, calc_features_f):
    """Lazily read a tab-separated dataset file, yielding one ``DocItem`` per row.

    The first line is treated as a header and discarded.  Every following
    non-blank line must hold at least four tab-separated fields:
    integer id, integer label (0/1), URL, base64-encoded page.

    Args:
        input_file_name: Path to the data file.  A name ending in ``gz`` is
            opened with ``gzip.open``; anything else with the builtin ``open``.
        calc_features_f: Accepted for interface compatibility but never
            called; ``features`` carries the decoded page unchanged.

    Yields:
        DocItem(doc_id, is_spam, url, features) for each data row.

    Raises:
        ValueError: if the id or label field is not an integer.
        IndexError: if a non-blank row has fewer than four fields.
    """
    opener = gzip.open if input_file_name.endswith('gz') else open
    # NOTE(review): under Python 3 gzip.open defaults to binary mode, so the
    # str-based splitting below would need text mode there -- confirm the
    # target interpreter before porting.
    with opener(input_file_name) as input_file:
        input_file.readline()  # discard the header row

        for line in tqdm(input_file):
            if not line.strip():
                # Blank (e.g. trailing) lines carry no record.
                continue
            # Strip only the line terminator: a plain strip() would also eat a
            # trailing tab and thereby drop an empty page field.
            parts = line.rstrip('\r\n').split('\t')
            url_id = int(parts[0])
            mark = bool(int(parts[1]))
            url = parts[2]
            html_data = base64.b64decode(parts[3])
            yield DocItem(url_id, mark, url, html_data)

In [8]:
%%time

# Directory holding the train/test dataset files.  Set the ANTISPAM_DATA_DIR
# environment variable to point somewhere else; the previous hard-coded
# location stays as the fallback so existing setups keep working.
DATA_DIR = os.environ.get('ANTISPAM_DATA_DIR',
                          '/Users/zerogerc/Documents/datasets/antispam')
FILE_TRAIN = os.path.join(DATA_DIR, 'kaggle_train_data_tab.csv.gz')
FILE_TEST = os.path.join(DATA_DIR, 'kaggle_test_data_tab.csv.gz')

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 24.1 µs


In [9]:
# Drain the lazy reader into a list so the training rows can be reused.
train_docs = list(load_csv(input_file_name=FILE_TRAIN, calc_features_f=calc_features))

7044it [00:05, 1342.86it/s]


In [10]:
# Drain the lazy reader into a list so the test rows can be reused.
test_docs = list(load_csv(input_file_name=FILE_TEST, calc_features_f=calc_features))

16039it [00:12, 1269.78it/s]


# Классификатор

In [11]:
class Classifier(object):
    """Spam classifier built from a scikit-learn pipeline.

    The pipeline chains ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``SGDClassifier``.  Each document's ``features`` attribute is handed to
    the pipeline as is.
    """

    def __init__(self, random_state=None):
        """Create the unfitted pipeline.

        Args:
            random_state: Forwarded to ``SGDClassifier``.  Pass an integer to
                make training repeatable; the default ``None`` leaves the
                estimator's own default in place.
        """
        self.model = Pipeline([
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('sgd', SGDClassifier(random_state=random_state)),
        ])

    def get_X(self, docs):
        """Return the list of ``features`` values, one per document, in order."""
        return [doc.features for doc in docs]

    def get_Y(self, docs):
        """Return a numpy array of 1 (spam) / 0 (not spam) labels for ``docs``.

        The labels are taken from the documents passed in, so they always line
        up with ``get_X(docs)``.
        """
        return np.array([1 if doc.is_spam else 0 for doc in docs])

    def predict_all(self, docs):
        """Predict a label for every document.

        Args:
            docs: Sequence of objects exposing ``doc_id`` and ``features``.

        Returns:
            List of ``(doc_id, predicted_label)`` tuples in input order.
        """
        predicted = self.model.predict(self.get_X(docs))
        return [(doc.doc_id, label) for doc, label in zip(docs, predicted)]

    def train(self, docs):
        """Fit the pipeline on ``docs`` using their ``features`` and ``is_spam``."""
        self.model.fit(self.get_X(docs), self.get_Y(docs))

# Обучение и запись результата

In [14]:
# Build a fresh pipeline and fit it on the training documents.
classifier = Classifier()
classifier.train(docs=train_docs)

In [16]:
# Smoke test: classify one hand-written page; the cell displays the result.
test_url = 'mail.ru'
test_html = '<title>Mail</title> <body>mail mail mail</body>'
sample_doc = DocItem(doc_id=0, is_spam=0, url=test_url, features=test_html)
classifier.predict_all([sample_doc])

[(0, 0)]

In [17]:
# Write the submission file: a header row, then one (Id, Prediction) row per
# test document.
with open('my_submission.csv', 'wb') as submission_file:
    submission_writer = csv.writer(submission_file)
    submission_writer.writerow(['Id', 'Prediction'])
    submission_writer.writerows(
        [doc_id, label] for doc_id, label in classifier.predict_all(test_docs)
    )