# TF-IDF + RidgeClassifierで文書データの機械学習
Ref: https://www.kaggle.com/him4318/avito-lightgbm-with-ridge-feature-v-2-0/code

## Modules

In [None]:
import gc
import os
import re
import string
import time
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Models Packages
from sklearn import feature_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import Ridge, RidgeClassifier
from sklearn.metrics import auc, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split

try:
    # Legacy module; it was removed in scikit-learn 0.20.
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold

# TF-IDF
from nltk.corpus import stopwords
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

## Settings

In [None]:
# Number of cross-validation folds and the random seed used for the fold
# shuffling and the model.
fold_num, seed = 4, 7
# Flag that is not read anywhere in the visible part of this notebook.
valid = False

## Class

In [None]:
class SklearnWrapper(object):
    """Adapter exposing a ``train`` / ``predict`` API around an estimator class.

    Args:
        clf: Estimator class (not an instance); it is instantiated as
            ``clf(**params)``.
        seed: Value stored under ``random_state`` when ``seed_bool`` is true.
        params: Keyword arguments for ``clf``. ``None`` means no extra
            arguments.
        seed_bool: Whether ``random_state=seed`` is injected into the
            constructor arguments.
    """

    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        # Work on a copy: the caller's dict is never mutated, and the default
        # ``params=None`` no longer fails on item assignment or ``**None``.
        kwargs = dict(params) if params is not None else {}
        if seed_bool:
            kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's ``predict`` output for ``x``."""
        return self.clf.predict(x)

## Methods

In [1]:
def get_oof(clf, x_train, y, x_test, len_train, len_test):
    """Produce out-of-fold train predictions and fold-averaged test predictions.

    The folds come from the module-level ``kf``. It may be either an object
    with a ``split`` method (``sklearn.model_selection.KFold``) or any
    iterable of ``(train_index, test_index)`` pairs (the legacy
    ``cross_validation.KFold`` or a pre-computed list).

    Args:
        clf: Object with ``train(x, y)`` and ``predict(x)`` methods.
        x_train: Training features, indexable by an integer index array.
        y: Training targets; converted to an array so fold indices are
            always applied positionally.
        x_test: Test features passed to ``clf.predict`` once per fold.
        len_train: Number of training rows.
        len_test: Number of test rows.

    Returns:
        Tuple ``(oof_train, oof_test)`` of arrays reshaped to one column:
        the out-of-fold predictions for the training rows and the mean of
        the per-fold test predictions.
    """
    y_values = np.asarray(y)

    # Materialise the folds so the per-fold buffer can be sized from the
    # splitter itself instead of the global ``fold_num``.
    if hasattr(kf, "split"):
        folds = list(kf.split(x_train))
    else:
        folds = list(kf)

    oof_train = np.zeros((len_train,))
    oof_test_skf = np.empty((len(folds), len_test))

    for i, (train_index, test_index) in enumerate(folds):
        print('\nFold {}'.format(i))
        clf.train(x_train[train_index], y_values[train_index])

        # Each training row is predicted by the model that did not see it.
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
def cleanName(text):
    """Normalise a text value before vectorisation.

    Lower-cases ``text``, deletes a fixed set of punctuation and symbol
    characters, and collapses every run of whitespace into one space.

    Args:
        text: The raw string.

    Returns:
        The cleaned string, or the sentinel ``"name error"`` when ``text``
        is not string-like (for example ``None`` or a float NaN).
    """
    try:
        lowered = text.lower()
        stripped = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', lowered)
    except (AttributeError, TypeError):
        # AttributeError: the value has no .lower() (None, NaN, numbers).
        # TypeError: .lower() returned something the str pattern cannot scan
        # (e.g. bytes). Anything else, such as KeyboardInterrupt, propagates.
        return "name error"
    return " ".join(stripped.split())

In [None]:
def rmse(y, y0):
    """Return the root-mean-square error between ``y`` and ``y0``.

    Both arguments must have the same length and support element-wise
    subtraction.
    """
    assert len(y) == len(y0)
    residual = y - y0
    squared = np.power(residual, 2)
    return np.sqrt(np.mean(squared))

In [None]:
def get_col(col_name):
    """Return a function that looks up ``col_name`` in its argument."""
    def pick(x):
        return x[col_name]

    return pick

## Processing

### Preprocessing

In [None]:
# Read the training set, the test set and the sample submission, in that
# order, from paths relative to the working directory.
train_df, test_df, sub_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Stack the train rows on top of the test rows so cleaning and TF-IDF fitting
# see both. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent and likewise keeps each frame's own index.
train_test_df = pd.concat([train_df, test_df])
# The .str accessor has no fillna(); missing comments are filled on the
# Series itself.
train_test_df["comment_text"] = train_test_df["comment_text"].fillna("no_desc")

In [None]:
# Row counts, used later to slice the stacked feature matrix back apart.
len_train, len_test = train_df.shape[0], test_df.shape[0]

In [None]:
# The legacy ``KFold(n, n_folds=...)`` signature came from
# sklearn.cross_validation, which no longer exists. The model_selection
# splitter takes ``n_splits`` and produces folds through ``split``.
from sklearn.model_selection import KFold

# Stored as a list of (train_index, test_index) pairs so that code iterating
# over ``kf`` directly keeps working.
kf = list(
    KFold(n_splits=fold_num, shuffle=True, random_state=seed).split(np.arange(len_train))
)

In [None]:
# Apply cleanName to every comment.
cleaned_comments = train_test_df["comment_text"].map(cleanName)
train_test_df["comment_text"] = cleaned_comments

### TF-IDF feature extraction

In [None]:
import nltk

# Fetches the NLTK stop-word corpus; only needed on the first run.
nltk.download('stopwords')
# scikit-learn documents ``stop_words`` as 'english', a list or None, and
# recent releases reject a set during parameter validation. Deduplicate,
# then hand over a list.
en_stop = sorted(set(stopwords.words('english')))

In [None]:
# Keyword arguments for the TF-IDF vectorizer.
# The min_df (5) and max_df (.9) options are intentionally left disabled.
tfidf_para = dict(
    stop_words=en_stop,
    analyzer='word',
    token_pattern=r'\w{1,}',
    sublinear_tf=True,
    dtype=np.float32,
    norm='l2',
    smooth_idf=False,
)

In [None]:
# To learn from several text features at once, use a FeatureUnion pipeline.
# Here a single vectorizer is enough: its preprocessor pulls the
# 'comment_text' value out of each input record.
vectorizer = TfidfVectorizer(
    **tfidf_para,
    preprocessor=get_col('comment_text'),
    max_features=17000,
    ngram_range=(1, 2),
)

In [None]:
# Fit on train and test comments together. Each record is a dict holding the
# 'comment_text' value of one row.
tfidf_df = vectorizer.fit_transform(train_test_df[["comment_text"]].to_dict('records'))

# get_feature_names() was removed in scikit-learn 1.2. Prefer the replacement
# when it exists, and keep the result a plain list as before.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_vocab = list(vectorizer.get_feature_names_out())
else:
    tfidf_vocab = vectorizer.get_feature_names()

### Ridge classification

In [None]:
# Train-test split: rows with a "toxic" label are training rows, rows without
# one are test rows. reset_index() renumbers each part from zero and keeps the
# previous index as an "index" column.
train_df = train_test_df.loc[train_test_df["toxic"].notnull()].reset_index()
test_df = train_test_df.loc[train_test_df["toxic"].isnull()].reset_index()

In [None]:
# Constructor arguments for the ridge model. The ``normalize`` keyword was
# dropped because scikit-learn 1.2 removed it from the linear models; the old
# value (False) was the default, so the fitted model is unchanged on versions
# that still accepted it.
ridge_params = {
    'alpha': 20.0,
    'fit_intercept': True,
    'copy_X': True,
    'max_iter': None,
    'tol': 0.001,
    'solver': 'auto',
    'random_state': seed,
}

In [None]:
# RidgeClassifier must be imported from sklearn.linear_model at the top of the
# notebook; the original import list only brought in Ridge.
ridge = SklearnWrapper(clf=RidgeClassifier, seed=seed, params=ridge_params)

# The stacked TF-IDF matrix holds the training rows first, then the test rows.
# Labels are passed as a plain array so fold indices select rows by position.
ridge_oof_train, ridge_oof_test = get_oof(
    ridge,
    tfidf_df[:len_train],
    train_df["toxic"].values,
    tfidf_df[len_train:],
    len_train,
    len_test,
)

In [None]:
# sklearn.metrics.auc integrates an arbitrary (x, y) curve and is not meant to
# be called with labels and predictions. roc_auc_score computes the ROC AUC of
# the out-of-fold predictions against the true labels.
# NOTE(review): the scores are whatever clf.predict returns. If those are hard
# class labels, ranking scores (e.g. decision_function) would give a more
# informative AUC.
ridge_auc = roc_auc_score(train_df["toxic"], ridge_oof_train.ravel())
print('Ridge OOF AUC: {}'.format(ridge_auc))

In [None]:
# Store the fold-averaged test predictions as a flat column, then display the
# frame as the cell output.
test_df["toxic"] = ridge_oof_test.ravel()
test_df